Skip to content

Commit

Permalink
Add progress indicators to embeddings
Browse files Browse the repository at this point in the history
  • Loading branch information
snexus committed Sep 8, 2023
1 parent 79bf370 commit 29b8e60
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 24 deletions.
42 changes: 21 additions & 21 deletions sample_templates/obsidian_conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ embeddings:
- 1024

document_settings:
# - doc_path: /home/snexus/projects/knowledge-base
- doc_path: /storage/llm/docs
- doc_path: /home/snexus/projects/knowledge-base
#- doc_path: /storage/llm/docs
exclude_paths:
- /home/snexus/projects/knowledge-base/daily_notes
- /home/snexus/projects/knowledge-base/templates
Expand All @@ -34,11 +34,11 @@ embeddings:
passage_prefix: "passage: "
label: "obsidian"

- doc_path: /storage/llm/pdf_docs2
scan_extensions:
- pdf
passage_prefix: "passage: "
label: "books"
# - doc_path: /storage/llm/pdf_docs2
# scan_extensions:
# - pdf
# passage_prefix: "passage: "
# label: "books"


semantic_search:
Expand All @@ -58,21 +58,21 @@ semantic_search:
persist_response_db_path: responses_test.db


# llm:
# type: openai
# params:
# prompt_template: |
# Context information is provided below. Given only the context and not prior knowledge, provide a detailed answer to the question and references to the provided context. If the answer isn't in the context, say you don't know.
llm:
type: openai
params:
prompt_template: |
Context information is provided below. Given only the context and not prior knowledge, provide a detailed answer to the question and references to the provided context. If the answer isn't in the context, say you don't know.
# ### Context:
# ---------------------
# {context}
# ---------------------

# ### Question: {question}
# model_kwargs:
# temperature: 0.0
# model_name: gpt-3.5-turbo
### Context:
---------------------
{context}
---------------------
### Question: {question}
model_kwargs:
temperature: 0.0
model_name: gpt-3.5-turbo


# llm:
Expand Down
6 changes: 4 additions & 2 deletions src/llmsearch/chroma.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import shutil
import tqdm
from pathlib import Path
from typing import List, Optional, Tuple

Expand Down Expand Up @@ -27,7 +28,7 @@ def create_index_from_documents(
self,
all_docs: List[Document],
clear_persist_folder: bool = True,
max_chunk_size=40000, # Limitation of Chromadb (2023/09 v0.4.8) - can add only 41666 documents at once
batch_size=200, # Limitation of Chromadb (2023/09 v0.4.8) - can add only 41666 documents at once
):
if clear_persist_folder:
pf = Path(self._persist_folder)
Expand All @@ -38,7 +39,7 @@ def create_index_from_documents(
logger.info("Generating and persisting the embeddings..")

vectordb = None
for group in chunker(all_docs, size=max_chunk_size):
for group in tqdm.tqdm(chunker(all_docs, size=batch_size), total = int(len(all_docs) / batch_size)):
ids = [d.metadata["document_id"] for d in group]
if vectordb is None:
vectordb = Chroma.from_documents(
Expand All @@ -54,6 +55,7 @@ def create_index_from_documents(
ids=ids,
metadatas = [doc.metadata for doc in group],
)
logger.info("Generated embeddings. Persisting...")
if vectordb is not None:
vectordb.persist()

Expand Down
2 changes: 1 addition & 1 deletion src/llmsearch/splade.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def generate_embeddings_from_docs(self, docs: List[Document], persist: bool = Tr
metadatas = [d.metadata for d in docs]

vecs = []
for chunk in tqdm.tqdm(split(docs, chunk_size=chunk_size)):
for chunk in tqdm.tqdm(split(docs, chunk_size=chunk_size), total = int(len(docs) / chunk_size)):
texts = [d.page_content for d in chunk if d.page_content]
vecs.append(self._get_batch_embeddings(texts))

Expand Down

0 comments on commit 29b8e60

Please sign in to comment.