From 29b8e6028f6d246583114935abdfeabfbdae217e Mon Sep 17 00:00:00 2001
From: Denis L
Date: Fri, 8 Sep 2023 20:36:56 +0800
Subject: [PATCH] Add progress indicators to embeddings

---
 sample_templates/obsidian_conf.yaml | 42 ++++++++++++++---------------
 src/llmsearch/chroma.py             |  6 +++--
 src/llmsearch/splade.py             |  2 +-
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/sample_templates/obsidian_conf.yaml b/sample_templates/obsidian_conf.yaml
index 97f1334..c4c166e 100644
--- a/sample_templates/obsidian_conf.yaml
+++ b/sample_templates/obsidian_conf.yaml
@@ -13,8 +13,8 @@ embeddings:
     - 1024
 
   document_settings:
-  # - doc_path: /home/snexus/projects/knowledge-base
-  - doc_path: /storage/llm/docs
+  - doc_path: /home/snexus/projects/knowledge-base
+  #- doc_path: /storage/llm/docs
     exclude_paths:
       - /home/snexus/projects/knowledge-base/daily_notes
       - /home/snexus/projects/knowledge-base/templates
@@ -34,11 +34,11 @@ embeddings:
       passage_prefix: "passage: "
       label: "obsidian"
 
-  - doc_path: /storage/llm/pdf_docs2
-    scan_extensions:
-      - pdf
-    passage_prefix: "passage: "
-    label: "books"
+  # - doc_path: /storage/llm/pdf_docs2
+  #   scan_extensions:
+  #     - pdf
+  #   passage_prefix: "passage: "
+  #   label: "books"
 
 
 semantic_search:
@@ -58,21 +58,21 @@ semantic_search:
 
   persist_response_db_path: responses_test.db
 
-# llm:
-#   type: openai
-#   params:
-#     prompt_template: |
-#       Contex information is provided below. Given only the context and not prior knowledge, provide detailed answer to the question and references to the provided context. If answer isn't in the context, say you don't know.
+llm:
+  type: openai
+  params:
+    prompt_template: |
+      Context information is provided below. Given only the context and not prior knowledge, provide a detailed answer to the question with references to the provided context. If the answer isn't in the context, say you don't know.
 
-#       ### Context:
-#       ---------------------
-#       {context}
-#       ---------------------
-
-#       ### Question: {question}
-#     model_kwargs:
-#       temperature: 0.0
-#       model_name: gpt-3.5-turbo
+      ### Context:
+      ---------------------
+      {context}
+      ---------------------
+
+      ### Question: {question}
+    model_kwargs:
+      temperature: 0.0
+      model_name: gpt-3.5-turbo
 
 
 # llm:
diff --git a/src/llmsearch/chroma.py b/src/llmsearch/chroma.py
index 6b74673..91c2ffb 100644
--- a/src/llmsearch/chroma.py
+++ b/src/llmsearch/chroma.py
@@ -1,4 +1,5 @@
 import shutil
+import tqdm
 from pathlib import Path
 from typing import List, Optional, Tuple
 
@@ -27,7 +28,7 @@ def create_index_from_documents(
         self,
         all_docs: List[Document],
         clear_persist_folder: bool = True,
-        max_chunk_size=40000,  # Limitation of Chromadb (2023/09 v0.4.8) - can add only 41666 documents at once
+        batch_size=200,  # Limitation of Chromadb (2023/09 v0.4.8) - can add only 41666 documents at once
     ):
         if clear_persist_folder:
             pf = Path(self._persist_folder)
@@ -38,7 +39,7 @@ def create_index_from_documents(
 
         logger.info("Generating and persisting the embeddings..")
         vectordb = None
-        for group in chunker(all_docs, size=max_chunk_size):
+        for group in tqdm.tqdm(chunker(all_docs, size=batch_size), total=(len(all_docs) + batch_size - 1) // batch_size):
             ids = [d.metadata["document_id"] for d in group]
             if vectordb is None:
                 vectordb = Chroma.from_documents(
@@ -54,6 +55,7 @@ def create_index_from_documents(
                     ids=ids,
                     metadatas = [doc.metadata for doc in group],
                 )
+        logger.info("Generated embeddings. Persisting...")
         if vectordb is not None:
             vectordb.persist()
 
diff --git a/src/llmsearch/splade.py b/src/llmsearch/splade.py
index 992334b..16e9267 100644
--- a/src/llmsearch/splade.py
+++ b/src/llmsearch/splade.py
@@ -135,7 +135,7 @@ def generate_embeddings_from_docs(self, docs: List[Document], persist: bool = Tr
         metadatas = [d.metadata for d in docs]
 
         vecs = []
-        for chunk in tqdm.tqdm(split(docs, chunk_size=chunk_size)):
+        for chunk in tqdm.tqdm(split(docs, chunk_size=chunk_size), total=(len(docs) + chunk_size - 1) // chunk_size):
             texts = [d.page_content for d in chunk if d.page_content]
             vecs.append(self._get_batch_embeddings(texts))
 