Skip to content

Commit

Permalink
Add progress indicators to embeddings
Browse files Browse the repository at this point in the history
  • Loading branch information
snexus committed Sep 8, 2023
1 parent 79bf370 commit 29b8e60
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 24 deletions.
42 changes: 21 additions & 21 deletions sample_templates/obsidian_conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ embeddings:
- 1024

document_settings:
# - doc_path: /home/snexus/projects/knowledge-base
- doc_path: /storage/llm/docs
- doc_path: /home/snexus/projects/knowledge-base
#- doc_path: /storage/llm/docs
exclude_paths:
- /home/snexus/projects/knowledge-base/daily_notes
- /home/snexus/projects/knowledge-base/templates
Expand All @@ -34,11 +34,11 @@ embeddings:
passage_prefix: "passage: "
label: "obsidian"

- doc_path: /storage/llm/pdf_docs2
scan_extensions:
- pdf
passage_prefix: "passage: "
label: "books"
# - doc_path: /storage/llm/pdf_docs2
# scan_extensions:
# - pdf
# passage_prefix: "passage: "
# label: "books"


semantic_search:
Expand All @@ -58,21 +58,21 @@ semantic_search:
persist_response_db_path: responses_test.db


# llm:
# type: openai
# params:
# prompt_template: |
# Context information is provided below. Given only the context and not prior knowledge, provide a detailed answer to the question and references to the provided context. If the answer isn't in the context, say you don't know.
llm:
type: openai
params:
prompt_template: |
Context information is provided below. Given only the context and not prior knowledge, provide a detailed answer to the question and references to the provided context. If the answer isn't in the context, say you don't know.
# ### Context:
# ---------------------
# {context}
# ---------------------

# ### Question: {question}
# model_kwargs:
# temperature: 0.0
# model_name: gpt-3.5-turbo
### Context:
---------------------
{context}
---------------------
### Question: {question}
model_kwargs:
temperature: 0.0
model_name: gpt-3.5-turbo


# llm:
Expand Down
6 changes: 4 additions & 2 deletions src/llmsearch/chroma.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import shutil
import tqdm
from pathlib import Path
from typing import List, Optional, Tuple

Expand Down Expand Up @@ -27,7 +28,7 @@ def create_index_from_documents(
self,
all_docs: List[Document],
clear_persist_folder: bool = True,
max_chunk_size=40000, # Limitation of Chromadb (2023/09 v0.4.8) - can add only 41666 documents at once
batch_size=200, # Limitation of Chromadb (2023/09 v0.4.8) - can add only 41666 documents at once
):
if clear_persist_folder:
pf = Path(self._persist_folder)
Expand All @@ -38,7 +39,7 @@ def create_index_from_documents(
logger.info("Generating and persisting the embeddings..")

vectordb = None
for group in chunker(all_docs, size=max_chunk_size):
for group in tqdm.tqdm(chunker(all_docs, size=batch_size), total = int(len(all_docs) / batch_size)):
ids = [d.metadata["document_id"] for d in group]
if vectordb is None:
vectordb = Chroma.from_documents(
Expand All @@ -54,6 +55,7 @@ def create_index_from_documents(
ids=ids,
metadatas = [doc.metadata for doc in group],
)
logger.info("Generated embeddings. Persisting...")
if vectordb is not None:
vectordb.persist()

Expand Down
2 changes: 1 addition & 1 deletion src/llmsearch/splade.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def generate_embeddings_from_docs(self, docs: List[Document], persist: bool = Tr
metadatas = [d.metadata for d in docs]

vecs = []
for chunk in tqdm.tqdm(split(docs, chunk_size=chunk_size)):
for chunk in tqdm.tqdm(split(docs, chunk_size=chunk_size), total = int(len(docs) / chunk_size)):
texts = [d.page_content for d in chunk if d.page_content]
vecs.append(self._get_batch_embeddings(texts))

Expand Down

0 comments on commit 29b8e60

Please sign in to comment.