Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Semantic splitter #63

Merged
merged 17 commits into from
Mar 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cSpell.words": [
"tiktoken",
"Upserted"
]
}
47 changes: 33 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,41 @@ Super-Rag comes with a built in REST API powered by FastApi.

// Payload
{
"files": [{
"url": "https://arxiv.org/pdf/2210.03629.pdf"
}],
"files": [
{
"name": "My file", // Optional
"url": "https://path-to-my-file.pdf"
}
],
"document_processor": { // Optional
"encoder": {
"dimensions": 384,
"model_name": "embed-multilingual-light-v3.0",
"provider": "cohere"
},
"unstructured": {
"hi_res_model_name": "detectron2_onnx",
"partition_strategy": "auto",
"process_tables": false
},
"splitter": {
"max_tokens": 400,
"min_tokens": 30,
"name": "semantic",
"prefix_summary": true,
"prefix_title": true,
"rolling_window_size": 1
}
},
"vector_database": {
"type": "qdrant",
"config": {
"api_key": "YOUR API KEY",
"host": "THE QDRANT HOST"
}
},
"encoder": {
"type": "openai",
"name": "text-embedding-3-small",
"dimensions": 1536 // encoder depends on the provider and model
},
"index_name": "YOUR INDEX",
"webhook_url": "https://webhook.site/0e217d1c-49f1-424a-9992-497db09f7793"
"index_name": "my_index",
"webhook_url": "https://my-webhook-url"
}
```

Expand All @@ -103,12 +121,13 @@ Super-Rag comes with a built in REST API powered by FastApi.
"index_name": "YOUR INDEX",
"interpreter_mode": true,
"encoder": {
"type": "cohere",
"name": "embed-multilingual-light-v3.0",
"provider": "openai",
"name": "text-embedding-3-small",
"dimensions": 384
},
"exclude_fields": ["metadata"],
"session_id": "test"
"exclude_fields": ["metadata"], // Exclude specific fields
"interpreter_mode": False, // Set to True if you wish to run computation Q&A with a code interpreter
"session_id": "my_session_id" // keeps micro-vm sessions and enables caching
}
```

Expand Down
5 changes: 2 additions & 3 deletions api/delete.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from fastapi import APIRouter

from models.delete import RequestPayload, ResponsePayload
from service.embedding import get_encoder
from vectordbs import get_vector_service
from vectordbs.base import BaseVectorDatabase

Expand All @@ -10,12 +9,12 @@

@router.delete("/delete", response_model=ResponsePayload)
async def delete(payload: RequestPayload):
encoder = get_encoder(encoder_config=payload.encoder)
encoder = payload.encoder.get_encoder()
vector_service: BaseVectorDatabase = get_vector_service(
index_name=payload.index_name,
credentials=payload.vector_database,
encoder=encoder,
dimensions=encoder.dimensions,
dimensions=payload.encoder.dimensions,
)

for file in payload.files:
Expand Down
26 changes: 17 additions & 9 deletions api/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from fastapi import APIRouter

from models.ingest import RequestPayload
from service.embedding import EmbeddingService, get_encoder
from service.embedding import EmbeddingService
from service.ingest import handle_google_drive, handle_urls
from utils.summarise import SUMMARY_SUFFIX

Expand All @@ -14,25 +14,33 @@

@router.post("/ingest")
async def ingest(payload: RequestPayload) -> Dict:
encoder = get_encoder(encoder_config=payload.encoder)
encoder = payload.document_processor.encoder.get_encoder()
embedding_service = EmbeddingService(
encoder=encoder,
index_name=payload.index_name,
vector_credentials=payload.vector_database,
dimensions=payload.encoder.dimensions,
dimensions=payload.document_processor.encoder.dimensions,
)
chunks = []
summary_documents = []
if payload.files:
chunks, summary_documents = await handle_urls(embedding_service, payload.files)
chunks, summary_documents = await handle_urls(
embedding_service=embedding_service,
files=payload.files,
config=payload.document_processor,
)

elif payload.google_drive:
chunks, summary_documents = await handle_google_drive(
embedding_service, payload.google_drive
)
) # type: ignore TODO: Fix typing

await asyncio.gather(
embedding_service.generate_and_upsert_embeddings(
documents=chunks, encoder=encoder, index_name=payload.index_name
embedding_service.embed_and_upsert(
chunks=chunks, encoder=encoder, index_name=payload.index_name
),
embedding_service.generate_and_upsert_embeddings(
documents=summary_documents,
embedding_service.embed_and_upsert(
chunks=summary_documents,
encoder=encoder,
index_name=f"{payload.index_name}{SUMMARY_SUFFIX}",
),
Expand Down
10 changes: 7 additions & 3 deletions api/query.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from fastapi import APIRouter

from models.query import RequestPayload, ResponseData, ResponsePayload
from models.query import RequestPayload, ResponsePayload
from service.router import query as _query

router = APIRouter()
Expand All @@ -9,5 +9,9 @@
@router.post("/query", response_model=ResponsePayload)
async def query(payload: RequestPayload):
chunks = await _query(payload=payload)
response_data = [ResponseData(**chunk.model_dump()) for chunk in chunks]
return {"success": True, "data": response_data}
# NOTE: Filter out excluded fields before the data is given to the LLM
response_payload = ResponsePayload(success=True, data=chunks)
response_data = response_payload.model_dump(
exclude=set(payload.exclude_fields) if payload.exclude_fields else None
)
return response_data
2 changes: 1 addition & 1 deletion dev/embedding.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"metadata": {},
"outputs": [],
"source": [
"elements = await embedding_service._download_and_extract_elements(file, strategy=\"auto\")\n"
"elements = await embedding_service._partition_file(file, strategy=\"auto\")\n"
]
},
{
Expand Down
Loading