-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
87 lines (74 loc) · 2.58 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import rag_tools
import asyncio
from box import Box
import yaml
from quart import Quart, request, redirect, send_from_directory, jsonify
import chromadb
import chromadb.config
# Quart application. Static assets and templates are both served out of the
# "frontend" directory, with static files mounted at the URL root.
app = Quart(
    __name__,
    static_url_path="",
    static_folder="frontend",
    template_folder="frontend",
)

# Application settings, loaded once at import time from config.yaml.
config = Box.from_yaml(filename="config.yaml", Loader=yaml.FullLoader)

# Document specification; the file name is appended directly to
# config.path.pdfs, so that setting has to end with a path separator.
spec = Box.from_yaml(
    filename=config.path.pdfs + "spec.yaml",
    Loader=yaml.FullLoader,
)

# Document descriptors (each carries an 'id') and the per-document trees that
# the /search handler looks up by document id.
pdfs = rag_tools.get_documents(config, spec)
trees = asyncio.run(rag_tools.get_trees(config, spec))

# Embedder used to turn query strings into vectors.
strings_embedder = rag_tools.Embedder(config)

# On-disk Chroma store with anonymized telemetry switched off.
chromadb_client = chromadb.PersistentClient(
    path=config.path.embeddings,
    settings=chromadb.config.Settings(anonymized_telemetry=False),
)

# One Chroma collection per document, keyed by (and named after) the
# document id.
collections = {
    entry['id']: chromadb_client.get_collection(name=entry['id'])
    for entry in pdfs
}
@app.route("/")
async def index():
    """Redirect the site root to the frontend's index.html."""
    landing_page = "/index.html"
    return redirect(landing_page)
@app.route('/pdfs/<path:path>')
async def static_pdf(path):
    """Serve a file from the configured PDF directory.

    Args:
        path: The part of the URL after ``/pdfs/``, resolved relative to
            ``config.path.pdfs``.

    Returns:
        The file response, always labelled with the ``application/pdf``
        MIME type.
    """
    pdf_root = config.path.pdfs
    response = await send_from_directory(
        pdf_root,
        path,
        mimetype="application/pdf",
    )
    return response
@app.route("/documents", methods=["POST"])
async def list_docs():
    """Return the documents loaded at startup under the ``documents`` key."""
    payload = {'documents': pdfs}
    return payload
@app.route("/search", methods=["POST"])
async def search():
    """Run a semantic search against one document's embedding collection.

    The request body must be a JSON object with these fields:
        docId: Key of one of the collections loaded at startup.
        query: The search text (string).
        num_results: Requested number of hits (integer); values below 1
            are raised to 1.

    Returns:
        A JSON object with ``docId``, the original ``query`` and
        ``results``. Each result holds ``crumbs`` (headers collected while
        walking the document tree), ``text`` (the matched slice of the
        node's text), ``page``, ``position`` and ``relevance``
        (``1 - distance`` as reported by the collection query).

        A body that is not a JSON object, or that has missing or wrongly
        typed fields, yields a 400 response; an unknown ``docId`` yields a
        404 response. Both carry a JSON object with an ``error`` message.
    """
    data = await request.json
    if not isinstance(data, dict):
        return {'error': 'request body must be a JSON object'}, 400

    try:
        docId = data['docId']
        user_query = data['query']
        requested = data['num_results']
    except KeyError as missing:
        return {'error': f"missing field '{missing.args[0]}'"}, 400

    if not isinstance(user_query, str):
        return {'error': "'query' must be a string"}, 400
    if not isinstance(requested, int):
        return {'error': "'num_results' must be an integer"}, 400

    try:
        collection = collections[docId]
    except (KeyError, TypeError):
        # TypeError covers unhashable ids such as a JSON list or object.
        return {'error': 'unknown docId'}, 404

    # The prefixed query is truncated to the configured chunk size before
    # it is embedded.
    query = (config.embeddings.query_prefix + user_query)[:config.chunker.size]
    num_results = max(requested, 1)
    embedding = (await strings_embedder.embed_strings([query]))[0]

    query_results = collection.query(
        query_embeddings=[embedding],
        n_results=num_results,
    )

    # Only one query embedding was submitted, so element [0] of every field
    # holds the hits for it.
    hits = zip(
        query_results['ids'][0],
        query_results['metadatas'][0],
        query_results['distances'][0],
    )

    results = []
    for path, meta, distance in hits:
        crumbs = []
        # The part of the id before '#' is the path handed to walk_tree;
        # the callback records the header of every node it is given.
        node = rag_tools.walk_tree(
            root=trees[docId],
            path=path.split('#')[0],
            func=lambda visited: crumbs.append(visited.header),
        )
        start = meta['pos']
        results.append({
            'crumbs': crumbs,
            'text': node.text[start:start + meta['len']],
            'page': node.page,
            'position': node.position,
            'relevance': 1 - distance,
        })

    return {
        'docId': docId,
        'query': user_query,
        'results': results,
    }
# Start the server as soon as this module is executed, listening on every
# network interface.
# NOTE(review): `threaded` is a Flask-style option - confirm that the installed
# Quart version accepts and honours it.
app.run(host='0.0.0.0', threaded=False)