utils.py
import numpy as np
import openai
import pandas as pd
import tiktoken

EMBEDDING_MODEL = "text-embedding-ada-002"
MAX_SECTION_LEN = 1500  # token budget for the assembled context
SEPARATOR = "\n* "
ENCODING = "cl100k_base"

encoding = tiktoken.get_encoding(ENCODING)
# Tokens consumed by the separator itself; counted against the budget below.
separator_len = len(encoding.encode(SEPARATOR))
# Embedding code
def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    """Return the embedding vector for `text` using the OpenAI Embeddings API."""
    result = openai.Embedding.create(model=model, input=text)
    return result["data"][0]["embedding"]


def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the similarity between two vectors.

    Because OpenAI embeddings are normalized to length 1, the cosine
    similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))
def order_document_sections_by_query_similarity(
    query: str, contexts: dict[tuple[str, str], np.ndarray]
) -> list[tuple[float, tuple[str, str]]]:
    """
    Embed the supplied query and compare it against all of the pre-calculated
    document embeddings to find the most relevant sections.

    Return a list of (similarity, section index) pairs, sorted by relevance
    in descending order.
    """
    query_embedding = get_embedding(query)
    document_similarities = sorted(
        [
            (vector_similarity(query_embedding, doc_embedding), doc_index)
            for doc_index, doc_embedding in contexts.items()
        ],
        reverse=True,
    )
    return document_similarities
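

# A minimal sketch (assumed for illustration, not part of the original file)
# of the shapes these helpers expect: `contexts` / `document_embeddings` map
# a (title, url) pair to that section's embedding vector, e.g.
#
#   document_embeddings = {
#       ("Reliability Pillar", "https://docs.aws.amazon.com/..."): np.array([...]),
#   }
#
# and `df` carries one row per section, with "title" and "url" columns used
# for lookup, plus the section text and its token count in positions 2 and 3.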
def get_context(
    question: str, context_embeddings: dict[tuple[str, str], np.ndarray], df: pd.DataFrame
) -> tuple[str, list[str]]:
    """
    Fetch the document sections most relevant to the question and join them
    into a single context string, stopping once the token budget is spent.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(
        question, context_embeddings
    )
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        print(section_index)
        document_section = df.loc[
            (df["title"] == section_index[0]) & (df["url"] == section_index[1])
        ]
        # Positional columns: index 2 holds the section text, index 3 its token count.
        num_tokens = document_section.values[0][3]
        curr_text = document_section.values[0][2]
        chosen_sections_len += num_tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(SEPARATOR + curr_text.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    context = "".join(chosen_sections)
    return (
        context,
        chosen_sections_indexes,
    )
def get_answer_from_chatgpt(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
) -> tuple[str, list[str]]:
    """Answer `query` with gpt-3.5-turbo, grounded in the retrieved context."""
    context, docs = get_context(query, document_embeddings, df)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are an AWS Certified Solutions Architect. Your role is to help customers understand best practices on building on AWS. Return your response in markdown, so you can bold and highlight important steps for customers. If the answer cannot be found within the context, write 'I could not find an answer'",
            },
            {
                "role": "system",
                "content": f"Use the following context from the AWS Well-Architected Framework to answer the user's query.\nContext:\n{context}",
            },
            {"role": "user", "content": query},
        ],
    )
    answer = response["choices"][0]["message"]["content"].strip(" \n")
    return answer, docs
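

# Hedged usage sketch: the CSV path, column names, and sample query below are
# assumptions for illustration, not part of the original module. It assumes
# OPENAI_API_KEY is set in the environment and that a pre-chunked sections
# DataFrame with columns [title, url, content, tokens] is available.
if __name__ == "__main__":
    df = pd.read_csv("sections.csv")  # hypothetical pre-chunked sections file
    # Embed every section once up front, keyed by (title, url).
    document_embeddings = {
        (row["title"], row["url"]): np.array(get_embedding(row["content"]))
        for _, row in df.iterrows()
    }
    answer, docs = get_answer_from_chatgpt(
        "How should I design for failure on AWS?", df, document_embeddings
    )
    print(answer)
    print("Sources:", docs)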