utils.py
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search
import streamlit as st

QUESTION_DATA_PATH = Path('data/processed/question_data.npy')
EMBEDDINGS_PATH = Path('data/embeddings/embeddings.npy')
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
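# Assumption (not stated in this file): embeddings.npy holds one row per entry in
# question_data.npy, in the same order, so the corpus_ids returned by semantic_search
# index directly into the question data. all-MiniLM-L6-v2 produces 384-dimensional
# sentence embeddings, so the array is expected to have shape (n_questions, 384).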


@st.cache_resource
def load_model():
    """
    Load the pre-trained SentenceTransformer model.

    Returns
    -------
    SentenceTransformer
        A pre-trained SentenceTransformer model loaded from MODEL_PATH.
    """
    model = SentenceTransformer(MODEL_PATH)
    return model


@st.cache_data
def load_embeddings():
    """
    Load pre-computed embeddings from disk.

    Returns
    -------
    numpy.ndarray
        A NumPy array of pre-computed embeddings loaded from EMBEDDINGS_PATH.
    """
    embeddings = np.load(EMBEDDINGS_PATH)
    return embeddings


@st.cache_data
def load_question_data():
    """
    Load question data from disk.

    Returns
    -------
    numpy.ndarray
        A NumPy array of question data loaded from QUESTION_DATA_PATH.
    """
    question_data = np.load(QUESTION_DATA_PATH, allow_pickle=True)
    return question_data


def find_similar_questions(text_input: str, k: int) -> List[List[Dict[str, Any]]]:
    """
    Find questions similar to a given text input using pre-computed embeddings
    and semantic search.

    Parameters
    ----------
    text_input : str
        The input text for which similar questions are to be found.
    k : int
        The number of similar questions to retrieve.

    Returns
    -------
    List[List[Dict[str, Any]]]
        A list with one inner list per query (a single query here), where each
        inner list contains dictionaries describing the matches. Each dictionary
        has the following keys:
        - 'corpus_id': int
            The index of the matching question in the embeddings corpus.
        - 'score': float
            The cosine-similarity score between the input text and the match.
    """
    model = load_model()
    embeddings = load_embeddings()
    text_input_vectorized = model.encode(text_input)
    similar_questions = semantic_search(text_input_vectorized, embeddings, top_k=k)
    return similar_questions
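
# Illustrative only — the exact ids and scores depend on the corpus:
# find_similar_questions("How do I reset my password?", k=2) might return
# [[{'corpus_id': 42, 'score': 0.83}, {'corpus_id': 7, 'score': 0.79}]]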


def get_similar_questions_with_score(text_input: str, k: int = 5) -> List[Dict[str, Any]]:
    """
    Retrieve questions similar to a given text input along with their similarity scores.

    Parameters
    ----------
    text_input : str
        The input text for which similar questions are to be retrieved.
    k : int, optional (default=5)
        The number of similar questions to retrieve.

    Returns
    -------
    List[Dict[str, Any]]
        A list of dictionaries representing similar questions and their similarity scores.
        Each dictionary has the following keys:
        - 'question': str
            The text of the similar question.
        - 'similarity_score': float
            The cosine similarity between the input text and the similar question,
            expressed as a percentage (score * 100) and rounded to one decimal place.

    Notes
    -----
    This function uses `find_similar_questions` to retrieve the top-k matches for the
    input text, then maps each match's corpus_id back to the question text loaded by
    `load_question_data`.
    """
    similar_questions = find_similar_questions(text_input, k)
    question_data = load_question_data()
    corpus_ids = [item['corpus_id'] for item in similar_questions[0]]
    similarity_scores = [round(item['score'] * 100, 1) for item in similar_questions[0]]
    similar_question_data = question_data[corpus_ids]
    results = [{'question': question, 'similarity_score': score}
               for question, score in zip(similar_question_data, similarity_scores)]
    return results
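

# A minimal usage sketch, not part of the original module. It assumes the data files
# referenced by QUESTION_DATA_PATH and EMBEDDINGS_PATH exist on disk; outside a running
# Streamlit app the cache decorators should still execute the wrapped functions,
# possibly with a "missing ScriptRunContext" warning.
if __name__ == "__main__":
    # Hypothetical query text — replace with any question from your own corpus.
    example_query = "How do I reset my password?"
    for match in get_similar_questions_with_score(example_query, k=3):
        print(f"{match['similarity_score']:>5.1f}%  {match['question']}")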