// embed.js
import csvParser from "csv-parser";
import createCsvWriter from "csv-writer";
import dotenv from "dotenv";
import fs from "fs";
//import { createRequire } from "module";
import path from "path";
import { URL } from "url";
import { parse } from "node-html-parser";
import natural from "natural";
import { OpenAI } from "openai";
dotenv.config();
// read OPENAI_API_KEY from the environment (e.g. injected as a GitHub Actions secret)
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
const openai = new OpenAI({ apiKey: OPENAI_API_KEY });
/**
 * Takes an HTML content string as input.
 * Strips common HTML element names, tokenizes the remaining text,
 * and returns at most the first 3000 tokens.
 * @param {string} content - The HTML content string.
 * @returns {Promise<string[]>} The array of tokenized words.
 */
async function tokenizeContent(content) {
  const cleanContent = removeHTMLElementNamesFromString(content);
  const tokenizer = new natural.WordTokenizer();
  const tokens = tokenizer.tokenize(cleanContent);
  return tokens.slice(0, 3000);
}
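/**
 * Removes common HTML element/attribute names and static-asset keywords
 * from a string so they do not pollute the token stream.
 * @param {string} stringContent - The raw text extracted from HTML.
 * @returns {string} The cleaned string.
 */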
function removeHTMLElementNamesFromString(stringContent) {
  const regex =
    /\b(div|span|li|a|ul|section|script|footer|body|html|link|img|href|svg|alt|target|js|javascript|lang|head|gtag|meta|charset|utf|woff2|crossorigin|anonymous|rel|preload|as|font|assets|fonts|Inter|UI|var|type|css|stylesheet|text)\b/g;
  return stringContent.replace(regex, "");
}
/**
 * Takes a token array (or a token string) as input.
 * Asks the OpenAI completions API to identify the most relevant tokens.
 * @param {string[]|string} tokens - The tokens to filter.
 * @returns {Promise<string[]>} The array of relevant tokens.
 */
async function getRelevantTokens(tokens) {
  console.log("start getRelevantTokens");
  const tokenString = typeof tokens === "string" ? tokens : tokens.join(" ");
  // Prepare the prompt for the OpenAI completions model
  const promptStart = `Given the following tokenized text, identify the most relevant tokens:\n\n`;
  const promptEnd = `\n\nRelevant tokens:`;
  // Rough character budget for the content, used as a proxy for the model's context limit
  const availableTokens = 4096 - promptStart.length - promptEnd.length;
  let prompt;
  if (tokenString.length > availableTokens) {
    // Truncate the string to fit the available budget
    prompt = promptStart + tokenString.slice(0, availableTokens) + promptEnd;
  } else {
    prompt = promptStart + tokenString + promptEnd;
  }
  // Call the OpenAI API
  let response;
  try {
    console.log("initiating openai api call");
    response = await openai.completions.create({
      model: "gpt-3.5-turbo-instruct",
      prompt: prompt,
      max_tokens: 2000,
      n: 1,
      stop: null,
      temperature: 0.8,
    });
  } catch (e) {
    console.error(
      "Error calling OpenAI API getRelevantTokens completions.create:",
      e?.response?.data?.error
    );
    throw new Error(
      "Error calling OpenAI API getRelevantTokens completions.create"
    );
  }
  console.log("finished getRelevantTokens");
  // Extract and return the relevant tokens from the response
  const relevantTokensText = response?.choices[0].text.trim();
  const relevantTokens = relevantTokensText.split(" ");
  console.log(relevantTokens);
  return relevantTokens;
}
/**
 * Takes an array of tokenized contents and an output file path as input.
 * Saves the most relevant tokens for each URL to a CSV file.
 * @param {Array<{url: string, tokens: string[]}>} tokenizedContents - The array of tokenized contents.
 * @param {string} outputPath - The output file path.
 */
async function saveRelevantTokensToCsv(tokenizedContents, outputPath) {
  console.log("start saveRelevantTokensToCsv");
  const csvWriter = createCsvWriter.createObjectCsvWriter({
    path: outputPath,
    header: [
      { id: "url", title: "URL" },
      { id: "relevantTokens", title: "Relevant Tokens" },
    ],
  });
  const records = [];
  for (const content of tokenizedContents) {
    const relevantTokens = await getRelevantTokens(content.tokens);
    records.push({
      url: content.url,
      relevantTokens: relevantTokens.join(" "),
    });
  }
  await csvWriter.writeRecords(records);
  console.log(`Relevant tokens saved to ${outputPath}`);
}
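// Illustrative usage (the file path and record below are placeholders, not from the original module):
// await saveRelevantTokensToCsv(
//   [{ url: "https://example.com/", tokens: ["pricing", "docs", "api"] }],
//   "relevant_tokens.csv"
// );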
/**
 * Takes a token string (or array of strings) as input.
 * Returns the embedding vector for the first input.
 * @param {string|string[]} tokens - The input to embed.
 * @returns {Promise<number[]>} The embedding vector.
 */
async function getEmbeddings(tokens) {
  console.log("start getEmbeddings");
  let response;
  try {
    console.log("initiating openai api call");
    response = await openai.embeddings.create({
      model: "text-embedding-ada-002",
      input: tokens,
      encoding_format: "float",
    });
    return response.data[0].embedding;
  } catch (e) {
    console.error("Error calling OpenAI API getEmbeddings:", e?.response?.data?.error);
    throw new Error("Error calling OpenAI API getEmbeddings");
  }
}
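// Embedding vectors returned by getEmbeddings are typically compared with cosine
// similarity. No such helper is defined in this file; a minimal illustrative sketch
// (not part of the original module) could look like:
//
// function cosineSimilarity(a, b) {
//   let dot = 0;
//   let normA = 0;
//   let normB = 0;
//   for (let i = 0; i < a.length; i++) {
//     dot += a[i] * b[i];
//     normA += a[i] * a[i];
//     normB += b[i] * b[i];
//   }
//   return dot / (Math.sqrt(normA) * Math.sqrt(normB));
// }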
/**
 * Takes a context string and a question as input.
 * Asks the chat completions API to answer the question from the context.
 * @param {string} context - The context to answer from.
 * @param {string} question - The question to answer.
 * @returns {Promise<string|undefined>} The model's answer, or undefined if the API call fails.
 */
async function getAnswer(context, question) {
  let prompt = `Answer the question based on the context below. If the question can't be answered based on the context, make a reasonable guess.\n Context: ${context}\nQuestion: ${question}\nAnswer:`;
  // check that the prompt is not too long
  if (prompt.length > 10000) {
    throw new Error(`Prompt is too long: ${prompt.length} characters`);
  }
  let response;
  let answer;
  try {
    console.log(`Initiating OpenAI API call with prompt: ${prompt}`);
    response = await openai.chat.completions.create({
      messages: [{ role: "user", content: prompt }],
      model: "gpt-4o",
    });
    answer = response.choices[0].message.content;
    console.log(`GPT Answer: ${answer}`);
    return answer;
  } catch (e) {
    console.error("Error calling OpenAI API:", e?.response?.data?.error);
  }
}
// Chunk text into chunks of N whitespace-separated words with M words of overlap
function chunkText(text, N, M) {
  const words = text.split(/\s+/);
  const chunks = [];
  for (let i = 0; i < words.length; i += N - M) {
    const chunk = words.slice(i, i + N).join(" ");
    chunks.push(chunk);
    // Stop once a chunk reaches the end of the text
    if (i + N >= words.length) break;
  }
  return chunks;
}
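// Example (illustrative): chunkText("a b c d e f g h i j", 4, 2)
//   -> ["a b c d", "c d e f", "e f g h", "g h i j"]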
// Split already-loaded text content into chunks (with a fixed 50-word overlap)
async function processTextFile(fileContent, chunkSize) {
  try {
    // Chunk the text
    const chunks = chunkText(fileContent, chunkSize, 50);
    console.log(`File split into ${chunks.length} chunks.`);
    return chunks;
  } catch (err) {
    console.error("Error processing text file:", err);
  }
}
export { getEmbeddings, getAnswer, processTextFile };
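// Illustrative end-to-end usage sketch (not part of the original module). Assumes a
// plain-text file on disk and OPENAI_API_KEY in the environment; the file name,
// chunk size, and question are placeholders.
//
// import fs from "fs";
// import { getEmbeddings, getAnswer, processTextFile } from "./embed.js";
//
// const text = fs.readFileSync("docs.txt", "utf8");
// const chunks = await processTextFile(text, 200);
// const chunkEmbeddings = await Promise.all(chunks.map((chunk) => getEmbeddings(chunk)));
// // Rank chunks against a question embedding (e.g. with cosine similarity),
// // then pass the best-matching chunk as context:
// const answer = await getAnswer(chunks[0], "What does this document describe?");
// console.log(answer);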