-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1a74435
Showing
12 changed files
with
401 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
**/*.class | ||
.vscode | ||
venv | ||
docs-index | ||
dict/glove.6B.300d.txt | ||
**/__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# DevNet Expert Search Backend

This repo contains the code to generate the Lucene index for the DevNet Expert
search engine. Due to the index's size, it is NOT included here.

## Prerequisites

The index generation code requires a modern Java compiler and runtime engine.

Before building the index, download the `glove.6B.300d.txt` dictionary file
(e.g., from https://www.kaggle.com/datasets/thanakomsn/glove6b300dtxt) and place
it in the `dict` subdirectory. This file must be named `glove.6B.300d.txt`.

### Compile the Classes

Run the script `scripts/compile-classes.sh` to compile the Java classes needed to build
the index. These files are taken from the example Lucene search engine code.

## Generating the Index

To generate the index, pick a target directory containing _text_ files to index. Then run
the command `scripts/generate-index.sh SOURCE_DIR INDEX_DIR` where `SOURCE_DIR` is the
path to the files to index and `INDEX_DIR` is the directory into which the index will
be placed.

When configuring the frontend, point to the index location. While you won't have the
same documentation that the exam has, you can get a good sense of how the search engine
works and how its results look.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,279 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package com.example.ppm; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.FileVisitResult; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.nio.file.SimpleFileVisitor; | ||
import java.nio.file.attribute.BasicFileAttributes; | ||
import java.util.Date; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | ||
import org.apache.lucene.demo.knn.DemoEmbeddings; | ||
import org.apache.lucene.demo.knn.KnnVectorDict; | ||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.document.Field; | ||
import org.apache.lucene.document.KnnVectorField; | ||
import org.apache.lucene.document.LongPoint; | ||
import org.apache.lucene.document.StringField; | ||
import org.apache.lucene.document.TextField; | ||
import org.apache.lucene.index.DirectoryReader; | ||
import org.apache.lucene.index.IndexReader; | ||
import org.apache.lucene.index.IndexWriter; | ||
import org.apache.lucene.index.IndexWriterConfig; | ||
import org.apache.lucene.index.IndexWriterConfig.OpenMode; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.index.VectorSimilarityFunction; | ||
import org.apache.lucene.store.Directory; | ||
import org.apache.lucene.store.FSDirectory; | ||
import org.apache.lucene.util.IOUtils; | ||
|
||
/** | ||
* Index all text files under a directory. | ||
* | ||
* <p>This is a command-line application demonstrating simple Lucene indexing. Run it with no | ||
* command-line arguments for usage information. | ||
*/ | ||
public class IndexFiles implements AutoCloseable { | ||
static final String KNN_DICT = "knn-dict"; | ||
|
||
// Calculates embedding vectors for KnnVector search | ||
private final DemoEmbeddings demoEmbeddings; | ||
private final KnnVectorDict vectorDict; | ||
|
||
private IndexFiles(KnnVectorDict vectorDict) throws IOException { | ||
if (vectorDict != null) { | ||
this.vectorDict = vectorDict; | ||
demoEmbeddings = new DemoEmbeddings(vectorDict); | ||
} else { | ||
this.vectorDict = null; | ||
demoEmbeddings = null; | ||
} | ||
} | ||
|
||
/** Index all text files under a directory. */ | ||
public static void main(String[] args) throws Exception { | ||
String usage = | ||
"java org.apache.lucene.demo.IndexFiles" | ||
+ " [-index INDEX_PATH] [-docs DOCS_PATH] [-update] [-knn_dict DICT_PATH]\n\n" | ||
+ "This indexes the documents in DOCS_PATH, creating a Lucene index" | ||
+ "in INDEX_PATH that can be searched with SearchFiles\n" | ||
+ "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search"; | ||
String indexPath = "index"; | ||
String docsPath = null; | ||
String vectorDictSource = null; | ||
boolean create = true; | ||
for (int i = 0; i < args.length; i++) { | ||
switch (args[i]) { | ||
case "-index": | ||
indexPath = args[++i]; | ||
break; | ||
case "-docs": | ||
docsPath = args[++i]; | ||
break; | ||
case "-knn_dict": | ||
vectorDictSource = args[++i]; | ||
break; | ||
case "-update": | ||
create = false; | ||
break; | ||
case "-create": | ||
create = true; | ||
break; | ||
default: | ||
throw new IllegalArgumentException("unknown parameter " + args[i]); | ||
} | ||
} | ||
|
||
if (docsPath == null) { | ||
System.err.println("Usage: " + usage); | ||
System.exit(1); | ||
} | ||
|
||
final Path docDir = Paths.get(docsPath); | ||
if (!Files.isReadable(docDir)) { | ||
System.out.println( | ||
"Document directory '" | ||
+ docDir.toAbsolutePath() | ||
+ "' does not exist or is not readable, please check the path"); | ||
System.exit(1); | ||
} | ||
|
||
Date start = new Date(); | ||
try { | ||
System.out.println("Indexing to directory '" + indexPath + "'..."); | ||
|
||
Directory dir = FSDirectory.open(Paths.get(indexPath)); | ||
Analyzer analyzer = new StandardAnalyzer(); | ||
IndexWriterConfig iwc = new IndexWriterConfig(analyzer); | ||
|
||
if (create) { | ||
// Create a new index in the directory, removing any | ||
// previously indexed documents: | ||
iwc.setOpenMode(OpenMode.CREATE); | ||
} else { | ||
// Add new documents to an existing index: | ||
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); | ||
} | ||
|
||
// Optional: for better indexing performance, if you | ||
// are indexing many documents, increase the RAM | ||
// buffer. But if you do this, increase the max heap | ||
// size to the JVM (eg add -Xmx512m or -Xmx1g): | ||
// | ||
iwc.setRAMBufferSizeMB(256.0); | ||
|
||
KnnVectorDict vectorDictInstance = null; | ||
long vectorDictSize = 0; | ||
if (vectorDictSource != null) { | ||
KnnVectorDict.build(Paths.get(vectorDictSource), dir, KNN_DICT); | ||
vectorDictInstance = new KnnVectorDict(dir, KNN_DICT); | ||
vectorDictSize = vectorDictInstance.ramBytesUsed(); | ||
} | ||
|
||
try (IndexWriter writer = new IndexWriter(dir, iwc); | ||
IndexFiles indexFiles = new IndexFiles(vectorDictInstance)) { | ||
indexFiles.indexDocs(writer, docDir); | ||
|
||
// NOTE: if you want to maximize search performance, | ||
// you can optionally call forceMerge here. This can be | ||
// a terribly costly operation, so generally it's only | ||
// worth it when your index is relatively static (ie | ||
// you're done adding documents to it): | ||
// | ||
// writer.forceMerge(1); | ||
} finally { | ||
IOUtils.close(vectorDictInstance); | ||
} | ||
|
||
Date end = new Date(); | ||
IndexReader reader = DirectoryReader.open(dir); | ||
System.out.println( | ||
"Indexed " | ||
+ reader.numDocs() | ||
+ " documents in " | ||
+ (end.getTime() - start.getTime()) | ||
+ " milliseconds"); | ||
} catch (IOException e) { | ||
System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); | ||
} | ||
} | ||
|
||
/** | ||
* Indexes the given file using the given writer, or if a directory is given, recurses over files | ||
* and directories found under the given directory. | ||
* | ||
* <p>NOTE: This method indexes one document per input file. This is slow. For good throughput, | ||
* put multiple documents into your input file(s). An example of this is in the benchmark module, | ||
* which can create "line doc" files, one document per line, using the <a | ||
* href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" | ||
* >WriteLineDocTask</a>. | ||
* | ||
* @param writer Writer to the index where the given file/dir info will be stored | ||
* @param path The file to index, or the directory to recurse into to find files to index | ||
* @throws IOException If there is a low-level I/O error | ||
*/ | ||
void indexDocs(final IndexWriter writer, Path path) throws IOException { | ||
if (Files.isDirectory(path)) { | ||
Files.walkFileTree( | ||
path, | ||
new SimpleFileVisitor<>() { | ||
@Override | ||
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) { | ||
try { | ||
indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); | ||
} catch ( | ||
@SuppressWarnings("unused") | ||
IOException ignore) { | ||
ignore.printStackTrace(System.err); | ||
// don't index files that can't be read. | ||
} | ||
return FileVisitResult.CONTINUE; | ||
} | ||
}); | ||
} else { | ||
indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); | ||
} | ||
} | ||
|
||
/** Indexes a single document */ | ||
void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { | ||
try (InputStream stream = Files.newInputStream(file)) { | ||
// make a new, empty document | ||
Document doc = new Document(); | ||
|
||
// Add the path of the file as a field named "path". Use a | ||
// field that is indexed (i.e. searchable), but don't tokenize | ||
// the field into separate words and don't index term frequency | ||
// or positional information: | ||
Field pathField = new StringField("path", file.toString(), Field.Store.YES); | ||
doc.add(pathField); | ||
|
||
// Add the last modified date of the file a field named "modified". | ||
// Use a LongPoint that is indexed (i.e. efficiently filterable with | ||
// PointRangeQuery). This indexes to milli-second resolution, which | ||
// is often too fine. You could instead create a number based on | ||
// year/month/day/hour/minutes/seconds, down the resolution you require. | ||
// For example the long value 2011021714 would mean | ||
// February 17, 2011, 2-3 PM. | ||
doc.add(new LongPoint("modified", lastModified)); | ||
|
||
// Add the contents of the file to a field named "contents". Specify a Reader, | ||
// so that the text of the file is tokenized and indexed, but not stored. | ||
// Note that FileReader expects the file to be in UTF-8 encoding. | ||
// If that's not the case searching for special characters will fail. | ||
doc.add( | ||
new TextField( | ||
"contents", | ||
new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); | ||
|
||
if (demoEmbeddings != null) { | ||
try (InputStream in = Files.newInputStream(file)) { | ||
float[] vector = | ||
demoEmbeddings.computeEmbedding( | ||
new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))); | ||
doc.add( | ||
new KnnVectorField("contents-vector", vector, VectorSimilarityFunction.DOT_PRODUCT)); | ||
} | ||
} | ||
|
||
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { | ||
// New index, so we just add the document (no old document can be there): | ||
System.out.println("adding " + file); | ||
writer.addDocument(doc); | ||
} else { | ||
// Existing index (an old copy of this document may have been indexed) so | ||
// we use updateDocument instead to replace the old one matching the exact | ||
// path, if present: | ||
System.out.println("updating " + file); | ||
writer.updateDocument(new Term("path", file.toString()), doc); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
IOUtils.close(vectorDict); | ||
} | ||
} |
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/usr/bin/env bash
#
# Compile the Java classes needed to build the Lucene index.
# Set the JAVAC environment variable to override the compiler location.

# Fail fast on errors and on use of unset variables.
set -eu

BASE=$(realpath "$(dirname "$0")/..")

export CLASSPATH="${BASE}/jars/lucene-core-9.1.0.jar:${BASE}/jars/lucene-demo-9.1.0.jar:${BASE}/jars/lucene-queryparser-9.1.0.jar:${BASE}/jars/lucene-analysis-common-9.1.0.jar:${BASE}"

if [ -z "${JAVAC:-}" ]; then
  if [ -x /opt/homebrew/Cellar/openjdk/18/bin/javac ]; then
    JAVAC=/opt/homebrew/Cellar/openjdk/18/bin/javac
  else
    # POSIX `command -v` instead of non-standard `which`.
    if ! JAVAC=$(command -v javac); then
      echo "ERROR: Unable to locate the javac command. Set environment variable, JAVAC to the path to the Java compiler." >&2
      exit 1
    fi
  fi
fi

cd "${BASE}"
"${JAVAC}" com/example/ppm/*.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env bash
#
# Build the Lucene index from a directory of text files, then tar it up.
# usage: generate-index.sh SOURCE_DIR INDEX_DIR
# Set the JAVA environment variable to override the JRE location.

# Fail fast: without this, a failed index run would still be tarred below.
set -eu

BASE=$(realpath "$(dirname "$0")/..")

if [ $# != 2 ]; then
  echo "usage: $0 <source directory> <directory to store index>" >&2
  exit 1
fi

SOURCE=$(realpath "$1")
INDEX=$(realpath "$2")/index
TARFILE=$(realpath "$2")/index.tar.gz

mkdir -p "$INDEX"

if [ ! -f "${BASE}/com/example/ppm/IndexFiles.class" ]; then
  echo "ERROR: Compile the classes first by running compile-classes.sh" >&2
  exit 1
fi

export CLASSPATH="${BASE}/jars/lucene-core-9.1.0.jar:${BASE}/jars/lucene-demo-9.1.0.jar:${BASE}/jars/lucene-queryparser-9.1.0.jar:${BASE}/jars/lucene-analysis-common-9.1.0.jar:${BASE}"

if [ -z "${JAVA:-}" ]; then
  if [ -x /opt/homebrew/Cellar/openjdk/18/bin/java ]; then
    JAVA=/opt/homebrew/Cellar/openjdk/18/bin/java
  else
    # POSIX `command -v` instead of non-standard `which`.
    if ! JAVA=$(command -v java); then
      echo "ERROR: Unable to locate the java command. Set environment variable, JAVA to the path to the JRE." >&2
      exit 1
    fi
  fi
fi

cd "${BASE}"
"${JAVA}" com.example.ppm.IndexFiles -index "${INDEX}" -docs "${SOURCE}" -knn_dict dict/glove.6B.300d.txt

# rm -f is already a no-op when the file is absent; no existence check needed.
rm -f "$TARFILE"

tar -zcvf "$TARFILE" -C "$INDEX" .
|
Oops, something went wrong.