Skip to content

Commit

Permalink
Committing to GitHub
Browse files Browse the repository at this point in the history
  • Loading branch information
jclarke-csco committed May 25, 2024
0 parents commit 1a74435
Show file tree
Hide file tree
Showing 12 changed files with 401 additions and 0 deletions.
Empty file added .gitattributes
Empty file.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
**/*.class
.vscode
venv
docs-index
dict/glove.6B.300d.txt
**/__pycache__
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# DevNet Expert Search Backend

This repo contains the code to generate the Lucene index for the DevNet Expert
search engine. Due to the index's size, it is NOT included here.

## Prerequisites

The index generation code requires a modern Java compiler and runtime engine.

Before building the index, download the `glove.6B.300d.txt` dictionary file
(e.g., from https://www.kaggle.com/datasets/thanakomsn/glove6b300dtxt) and place
it in the `dict` subdirectory. This file must be named `glove.6B.300d.txt`.

### Compile the Classes

Run the script `scripts/compile-classes.sh` to compile the Java classes needed to build
the index. These files are taken from the example Lucene search engine code.

## Generating the Index

To generate the index, pick a target directory containing _text_ files to index. Then run
the command `scripts/generate-index.sh SOURCE_DIR INDEX_DIR` where `SOURCE_DIR` is the
path to the files to index and `INDEX_DIR` is the directory into which the index will
be placed.

When configuring the frontend, point to the index location. While you won't have the
same documentation that the exam has, you can get a good sense of how the search engine
works and how its results look.
279 changes: 279 additions & 0 deletions com/example/ppm/IndexFiles.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.example.ppm;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.knn.DemoEmbeddings;
import org.apache.lucene.demo.knn.KnnVectorDict;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.KnnVectorField;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.IOUtils;

/**
 * Index all text files under a directory.
 *
 * <p>This is a command-line application demonstrating simple Lucene indexing. Run it with no
 * command-line arguments for usage information.
 */
public class IndexFiles implements AutoCloseable {
  /** Name under which the KnnVector dictionary is stored inside the index directory. */
  static final String KNN_DICT = "knn-dict";

  // Calculates embedding vectors for KnnVector search. Both fields are null when no
  // -knn_dict dictionary was supplied; in that case no vector field is added to documents.
  private final DemoEmbeddings demoEmbeddings;
  private final KnnVectorDict vectorDict;

  /**
   * @param vectorDict optional KnnVector dictionary; may be null to disable vector indexing
   * @throws IOException if the embeddings model cannot be initialized from the dictionary
   */
  private IndexFiles(KnnVectorDict vectorDict) throws IOException {
    if (vectorDict != null) {
      this.vectorDict = vectorDict;
      demoEmbeddings = new DemoEmbeddings(vectorDict);
    } else {
      this.vectorDict = null;
      demoEmbeddings = null;
    }
  }

  /** Index all text files under a directory. */
  public static void main(String[] args) throws Exception {
    String usage =
        "java com.example.ppm.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update] [-knn_dict DICT_PATH]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles\n"
            + "IF DICT_PATH contains a KnnVector dictionary, the index will also support KnnVector search";
    String indexPath = "index";
    String docsPath = null;
    String vectorDictSource = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
      switch (args[i]) {
        case "-index":
          indexPath = args[++i];
          break;
        case "-docs":
          docsPath = args[++i];
          break;
        case "-knn_dict":
          vectorDictSource = args[++i];
          break;
        case "-update":
          create = false;
          break;
        case "-create":
          create = true;
          break;
        default:
          throw new IllegalArgumentException("unknown parameter " + args[i]);
      }
    }

    if (docsPath == null) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
      System.out.println(
          "Document directory '"
              + docDir.toAbsolutePath()
              + "' does not exist or is not readable, please check the path");
      System.exit(1);
    }

    long start = System.currentTimeMillis();
    try (Directory dir = FSDirectory.open(Paths.get(indexPath))) {
      System.out.println("Indexing to directory '" + indexPath + "'...");

      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

      if (create) {
        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
      } else {
        // Add new documents to an existing index:
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      }

      // Optional: for better indexing performance, if you
      // are indexing many documents, increase the RAM
      // buffer. But if you do this, increase the max heap
      // size to the JVM (eg add -Xmx512m or -Xmx1g):
      //
      iwc.setRAMBufferSizeMB(256.0);

      KnnVectorDict vectorDictInstance = null;
      if (vectorDictSource != null) {
        // Build the vector dictionary inside the index directory so searchers can find it.
        KnnVectorDict.build(Paths.get(vectorDictSource), dir, KNN_DICT);
        vectorDictInstance = new KnnVectorDict(dir, KNN_DICT);
      }

      try (IndexWriter writer = new IndexWriter(dir, iwc);
          IndexFiles indexFiles = new IndexFiles(vectorDictInstance)) {
        indexFiles.indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);
      } finally {
        IOUtils.close(vectorDictInstance);
      }

      long end = System.currentTimeMillis();
      // Re-open the finished index read-only just to report the document count.
      try (IndexReader reader = DirectoryReader.open(dir)) {
        System.out.println(
            "Indexed "
                + reader.numDocs()
                + " documents in "
                + (end - start)
                + " milliseconds");
      }
    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
  }

  /**
   * Indexes the given file using the given writer, or if a directory is given, recurses over files
   * and directories found under the given directory.
   *
   * <p>NOTE: This method indexes one document per input file. This is slow. For good throughput,
   * put multiple documents into your input file(s). An example of this is in the benchmark module,
   * which can create "line doc" files, one document per line, using the <a
   * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
   * >WriteLineDocTask</a>.
   *
   * @param writer Writer to the index where the given file/dir info will be stored
   * @param path The file to index, or the directory to recurse into to find files to index
   * @throws IOException If there is a low-level I/O error
   */
  void indexDocs(final IndexWriter writer, Path path) throws IOException {
    if (Files.isDirectory(path)) {
      Files.walkFileTree(
          path,
          new SimpleFileVisitor<>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
              try {
                indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
              } catch (IOException e) {
                // Report but don't abort the walk: skip files that can't be read.
                e.printStackTrace(System.err);
              }
              return FileVisitResult.CONTINUE;
            }
          });
    } else {
      indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis());
    }
  }

  /**
   * Indexes a single document.
   *
   * @param writer index writer that receives the new or updated document
   * @param file file whose path, mtime, and contents are indexed
   * @param lastModified last-modified time of {@code file} in epoch milliseconds
   * @throws IOException if the file cannot be opened or written to the index
   */
  void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
      // make a new, empty document
      Document doc = new Document();

      // Add the path of the file as a field named "path". Use a
      // field that is indexed (i.e. searchable), but don't tokenize
      // the field into separate words and don't index term frequency
      // or positional information:
      Field pathField = new StringField("path", file.toString(), Field.Store.YES);
      doc.add(pathField);

      // Add the last modified date of the file a field named "modified".
      // Use a LongPoint that is indexed (i.e. efficiently filterable with
      // PointRangeQuery). This indexes to milli-second resolution, which
      // is often too fine. You could instead create a number based on
      // year/month/day/hour/minutes/seconds, down the resolution you require.
      // For example the long value 2011021714 would mean
      // February 17, 2011, 2-3 PM.
      doc.add(new LongPoint("modified", lastModified));

      // Add the contents of the file to a field named "contents". Specify a Reader,
      // so that the text of the file is tokenized and indexed, but not stored.
      // The file is assumed to be in UTF-8 encoding; if that's not the case,
      // searching for special characters will fail.
      doc.add(
          new TextField(
              "contents",
              new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

      if (demoEmbeddings != null) {
        // Second pass over the file: compute an embedding vector for KnnVector search.
        try (InputStream in = Files.newInputStream(file)) {
          float[] vector =
              demoEmbeddings.computeEmbedding(
                  new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8)));
          doc.add(
              new KnnVectorField("contents-vector", vector, VectorSimilarityFunction.DOT_PRODUCT));
        }
      }

      if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        // New index, so we just add the document (no old document can be there):
        System.out.println("adding " + file);
        writer.addDocument(doc);
      } else {
        // Existing index (an old copy of this document may have been indexed) so
        // we use updateDocument instead to replace the old one matching the exact
        // path, if present:
        System.out.println("updating " + file);
        writer.updateDocument(new Term("path", file.toString()), doc);
      }
    }
  }

  /** Releases the KnnVector dictionary, if one was opened. */
  @Override
  public void close() throws IOException {
    IOUtils.close(vectorDict);
  }
}
Empty file added dict/.keep_me
Empty file.
Binary file added jars/lucene-analysis-common-9.1.0.jar
Binary file not shown.
Binary file added jars/lucene-core-9.1.0.jar
Binary file not shown.
Binary file added jars/lucene-demo-9.1.0.jar
Binary file not shown.
Binary file added jars/lucene-queryparser-9.1.0.jar
Binary file not shown.
20 changes: 20 additions & 0 deletions scripts/compile-classes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
#
# Compile the Java indexer classes against the bundled Lucene 9.1.0 jars.
# Set the JAVAC environment variable to override the compiler location.

# Quote all path expansions so the script works when the repo path contains spaces.
BASE=$(realpath "$(dirname "$0")/..")

export CLASSPATH="${BASE}/jars/lucene-core-9.1.0.jar:${BASE}/jars/lucene-demo-9.1.0.jar:${BASE}/jars/lucene-queryparser-9.1.0.jar:${BASE}/jars/lucene-analysis-common-9.1.0.jar:${BASE}"

if [ -z "${JAVAC}" ]; then
    # Prefer the Homebrew OpenJDK 18 install if present, else fall back to PATH.
    if [ -x /opt/homebrew/Cellar/openjdk/18/bin/javac ]; then
        JAVAC=/opt/homebrew/Cellar/openjdk/18/bin/javac
    else
        # command -v is the POSIX replacement for the non-portable `which`.
        JAVAC=$(command -v javac)
        if [ $? != 0 ]; then
            echo "ERROR: Unable to locate the javac command. Set environment variable, JAVAC to the path to the Java compiler." >&2
            exit 1
        fi
    fi
fi

cd "${BASE}" || exit 1
"${JAVAC}" com/example/ppm/*.java
44 changes: 44 additions & 0 deletions scripts/generate-index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
#
# Build a Lucene index from a directory of text files, then package it as a
# tarball next to the index. Set the JAVA environment variable to override
# the JRE location.

# Quote all path expansions so the script works when paths contain spaces.
BASE=$(realpath "$(dirname "$0")/..")

if [ $# != 2 ]; then
    echo "usage: $0 <source directory> <directory to store index>" >&2
    exit 1
fi

SOURCE=$(realpath "$1")
INDEX=$(realpath "$2")/index
TARFILE=$(realpath "$2")/index.tar.gz

mkdir -p "${INDEX}"

if [ ! -f "${BASE}/com/example/ppm/IndexFiles.class" ]; then
    echo "ERROR: Compile the classes first by running compile-classes.sh" >&2
    exit 1
fi

export CLASSPATH="${BASE}/jars/lucene-core-9.1.0.jar:${BASE}/jars/lucene-demo-9.1.0.jar:${BASE}/jars/lucene-queryparser-9.1.0.jar:${BASE}/jars/lucene-analysis-common-9.1.0.jar:${BASE}"

if [ -z "${JAVA}" ]; then
    # Prefer the Homebrew OpenJDK 18 install if present, else fall back to PATH.
    if [ -x /opt/homebrew/Cellar/openjdk/18/bin/java ]; then
        JAVA=/opt/homebrew/Cellar/openjdk/18/bin/java
    else
        # command -v is the POSIX replacement for the non-portable `which`.
        JAVA=$(command -v java)
        if [ $? != 0 ]; then
            echo "ERROR: Unable to locate the java command. Set environment variable, JAVA to the path to the JRE." >&2
            exit 1
        fi
    fi
fi

cd "${BASE}" || exit 1

# Abort before tarring if indexing fails, so we never package a broken index.
"${JAVA}" com.example.ppm.IndexFiles -index "${INDEX}" -docs "${SOURCE}" -knn_dict dict/glove.6B.300d.txt || exit 1

# rm -f is a no-op when the file doesn't exist, so no existence check is needed.
rm -f "${TARFILE}"

tar -zcvf "${TARFILE}" -C "${INDEX}" .

Loading

0 comments on commit 1a74435

Please sign in to comment.