Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor the parsing of the text index builder #1695

Merged
merged 19 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
b93fde4
Extra classes for Words- and Docsfile parsing
Flixtastic Dec 28, 2024
9c40084
Added method to tokenize and normalize at the same time.
Flixtastic Dec 28, 2024
c365935
Added the tokenization to the ql_utility namespace
Flixtastic Dec 28, 2024
479b763
Revert "Added the tokenization to the ql_utility namespace"
Flixtastic Dec 28, 2024
d0ec708
Used the custom InputRangeMixin to lazily tokenize and normalize word…
Flixtastic Jan 2, 2025
a7823fb
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 4, 2025
5f28add
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 6, 2025
f129ecd
Added comments and necessary tests to WordsAndDocsFileParser
Flixtastic Jan 8, 2025
b699551
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 8, 2025
1642175
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 9, 2025
8c8a1a1
Added comments to WordsAndDocsFileParser.h. Improved usability of te…
Flixtastic Jan 9, 2025
0369de6
Rewrite the tokenizer as a view.
joka921 Jan 10, 2025
c412983
Improved comment, addressed small requested changes
Flixtastic Jan 10, 2025
46fbb98
Addressed sonar issues
Flixtastic Jan 10, 2025
1e0fc14
Removed the temporary localeManagers in WordsAndDocsFileParserTest.cpp
Flixtastic Jan 10, 2025
9f9738c
Addressed more SonarQube problems
Flixtastic Jan 11, 2025
a55f2be
For now excluding helper functions from code coverage since they coul…
Flixtastic Jan 11, 2025
bea5936
Reverting last commit
Flixtastic Jan 11, 2025
349be6d
Small improvement
Flixtastic Jan 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/global/IndexTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*;
using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
using DocumentIndex = ad_utility::TypedIndex<uint64_t, "DocumentIndex">;
139 changes: 71 additions & 68 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,39 +17,22 @@
#include "backports/algorithm.h"
#include "engine/CallFixedSize.h"
#include "index/FTSAlgorithms.h"
#include "parser/ContextFileParser.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/Conversions.h"
#include "util/Simple8bCode.h"

namespace {

// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
// The `Find` function returns the next delimiter in `text` after the given
// `pos` or an empty substring if there is no next delimiter.
struct LiteralsTokenizationDelimiter {
absl::string_view Find(absl::string_view text, size_t pos) {
auto isWordChar = [](char c) -> bool { return std::isalnum(c); };
auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
if (found == text.end()) return text.substr(text.size());
return {found, found + 1};
}
};

} // namespace

// _____________________________________________________________________________
cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
std::string contextFile, bool addWordsFromLiterals) const {
auto localeManager = textVocab_.getLocaleManager();
// ROUND 1: If context file aka wordsfile is not empty, read words from there.
// Remember the last context id for the (optional) second round.
TextRecordIndex contextId = TextRecordIndex::make(0);
if (!contextFile.empty()) {
ContextFileParser::Line line;
ContextFileParser p(contextFile, localeManager);
WordsFileParser p(contextFile, localeManager);
ad_utility::HashSet<string> items;
while (p.getLine(line)) {
contextId = line._contextId;
for (auto& line : p) {
contextId = line.contextId_;
co_yield line;
}
if (contextId > TextRecordIndex::make(0)) {
Expand All @@ -65,22 +48,70 @@
if (!isLiteral(text)) {
continue;
}
ContextFileParser::Line entityLine{text, true, contextId, 1, true};
WordsFileLine entityLine{text, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{},
absl::SkipEmpty{})) {
auto wordNormalized = localeManager.getLowercaseUtf8(word);
ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1};
for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
WordsFileLine wordLine{std::move(word), false, contextId, 1};
co_yield wordLine;
}
contextId = contextId.incremented();
}
}
}

// _____________________________________________________________________________
void IndexImpl::processEntityCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
size_t& entityNotFoundErrorMsgCount) const {
VocabIndex eid;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
}
}

// _____________________________________________________________________________
void IndexImpl::processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}

Check warning on line 97 in src/index/IndexImpl.Text.cpp

View check run for this annotation

Codecov / codecov/patch

src/index/IndexImpl.Text.cpp#L94-L97

Added lines #L94 - L97 were not covered by tests
wordsInContext[wid] += line.score_;
}

// _____________________________________________________________________________
// Warn about an entity from the text input that is not in the KB. Only the
// first 20 occurrences are logged; afterwards only the counter is
// incremented, so `entityNotFoundErrorMsgCount` always reflects the total
// number of misses.
void IndexImpl::logEntityNotFound(const string& word,
                                  size_t& entityNotFoundErrorMsgCount) const {
  constexpr size_t maxWarnings = 20;
  if (entityNotFoundErrorMsgCount >= maxWarnings) {
    // Warning limit already reached, just count the occurrence.
    ++entityNotFoundErrorMsgCount;
    return;
  }
  LOG(WARN) << "Entity from text not in KB: " << word << '\n';
  if (++entityNotFoundErrorMsgCount == maxWarnings) {
    LOG(WARN) << "There are more entities not in the KB..."
              << " suppressing further warnings...\n";
  }
}

// _____________________________________________________________________________
void IndexImpl::addTextFromContextFile(const string& contextFile,
bool addWordsFromLiterals) {
Expand Down Expand Up @@ -214,12 +245,12 @@
for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
++numLines;
// LOG(INFO) << "LINE: "
// << std::setw(50) << line._word << " "
// << line._isEntity << "\t"
// << line._contextId.get() << "\t"
// << line._score << std::endl;
if (!line._isEntity) {
distinctWords.insert(line._word);
// << std::setw(50) << line.word_ << " "
// << line.isEntity_ << "\t"
// << line.contextId_.get() << "\t"
// << line.score_ << std::endl;
if (!line.isEntity_) {
distinctWords.insert(line.word_);
}
}
textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary");
Expand All @@ -243,49 +274,21 @@
size_t nofLiterals = 0;

for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
if (line._contextId != currentContext) {
if (line.contextId_ != currentContext) {
++nofContexts;
addContextToVector(writer, currentContext, wordsInContext,
entitiesInContext);
currentContext = line._contextId;
currentContext = line.contextId_;
wordsInContext.clear();
entitiesInContext.clear();
}
if (line._isEntity) {
if (line.isEntity_) {
++nofEntityPostings;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
VocabIndex eid;
if (getVocab().getId(line._word, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score;
if (line._isLiteralEntity) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line._word << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
}
} else {
entityNotFoundErrorMsgCount++;
}
}
processEntityCaseDuringInvertedListProcessing(
line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
} else {
++nofWordPostings;
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line._word, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line._word << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}
wordsInContext[wid] += line._score;
processWordCaseDuringInvertedListProcessing(line, wordsInContext);
}
}
if (entityNotFoundErrorMsgCount > 0) {
Expand Down
18 changes: 15 additions & 3 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#include "index/TextMetaData.h"
#include "index/Vocabulary.h"
#include "index/VocabularyMerger.h"
#include "parser/ContextFileParser.h"
#include "parser/RdfParser.h"
#include "parser/TripleComponent.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/BufferedVector.h"
#include "util/CancellationHandle.h"
#include "util/File.h"
Expand Down Expand Up @@ -521,8 +521,20 @@ class IndexImpl {
// TODO: So far, this is limited to the internal vocabulary (still in the
// testing phase, once it works, it should be easy to include the IRIs and
// literals from the external vocabulary as well).
cppcoro::generator<ContextFileParser::Line> wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals);
cppcoro::generator<WordsFileLine> wordsInTextRecords(
std::string contextFile, bool addWordsFromLiterals) const;

// Process one entity line of the wordsfile during the creation of the
// inverted lists: resolve the entity in the KB vocabulary and update the
// per-context entity scores (definition in IndexImpl.Text.cpp).
// NOTE: the parameter name was misspelled `entitiesInContxt`; fixed to match
// the definition (declaration-only parameter names don't affect callers).
void processEntityCaseDuringInvertedListProcessing(
    const WordsFileLine& line,
    ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
    size_t& entityNotFoundErrorMsgCount) const;

void processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;

void logEntityNotFound(const string& word,
size_t& entityNotFoundErrorMsgCount) const;

size_t processWordsForVocabulary(const string& contextFile,
bool addWordsFromLiterals);
Expand Down
2 changes: 1 addition & 1 deletion src/parser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ add_library(parser
ParsedQuery.cpp
RdfParser.cpp
Tokenizer.cpp
ContextFileParser.cpp
WordsAndDocsFileParser.cpp
TurtleTokenId.h
ParallelBuffer.cpp
SparqlParserHelpers.cpp
Expand Down
46 changes: 0 additions & 46 deletions src/parser/ContextFileParser.cpp

This file was deleted.

45 changes: 0 additions & 45 deletions src/parser/ContextFileParser.h

This file was deleted.

61 changes: 61 additions & 0 deletions src/parser/WordsAndDocsFileParser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])
// Felix Meisen ([email protected])

#include "parser/WordsAndDocsFileParser.h"

#include <cassert>
#include <charconv>

#include "util/Exception.h"
#include "util/StringUtils.h"

// _____________________________________________________________________________
// Open `wordsOrDocsFile` for reading and keep a copy of the `LocaleManager`
// that is later used to normalize (lowercase) parsed words.
// NOTE(review): no explicit check that opening the file succeeded is visible
// here -- presumably a failed stream simply yields an empty range; confirm
// whether an explicit error should be raised instead.
WordsAndDocsFileParser::WordsAndDocsFileParser(
    const string& wordsOrDocsFile, const LocaleManager& localeManager)
    : in_(wordsOrDocsFile), localeManager_(localeManager) {}

// _____________________________________________________________________________
// Read and parse the next line of the wordsfile. Returns `std::nullopt` when
// the input stream is exhausted. A wordsfile line has the tab-separated
// format `<word>\t<isEntity (0|1)>\t<contextId>\t<score>`.
ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
  WordsFileLine line;
  string l;
  if (!std::getline(getInputStream(), l)) {
    return std::nullopt;
  }
  // Parse an unsigned integer from exactly the characters of `sv`.
  // FIX: the previous code called `atol(sv.data())` on substrings of a
  // `string_view`, which is not bounded by the substring's length -- it only
  // worked because the character after each numeric field (a tab or the
  // terminating NUL of the underlying line) happened to stop the conversion.
  // `std::from_chars` with an explicit end pointer is bounded and
  // locale-independent.
  auto parseNumber = [](std::string_view sv) {
    uint64_t value = 0;
    std::from_chars(sv.data(), sv.data() + sv.size(), value);
    return value;
  };
  std::string_view lineView(l);
  // `i`, `j`, `k` are the positions of the first, second, and third tab.
  size_t i = lineView.find('\t');
  assert(i != string::npos);
  size_t j = i + 2;
  assert(j + 3 < lineView.size());
  size_t k = lineView.find('\t', j + 2);
  assert(k != string::npos);
  line.isEntity_ = (lineView[i + 1] == '1');
  // Entities are stored verbatim; ordinary words are lowercased.
  line.word_ =
      (line.isEntity_
           ? lineView.substr(0, i)
           : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i)));
  line.contextId_ =
      TextRecordIndex::make(parseNumber(lineView.substr(j + 1, k - j - 1)));
  line.score_ = static_cast<Score>(parseNumber(lineView.substr(k + 1)));
#ifndef NDEBUG
  // The wordsfile is required to be sorted by context id; verify in debug
  // builds.
  if (lastCId_ > line.contextId_) {
    AD_THROW("ContextFile has to be sorted by context Id.");
  }
  lastCId_ = line.contextId_;
#endif
  return line;
}

// _____________________________________________________________________________
// Read and parse the next line of the docsfile. Returns `std::nullopt` once
// the input stream is exhausted. A docsfile line has the tab-separated format
// `<docId>\t<docContent>`.
ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
  string currentLine;
  if (!std::getline(getInputStream(), currentLine)) {
    // End of file, the range is finished.
    return std::nullopt;
  }
  // Everything up to the first tab is the document id, the remainder is the
  // document text.
  size_t tabPos = currentLine.find('\t');
  assert(tabPos != string::npos);
  DocsFileLine line;
  line.docId_ =
      DocumentIndex::make(atol(currentLine.substr(0, tabPos).c_str()));
  line.docContent_ = currentLine.substr(tabPos + 1);
  return line;
}
Loading
Loading