Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor the parsing of the text index builder #1695

Merged
merged 19 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
b93fde4
Extra classes for Words- and Docsfile parsing
Flixtastic Dec 28, 2024
9c40084
Added method to tokenize and normalize at the same time.
Flixtastic Dec 28, 2024
c365935
Added the tokenization to the ql_utility namespace
Flixtastic Dec 28, 2024
479b763
Revert "Added the tokenization to the ql_utility namespace"
Flixtastic Dec 28, 2024
d0ec708
Used the custom InputRangeMixin to lazily tokenize and normalize word…
Flixtastic Jan 2, 2025
a7823fb
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 4, 2025
5f28add
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 6, 2025
f129ecd
Added comments and necessary tests to WordsAndDocsFileParser
Flixtastic Jan 8, 2025
b699551
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 8, 2025
1642175
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 9, 2025
8c8a1a1
Added comments to WordsAndDocsFileParser.h. Improved usability of te…
Flixtastic Jan 9, 2025
0369de6
Rewrite the tokenizer as a view.
joka921 Jan 10, 2025
c412983
Improved comment, addressed small requested changes
Flixtastic Jan 10, 2025
46fbb98
Addressed sonar issues
Flixtastic Jan 10, 2025
1e0fc14
Removed the temporary localeManagers in WordsAndDocsFileParserTest.cpp
Flixtastic Jan 10, 2025
9f9738c
Addressed more SonarQube problems
Flixtastic Jan 11, 2025
a55f2be
For now excluding helper functions from code coverage since they coul…
Flixtastic Jan 11, 2025
bea5936
Reverting last commit
Flixtastic Jan 11, 2025
349be6d
Small improvement
Flixtastic Jan 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/global/IndexTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ using LocalVocabIndex = const LocalVocabEntry*;
using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
using DocumentIndex = ad_utility::TypedIndex<uint64_t, "DocumentIndex">;
139 changes: 71 additions & 68 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,39 +17,22 @@
#include "backports/algorithm.h"
#include "engine/CallFixedSize.h"
#include "index/FTSAlgorithms.h"
#include "parser/ContextFileParser.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/Conversions.h"
#include "util/Simple8bCode.h"

namespace {

// Custom delimiter class for tokenization of literals using `absl::StrSplit`.
// The `Find` function returns the next delimiter in `text` after the given
// `pos` or an empty substring if there is no next delimiter.
struct LiteralsTokenizationDelimiter {
absl::string_view Find(absl::string_view text, size_t pos) {
auto isWordChar = [](char c) -> bool { return std::isalnum(c); };
auto found = std::find_if_not(text.begin() + pos, text.end(), isWordChar);
if (found == text.end()) return text.substr(text.size());
return {found, found + 1};
}
};

} // namespace

// _____________________________________________________________________________
cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
std::string contextFile, bool addWordsFromLiterals) const {
auto localeManager = textVocab_.getLocaleManager();
// ROUND 1: If context file aka wordsfile is not empty, read words from there.
// Remember the last context id for the (optional) second round.
TextRecordIndex contextId = TextRecordIndex::make(0);
if (!contextFile.empty()) {
ContextFileParser::Line line;
ContextFileParser p(contextFile, localeManager);
WordsFileParser p(contextFile, localeManager);
ad_utility::HashSet<string> items;
while (p.getLine(line)) {
contextId = line._contextId;
for (auto& line : p) {
contextId = line.contextId_;
co_yield line;
}
if (contextId > TextRecordIndex::make(0)) {
Expand All @@ -65,22 +48,70 @@
if (!isLiteral(text)) {
continue;
}
ContextFileParser::Line entityLine{text, true, contextId, 1, true};
WordsFileLine entityLine{text, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
for (auto word : absl::StrSplit(textView, LiteralsTokenizationDelimiter{},
absl::SkipEmpty{})) {
auto wordNormalized = localeManager.getLowercaseUtf8(word);
ContextFileParser::Line wordLine{wordNormalized, false, contextId, 1};
for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
WordsFileLine wordLine{std::move(word), false, contextId, 1};
co_yield wordLine;
}
contextId = contextId.incremented();
}
}
}

// _____________________________________________________________________________
void IndexImpl::processEntityCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
size_t& entityNotFoundErrorMsgCount) const {
VocabIndex eid;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
}
}

// _____________________________________________________________________________
void IndexImpl::processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}

Check warning on line 97 in src/index/IndexImpl.Text.cpp

View check run for this annotation

Codecov / codecov/patch

src/index/IndexImpl.Text.cpp#L94-L97

Added lines #L94 - L97 were not covered by tests
wordsInContext[wid] += line.score_;
}

// _____________________________________________________________________________
// Warn about an entity from the text input that is not in the KB. Only the
// first 20 occurrences are logged; afterwards only the counter is
// incremented, so `entityNotFoundErrorMsgCount` always reflects the total
// number of misses.
void IndexImpl::logEntityNotFound(const string& word,
                                  size_t& entityNotFoundErrorMsgCount) const {
  constexpr size_t maxWarnings = 20;
  if (entityNotFoundErrorMsgCount >= maxWarnings) {
    // Warning limit already reached, just count the occurrence.
    ++entityNotFoundErrorMsgCount;
    return;
  }
  LOG(WARN) << "Entity from text not in KB: " << word << '\n';
  if (++entityNotFoundErrorMsgCount == maxWarnings) {
    LOG(WARN) << "There are more entities not in the KB..."
              << " suppressing further warnings...\n";
  }
}

// _____________________________________________________________________________
void IndexImpl::addTextFromContextFile(const string& contextFile,
bool addWordsFromLiterals) {
Expand Down Expand Up @@ -214,12 +245,12 @@
for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
++numLines;
// LOG(INFO) << "LINE: "
// << std::setw(50) << line._word << " "
// << line._isEntity << "\t"
// << line._contextId.get() << "\t"
// << line._score << std::endl;
if (!line._isEntity) {
distinctWords.insert(line._word);
// << std::setw(50) << line.word_ << " "
// << line.isEntity_ << "\t"
// << line.contextId_.get() << "\t"
// << line.score_ << std::endl;
if (!line.isEntity_) {
distinctWords.insert(line.word_);
}
}
textVocab_.createFromSet(distinctWords, onDiskBase_ + ".text.vocabulary");
Expand All @@ -243,49 +274,21 @@
size_t nofLiterals = 0;

for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
if (line._contextId != currentContext) {
if (line.contextId_ != currentContext) {
++nofContexts;
addContextToVector(writer, currentContext, wordsInContext,
entitiesInContext);
currentContext = line._contextId;
currentContext = line.contextId_;
wordsInContext.clear();
entitiesInContext.clear();
}
if (line._isEntity) {
if (line.isEntity_) {
++nofEntityPostings;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
VocabIndex eid;
if (getVocab().getId(line._word, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score;
if (line._isLiteralEntity) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line._word << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
}
} else {
entityNotFoundErrorMsgCount++;
}
}
processEntityCaseDuringInvertedListProcessing(
line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
} else {
++nofWordPostings;
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line._word, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line._word << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}
wordsInContext[wid] += line._score;
processWordCaseDuringInvertedListProcessing(line, wordsInContext);
}
}
if (entityNotFoundErrorMsgCount > 0) {
Expand Down
18 changes: 15 additions & 3 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#include "index/TextMetaData.h"
#include "index/Vocabulary.h"
#include "index/VocabularyMerger.h"
#include "parser/ContextFileParser.h"
#include "parser/RdfParser.h"
#include "parser/TripleComponent.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/BufferedVector.h"
#include "util/CancellationHandle.h"
#include "util/File.h"
Expand Down Expand Up @@ -521,8 +521,20 @@ class IndexImpl {
// TODO: So far, this is limited to the internal vocabulary (still in the
// testing phase, once it works, it should be easy to include the IRIs and
// literals from the external vocabulary as well).
cppcoro::generator<ContextFileParser::Line> wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals);
cppcoro::generator<WordsFileLine> wordsInTextRecords(
std::string contextFile, bool addWordsFromLiterals) const;

// Process one entity line of the wordsfile during the creation of the
// inverted lists: resolve the entity in the KB vocabulary and update the
// per-context entity scores (definition in IndexImpl.Text.cpp).
// NOTE: the parameter name was misspelled `entitiesInContxt`; fixed to match
// the definition (declaration-only parameter names don't affect callers).
void processEntityCaseDuringInvertedListProcessing(
    const WordsFileLine& line,
    ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
    size_t& entityNotFoundErrorMsgCount) const;

void processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;

void logEntityNotFound(const string& word,
size_t& entityNotFoundErrorMsgCount) const;

size_t processWordsForVocabulary(const string& contextFile,
bool addWordsFromLiterals);
Expand Down
2 changes: 1 addition & 1 deletion src/parser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ add_library(parser
ParsedQuery.cpp
RdfParser.cpp
Tokenizer.cpp
ContextFileParser.cpp
WordsAndDocsFileParser.cpp
TurtleTokenId.h
ParallelBuffer.cpp
SparqlParserHelpers.cpp
Expand Down
46 changes: 0 additions & 46 deletions src/parser/ContextFileParser.cpp

This file was deleted.

45 changes: 0 additions & 45 deletions src/parser/ContextFileParser.h

This file was deleted.

61 changes: 61 additions & 0 deletions src/parser/WordsAndDocsFileParser.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])
// Felix Meisen ([email protected])

#include "parser/WordsAndDocsFileParser.h"

#include <cassert>
#include <charconv>

#include "util/Exception.h"
#include "util/StringUtils.h"

// _____________________________________________________________________________
// Open `wordsOrDocsFile` for reading and keep a copy of the `LocaleManager`
// that is later used to normalize (lowercase) parsed words.
// NOTE(review): no explicit check that opening the file succeeded is visible
// here -- presumably a failed stream simply yields an empty range; confirm
// whether an explicit error should be raised instead.
WordsAndDocsFileParser::WordsAndDocsFileParser(
    const string& wordsOrDocsFile, const LocaleManager& localeManager)
    : in_(wordsOrDocsFile), localeManager_(localeManager) {}

// _____________________________________________________________________________
// Read and parse the next line of the wordsfile. Returns `std::nullopt` when
// the input stream is exhausted. A wordsfile line has the tab-separated
// format `<word>\t<isEntity (0|1)>\t<contextId>\t<score>`.
ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
  WordsFileLine line;
  string l;
  if (!std::getline(getInputStream(), l)) {
    return std::nullopt;
  }
  // Parse an unsigned integer from exactly the characters of `sv`.
  // FIX: the previous code called `atol(sv.data())` on substrings of a
  // `string_view`, which is not bounded by the substring's length -- it only
  // worked because the character after each numeric field (a tab or the
  // terminating NUL of the underlying line) happened to stop the conversion.
  // `std::from_chars` with an explicit end pointer is bounded and
  // locale-independent.
  auto parseNumber = [](std::string_view sv) {
    uint64_t value = 0;
    std::from_chars(sv.data(), sv.data() + sv.size(), value);
    return value;
  };
  std::string_view lineView(l);
  // `i`, `j`, `k` are the positions of the first, second, and third tab.
  size_t i = lineView.find('\t');
  assert(i != string::npos);
  size_t j = i + 2;
  assert(j + 3 < lineView.size());
  size_t k = lineView.find('\t', j + 2);
  assert(k != string::npos);
  line.isEntity_ = (lineView[i + 1] == '1');
  // Entities are stored verbatim; ordinary words are lowercased.
  line.word_ =
      (line.isEntity_
           ? lineView.substr(0, i)
           : getLocaleManager().getLowercaseUtf8(lineView.substr(0, i)));
  line.contextId_ =
      TextRecordIndex::make(parseNumber(lineView.substr(j + 1, k - j - 1)));
  line.score_ = static_cast<Score>(parseNumber(lineView.substr(k + 1)));
#ifndef NDEBUG
  // The wordsfile is required to be sorted by context id; verify in debug
  // builds.
  if (lastCId_ > line.contextId_) {
    AD_THROW("ContextFile has to be sorted by context Id.");
  }
  lastCId_ = line.contextId_;
#endif
  return line;
}

// _____________________________________________________________________________
// Read and parse the next line of the docsfile. Returns `std::nullopt` once
// the input stream is exhausted. A docsfile line has the tab-separated format
// `<docId>\t<docContent>`.
ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
  string currentLine;
  if (!std::getline(getInputStream(), currentLine)) {
    // End of file, the range is finished.
    return std::nullopt;
  }
  // Everything up to the first tab is the document id, the remainder is the
  // document text.
  size_t tabPos = currentLine.find('\t');
  assert(tabPos != string::npos);
  DocsFileLine line;
  line.docId_ =
      DocumentIndex::make(atol(currentLine.substr(0, tabPos).c_str()));
  line.docContent_ = currentLine.substr(tabPos + 1);
  return line;
}
Loading
Loading