Skip to content

Commit

Permalink
feat(dict): shorten the data struct used by Vocabulary
Browse files Browse the repository at this point in the history
This may reduce peak memory consumption by about 18% when compiling a dictionary.
  • Loading branch information
WhiredPlanck committed Jun 10, 2023
1 parent 2644f3c commit 210ab6c
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 26 deletions.
4 changes: 2 additions & 2 deletions src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,12 @@ bool DictCompiler::BuildTable(int table_index,
for (const auto& s : r->raw_code) {
code.push_back(syllable_to_id[s]);
}
DictEntryList* ls = vocabulary.LocateEntries(code);
auto ls = vocabulary.LocateEntries(code);
if (!ls) {
LOG(ERROR) << "Error locating entries in vocabulary.";
continue;
}
auto e = New<DictEntry>();
auto e = New<ShortDictEntry>();
e->code.swap(code);
e->text.swap(r->text);
e->weight = log(r->weight > 0 ? r->weight : DBL_EPSILON);
Expand Down
6 changes: 3 additions & 3 deletions src/rime/dict/table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ table::TailIndex* Table::BuildTailIndex(const Code& prefix,
return index;
}

Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
Array<table::Entry>* Table::BuildEntryArray(const ShortDictEntryList& entries) {
auto array = CreateArray<table::Entry>(entries.size());
if (!array) {
return NULL;
Expand All @@ -531,7 +531,7 @@ Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
return array;
}

bool Table::BuildEntryList(const DictEntryList& src,
bool Table::BuildEntryList(const ShortDictEntryList& src,
List<table::Entry>* dest) {
if (!dest)
return false;
Expand All @@ -549,7 +549,7 @@ bool Table::BuildEntryList(const DictEntryList& src,
return true;
}

bool Table::BuildEntry(const DictEntry& dict_entry, table::Entry* entry) {
bool Table::BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry) {
if (!entry)
return false;
if (!AddString(dict_entry.text, &entry->text, dict_entry.weight)) {
Expand Down
6 changes: 3 additions & 3 deletions src/rime/dict/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,9 @@ class Table : public MappedFile {
const Vocabulary& vocabulary);
bool BuildPhraseIndex(Code code, const Vocabulary& vocabulary,
map<string, int>* index_data);
Array<table::Entry>* BuildEntryArray(const DictEntryList& entries);
bool BuildEntryList(const DictEntryList& src, List<table::Entry>* dest);
bool BuildEntry(const DictEntry& dict_entry, table::Entry* entry);
Array<table::Entry>* BuildEntryArray(const ShortDictEntryList& entries);
bool BuildEntryList(const ShortDictEntryList& src, List<table::Entry>* dest);
bool BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry);

string GetString(const table::StringType& x);
bool AddString(const string& src, table::StringType* dest,
Expand Down
45 changes: 38 additions & 7 deletions src/rime/dict/vocabulary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// 2011-07-24 GONG Chen <[email protected]>
//
#include <algorithm>
#include <iterator>
#include <sstream>
#include <utility>
#include <rime/dict/vocabulary.h>
Expand Down Expand Up @@ -59,6 +60,18 @@ string Code::ToString() const {
return stream.str();
}

// Builds the compact ShortDictEntry counterpart of this entry by copying its
// text, code and weight; no other DictEntry field is carried over.
//
// Intentionally not `inline`: the member is declared in vocabulary.h without
// `inline` and defined only in this .cc file, so it needs an ordinary
// out-of-line definition that other translation units can link against.
//
// The fields are assigned one by one instead of using a braced return, so the
// function does not depend on ShortDictEntry being an aggregate (it is one
// only under C++14/17, given its default member initializer and its
// user-declared default constructor).
ShortDictEntry DictEntry::ToShort() const {
  ShortDictEntry short_entry;
  short_entry.text = text;
  short_entry.code = code;
  short_entry.weight = weight;
  return short_entry;
}

// Orders entries that share the same code by weight, heaviest first.
bool ShortDictEntry::operator< (const ShortDictEntry& other) const {
  // Entries of equal weight compare as equivalent: the tie-break on text
  // (text < other.text) is deliberately left out ("reduce carbon emission",
  // presumably to skip the string comparison).
  return weight > other.weight;
}

bool DictEntry::operator< (const DictEntry& other) const {
// Sort different entries sharing the same code by weight desc.
if (weight != other.weight)
Expand All @@ -72,16 +85,34 @@ inline bool dereference_less(const T& a, const T& b) {
return *a < *b;
}

template <typename C>
inline void sort(C &container) {
std::sort(std::begin(container), std::end(container), dereference_less<typename C::value_type>);
}

// Sorts the sub-range [start, start + count) of a container of pointer-like
// elements in place, ordering the elements by comparing the objects they
// point to (*a < *b).
//
// - If `start` is at or past the end of the container, nothing is sorted.
// - If the range would run past the end, it is clamped to the end.
//
// The clamp is computed as min(count, size - start) rather than by testing
// start + count, because that sum wraps around for very large `count`
// values and would yield an end iterator that precedes the begin iterator.
template <typename C>
inline void sort_range(C &container, size_t start, size_t count) {
  const size_t size = container.size();
  if (start >= size)
    return;
  const size_t length = std::min(count, size - start);
  const auto first = std::begin(container) + start;
  const auto last = first + length;
  using Element = typename C::value_type;
  std::sort(first, last,
            [](const Element& a, const Element& b) { return *a < *b; });
}

void ShortDictEntryList::Sort() {
sort(*this);
}

// Sorts `count` entries starting at index `start` in place by handing the
// list and the range to the file-local sort_range() helper.
void ShortDictEntryList::SortRange(size_t start, size_t count) {
  ShortDictEntryList& entries = *this;
  sort_range(entries, start, count);
}

// Sorts every entry of this list in place, ordering elements by comparing
// the objects they point to.
// NOTE(review): this hunk appears to show the pre-change line (direct
// std::sort call) next to the post-change line (shared sort() helper); both
// perform the same whole-list sort.
void DictEntryList::Sort() {
std::sort(begin(), end(), dereference_less<DictEntryList::value_type>);
sort(*this);
}

// Sorts `count` entries starting at index `start` in place, ordering
// elements by comparing the objects they point to. Nothing is sorted when
// `start` is at or past the end; a range running past the end is cut off at
// the end of the list.
// NOTE(review): this hunk appears to show the pre-change inline
// implementation next to the post-change call to the shared sort_range()
// helper; both express the same range sort.
void DictEntryList::SortRange(size_t start, size_t count) {
if (start >= size())
return;
iterator i(begin() + start);
iterator j(start + count >= size() ? end() : i + count);
std::sort(i, j, dereference_less<DictEntryList::value_type>);
sort_range(*this, start, count);
}

void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
Expand All @@ -96,7 +127,7 @@ void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
}
}

DictEntryList* Vocabulary::LocateEntries(const Code& code) {
ShortDictEntryList* Vocabulary::LocateEntries(const Code& code) {
Vocabulary* v = this;
size_t n = code.size();
for (size_t i = 0; i < n; ++i) {
Expand Down
20 changes: 18 additions & 2 deletions src/rime/dict/vocabulary.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ class Code : public vector<SyllableId> {
string ToString() const;
};

// Reduced dictionary entry holding only text, code and weight.
struct ShortDictEntry {
string text;
Code code; // multi-syllable code from prism
double weight = 0.0;

ShortDictEntry() = default;
// Ordering used when entries are sorted; defined in vocabulary.cc.
bool operator< (const ShortDictEntry& other) const;
};

struct DictEntry {
string text;
string comment;
Expand All @@ -41,9 +50,16 @@ struct DictEntry {
int remaining_code_length = 0;

DictEntry() = default;
ShortDictEntry ToShort() const;
bool operator< (const DictEntry& other) const;
};

// A vector of of<ShortDictEntry> elements with sorting helpers.
class ShortDictEntryList : public vector<of<ShortDictEntry>> {
public:
// Sorts the whole list in place.
void Sort();
// Sorts `count` elements starting at index `start` in place.
void SortRange(size_t start, size_t count);
};

class DictEntryList : public vector<of<DictEntry>> {
public:
void Sort();
Expand All @@ -64,13 +80,13 @@ class DictEntryFilterBinder {
class Vocabulary;

// One slot of a Vocabulary: the entries stored at this slot plus a link to a
// deeper Vocabulary level.
// NOTE(review): the two `entries` lines appear to be the pre-change and
// post-change declarations of the same member shown side by side.
struct VocabularyPage {
DictEntryList entries;
ShortDictEntryList entries;
// Next level of the vocabulary; presumably unset when there is no deeper
// level — confirm against the builders.
an<Vocabulary> next_level;
};

// Maps an int key to a VocabularyPage.
// NOTE(review): keys look like syllable ids taken from a Code — confirm.
class Vocabulary : public map<int, VocabularyPage> {
public:
// Returns a pointer to the entry list for `code`; callers test the result
// for null before using it.
// NOTE(review): the two declarations appear to be the pre-change and
// post-change return types of the same member shown side by side.
DictEntryList* LocateEntries(const Code& code);
ShortDictEntryList* LocateEntries(const Code& code);
void SortHomophones();
};

Expand Down
18 changes: 9 additions & 9 deletions test/table_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,34 +44,34 @@ rime::the<rime::Table> RimeTableTest::table_;

void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
rime::Vocabulary& voc) {
auto d = rime::New<rime::DictEntry>();
auto d = rime::New<rime::ShortDictEntry>();
syll.insert("0");
// no entries for '0', however
syll.insert("1");
d->code.push_back(1);
d->text = "yi";
d->weight = 1.0;
voc[1].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("2");
d->code.back() = 2;
d->text = "er";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "liang";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "lia";
voc[2].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("3");
d->code.back() = 3;
d->text = "san";
voc[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->text = "sa";
voc[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
syll.insert("4");
auto lv2 = rime::New<rime::Vocabulary>();
voc[1].next_level = lv2;
Expand All @@ -84,11 +84,11 @@ void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
d->code.push_back(3);
d->text = "yi-er-san";
(*lv3)[3].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->code.push_back(4);
d->text = "yi-er-san-si";
(*lv4)[-1].entries.push_back(d);
d = rime::New<rime::DictEntry>(*d);
d = rime::New<rime::ShortDictEntry>(*d);
d->code.resize(3);
d->code.push_back(2);
d->code.push_back(1);
Expand Down

0 comments on commit 210ab6c

Please sign in to comment.