From db5b57e4ba832dd880dde1622836e3cff258608b Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Fri, 10 Dec 2021 12:58:36 +0100 Subject: [PATCH] fix: cspell-tools - limit memory usage when building dictionaries Compiling the Estonian dictionary would cause node to run out of keys due to caching highly repetitive suffixes. --- cspell-dict.txt | 1 + packages/cspell-trie-lib/cspell.json | 1 + .../src/lib/TrieBuilder.test.ts | 48 ++++++++++++++++++- .../cspell-trie-lib/src/lib/TrieBuilder.ts | 48 ++++++++++++++++++- packages/cspell-trie-lib/src/lib/index.ts | 2 +- 5 files changed, 97 insertions(+), 3 deletions(-) diff --git a/cspell-dict.txt b/cspell-dict.txt index 90cf90d1767f..f9e89b4ed0c0 100644 --- a/cspell-dict.txt +++ b/cspell-dict.txt @@ -47,6 +47,7 @@ repo repos retryable serializers +sigs specberus streetsidesoftware submodule diff --git a/packages/cspell-trie-lib/cspell.json b/packages/cspell-trie-lib/cspell.json index b7b5cd00d869..f6c41f2d2764 100644 --- a/packages/cspell-trie-lib/cspell.json +++ b/packages/cspell-trie-lib/cspell.json @@ -3,6 +3,7 @@ "tsbuildinfo" ], "import": [ + "../../cspell.json", "@cspell/dict-es-es/cspell-ext.json" ] } diff --git a/packages/cspell-trie-lib/src/lib/TrieBuilder.test.ts b/packages/cspell-trie-lib/src/lib/TrieBuilder.test.ts index 31faaf518758..7a5fcc7ab887 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBuilder.test.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBuilder.test.ts @@ -1,5 +1,8 @@ import { countNodes, isCircular } from './util'; -import { TrieBuilder, buildTrie } from './TrieBuilder'; +import { TrieBuilder, buildTrie, __testing__ } from './TrieBuilder'; +import { TrieNode } from '.'; + +const { trimSignatures, trimMap } = __testing__; describe('Validate TrieBuilder', () => { test('builder explicit consolidateSuffixes', () => { @@ -31,6 +34,49 @@ describe('Validate TrieBuilder', () => { const trie = buildTrie(sampleWords); expect([...trie.words()]).toEqual(sampleWords.sort()); }); + + test('trimSignatures', () => { + 
const n: TrieNode = {}; + const sigs = sampleWords; + const soloSigs = sigs.filter((_, i) => !!(i & 1)); + const signatures = new Map(sigs.map((w) => [w, n])); + const solo = new Set(soloSigs); + + // verify preconditions + expect(signatures.size).toBe(sigs.length); + expect(solo.size).toBe(soloSigs.length); + + // Nothing should change, solo is within bounds. + trimSignatures(signatures, solo, sampleWords.length); + expect(signatures.size).toBe(sigs.length); + expect(solo.size).toBe(soloSigs.length); + + // trim and make sure the newest values are left. + trimSignatures(signatures, solo, 5, 10); + expect(signatures.size).toBe(sigs.length - soloSigs.length + 5); + expect(solo.size).toBe(5); + // verify newest are left + expect([...solo]).toEqual(soloSigs.slice(-5)); + }); + + test('trimMap', () => { + const n: TrieNode = {}; + const values = sampleWords; + const mapOfValues = new Map(values.map((w) => [w, n])); + + // verify preconditions + expect(mapOfValues.size).toBe(values.length); + + // Nothing should change, the map is within bounds. + trimMap(mapOfValues, sampleWords.length); + expect(mapOfValues.size).toBe(values.length); + + // trim and make sure the newest values are left. 
+ trimMap(mapOfValues, 5, 10); + expect(mapOfValues.size).toBe(5); + // verify newest are left + expect([...mapOfValues.keys()]).toEqual(values.slice(-5)); + }); }); const sampleWords = [ diff --git a/packages/cspell-trie-lib/src/lib/TrieBuilder.ts b/packages/cspell-trie-lib/src/lib/TrieBuilder.ts index da13f486dbce..72c81a24030b 100644 --- a/packages/cspell-trie-lib/src/lib/TrieBuilder.ts +++ b/packages/cspell-trie-lib/src/lib/TrieBuilder.ts @@ -30,9 +30,15 @@ interface PathNode { n: TrieNode; } +// cspell:words sigs +const MAX_NUM_SOLO_SIGS = 100000; +const MAX_TRANSFORMS = 1000000; +const CACHE_PADDING = 1000; + export class TrieBuilder { private count = 0; private readonly signatures = new Map(); + private readonly soloSignatures = new Set(); private readonly cached = new Map(); private readonly transforms = new Map>(); private _eow: TrieNode = Object.freeze({ f: 1 }); @@ -100,9 +106,11 @@ export class TrieBuilder { const sig = this.signature(n); const ref = this.signatures.get(sig); if (ref !== undefined) { + this.soloSignatures.delete(sig); return this.tryCacheFrozen(ref); } - + this.soloSignatures.add(sig); + trimSignatures(this.signatures, this.soloSignatures, MAX_NUM_SOLO_SIGS); this.signatures.set(sig, this.freeze(n)); return n; } @@ -111,6 +119,7 @@ export class TrieBuilder { if (!Object.isFrozen(result) || !Object.isFrozen(src)) return; const t = this.transforms.get(src) ?? new Map(); t.set(s, result); + trimMap(this.transforms, MAX_TRANSFORMS); this.transforms.set(src, t); } @@ -209,6 +218,10 @@ export class TrieBuilder { this._root = createTrieRoot(this.trieOptions); this.cached.clear(); this.signatures.clear(); + this.signatures.set(this.signature(this._eow), this._eow); + this.soloSignatures.clear(); + this.count = 0; + this.cached.set(this._eow, this.count++); } build(consolidateSuffixes = false): Trie { @@ -224,3 +237,36 @@ function copyIfFrozen(n: TrieNode): TrieNode { const c = n.c ? 
new Map(n.c) : undefined; return { f: n.f, c }; } + +function trimSignatures( + signatures: Map, + soloSignatures: Set, + size: number, + padding = CACHE_PADDING +): void { + if (soloSignatures.size >= size + padding) { + for (const soloSig of soloSignatures) { + signatures.delete(soloSig); + soloSignatures.delete(soloSig); + if (soloSignatures.size <= size) { + break; + } + } + } +} + +function trimMap(map: Map, size: number, padding = CACHE_PADDING) { + if (map.size >= size + padding) { + for (const key of map.keys()) { + map.delete(key); + if (map.size <= size) { + break; + } + } + } +} + +export const __testing__ = { + trimSignatures, + trimMap, +}; diff --git a/packages/cspell-trie-lib/src/lib/index.ts b/packages/cspell-trie-lib/src/lib/index.ts index 7dfa55a59e1c..06f3c53d3c4d 100644 --- a/packages/cspell-trie-lib/src/lib/index.ts +++ b/packages/cspell-trie-lib/src/lib/index.ts @@ -3,7 +3,7 @@ export { TrieNode, FLAG_WORD, ChildMap, TrieRoot } from './TrieNode'; export * from './util'; export * from './walker'; export * from './importExport'; -export * from './TrieBuilder'; +export { buildTrie, buildTrieFast, TrieBuilder } from './TrieBuilder'; export * from './consolidate'; export { SuggestionResult, MaxCost, suggestionCollector, SuggestionCollector } from './suggestCollector'; export { parseDictionaryLines, parseDictionary } from './SimpleDictionaryParser';