From 9b13a0a3badcc63b339fb19feac3cc913829503c Mon Sep 17 00:00:00 2001 From: "Jonah H. Harris" Date: Tue, 20 Apr 2021 13:16:48 -0400 Subject: [PATCH 1/4] Add support for count parameter in TopK add method. --- src/sketch/topk.ts | 4 ++-- test/topk-test.js | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/src/sketch/topk.ts b/src/sketch/topk.ts index 69b791b..60f5e94 100644 --- a/src/sketch/topk.ts +++ b/src/sketch/topk.ts @@ -175,8 +175,8 @@ export default class TopK extends BaseFilter { * Add an element to the TopK * @param element - Element to add */ - add (element: string): void { - this._sketch.update(element) + add (element: string, count: number = 1): void { + this._sketch.update(element, count) const frequency = this._sketch.count(element) if (this._heap.length < this._k || frequency >= this._heap.get(0)!.frequency) { diff --git a/test/topk-test.js b/test/topk-test.js index 65d5b78..37cb8c3 100644 --- a/test/topk-test.js +++ b/test/topk-test.js @@ -42,6 +42,65 @@ describe('TopK', () => { const expectedTop = ['alice', 'bob', 'carol'] + describe('#add', () => { + it('should produce equivalent TopK estimations when using count parameter', () => { + const k = 3 + const errorRate = 0.001 + const accuracy = 0.999 + let freqTable = {} + + /* + * Add items to the traditional one-at-a-time variant while concurrently + * building a frequency table to be used for the all-at-once variant. + */ + const topkOneAtATime = new TopK(k, errorRate, accuracy) + for (const item of lessThanOrEqualTestCaseItems) { + topkOneAtATime.add(item) + if (!Object.hasOwnProperty.call(freqTable, item)) { + freqTable[`${item}`] = 0 + } + ++freqTable[`${item}`] + } + + /* Ensure the built frequency table is correct. 
*/ + const expectedFreqTable = lessThanOrEqualTestCaseItems.reduce( + function (acc, curr) { + + if (!Object.hasOwnProperty.call(acc, curr)) { + acc[curr] = 1; + } else { + ++acc[curr]; + } + + return acc; + }, {}) + freqTable.should.to.deep.equal(expectedFreqTable); + + /* Build a version of TopK using the frequency as count */ + const topkAllAtOnce = new TopK(k, errorRate, accuracy) + for (const [item, freq] of Object.entries(freqTable)) { + topkAllAtOnce.add(item, freq) + } + + const topkOneAtATimeValues = topkOneAtATime.values() + const topkOneAtATimeKeys = topkOneAtATimeValues.map(({value}) => value) + const topkAllAtOnceValues = topkAllAtOnce.values() + const topkAllAtOnceKeys = topkAllAtOnceValues.map(({value}) => value) + + /* Make sure all expected lengths match */ + expectedTop.should.to.have.lengthOf(k) + topkOneAtATimeKeys.should.to.have.lengthOf(expectedTop.length) + topkAllAtOnceKeys.should.to.have.lengthOf(topkOneAtATimeKeys.length) + + /* Make sure all expected keys match */ + topkOneAtATimeKeys.should.to.deep.equal(expectedTop); + topkAllAtOnceKeys.should.to.deep.equal(topkOneAtATimeKeys); + + /* Make sure the objects themselves match */ + topkAllAtOnceValues.should.to.deep.equal(topkOneAtATimeValues) + }) + }) + describe('#values', () => { it('should produce valid TopK estimations when there are fewer than K items', () => { const topk = new TopK(10, 0.001, 0.999) From 39569b3cfd8c6660f58210a4710be651ff9d6438 Mon Sep 17 00:00:00 2001 From: "Jonah H. 
Harris" Date: Tue, 20 Apr 2021 13:26:25 -0400 Subject: [PATCH 2/4] Clean-up semicolons and access via destructuring --- test/topk-test.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/topk-test.js b/test/topk-test.js index 37cb8c3..cca962b 100644 --- a/test/topk-test.js +++ b/test/topk-test.js @@ -67,14 +67,14 @@ describe('TopK', () => { function (acc, curr) { if (!Object.hasOwnProperty.call(acc, curr)) { - acc[curr] = 1; + acc[`${curr}`] = 1 } else { - ++acc[curr]; + ++acc[`${curr}`] } - return acc; + return acc }, {}) - freqTable.should.to.deep.equal(expectedFreqTable); + freqTable.should.to.deep.equal(expectedFreqTable) /* Build a version of TopK using the frequency as count */ const topkAllAtOnce = new TopK(k, errorRate, accuracy) @@ -93,8 +93,8 @@ describe('TopK', () => { topkAllAtOnceKeys.should.to.have.lengthOf(topkOneAtATimeKeys.length) /* Make sure all expected keys match */ - topkOneAtATimeKeys.should.to.deep.equal(expectedTop); - topkAllAtOnceKeys.should.to.deep.equal(topkOneAtATimeKeys); + topkOneAtATimeKeys.should.to.deep.equal(expectedTop) + topkAllAtOnceKeys.should.to.deep.equal(topkOneAtATimeKeys) /* Make sure the objects themselves match */ topkAllAtOnceValues.should.to.deep.equal(topkOneAtATimeValues) From f5a9a9306b180da53972ae1c092dbb9515350bb4 Mon Sep 17 00:00:00 2001 From: "Jonah H. 
Harris" Date: Tue, 20 Apr 2021 13:28:22 -0400 Subject: [PATCH 3/4] Update version in package.json --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index b9181b1..fd342bb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "bloom-filters", - "version": "1.3.4", + "version": "1.3.5", "description": "JS implementation of probabilistic data structures: Bloom Filter (and its derived), HyperLogLog, Count-Min Sketch, Top-K and MinHash", "main": "dist/api.js", "scripts": { From 37b6f2a4498f65c605d558a49b16b67233d12346 Mon Sep 17 00:00:00 2001 From: "Jonah H. Harris" Date: Tue, 20 Apr 2021 13:43:13 -0400 Subject: [PATCH 4/4] Update README and ensure count > 0 --- README.md | 8 ++++++-- src/sketch/topk.ts | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 41f68a1..f0fb445 100644 --- a/README.md +++ b/README.md @@ -356,7 +356,7 @@ interface TopkElement { #### Methods -* `add(element: string) -> void`: add a new occurence of an element to the sketch. +* `add(element: string, count: number = 1) -> void`: add one or more new occurrences of an element to the sketch. * `values() -> Array`: get the top-k values as an array of objects. * `iterator() -> Iterator`: get the top-k values as an iterator that yields objects. 
@@ -366,11 +366,15 @@ const { TopK } = require('bloom-filters') // create a new TopK with k = 10, an error rate of 0.001 and an accuracy of 0.99 const topk = new TopK(10, 0.001, 0.99) -// push some occurrences in the multiset +// push occurrences one-at-a-time in the multiset topk.add('alice') topk.add('bob') topk.add('alice') +// or, equally, push multiple occurrences at-once in the multiset +// topk.add('alice', 2) +// topk.add('bob', 1) + // print the top k values for(let item of topk.values()) { console.log(`Item "${item.value}" is in position ${item.rank} with an estimated frequency of ${item.frequency}`) diff --git a/src/sketch/topk.ts b/src/sketch/topk.ts index 60f5e94..1759f71 100644 --- a/src/sketch/topk.ts +++ b/src/sketch/topk.ts @@ -176,6 +176,9 @@ export default class TopK extends BaseFilter { * @param element - Element to add */ add (element: string, count: number = 1): void { + if (0 >= count) { + throw (`count must be > 0 (was ${count})`) + } this._sketch.update(element, count) const frequency = this._sketch.count(element)