diff --git a/package.json b/package.json index d8bacb4..8b70e36 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "build": "tsc", "gts": "gts", "pretest": "yarn build", - "test": "mocha --parallel test/**/*-test.js --trace-deprecation --timeout=30000", + "test": "mocha --parallel test/**/*-test.js --trace-deprecation --timeout=60000", "doc": "typedoc --out docs/ --emit both --includeVersion src/api.ts", "lint": "gts lint", "clean": "gts clean", diff --git a/src/cuckoo/cuckoo-filter.ts b/src/cuckoo/cuckoo-filter.ts index 12439cf..af75372 100644 --- a/src/cuckoo/cuckoo-filter.ts +++ b/src/cuckoo/cuckoo-filter.ts @@ -46,7 +46,7 @@ import { */ function computeFingerpintLength(size: number, rate: number): number { const f = Math.ceil(Math.log2(1 / rate) + Math.log2(2 * size)) - return Math.ceil(f / 8) // because we use base 16 64-bits hashes + return Math.ceil(f / 8) // because we use 64-bits hashes } /** @@ -154,7 +154,6 @@ export default class CuckooFilter ): CuckooFilter { const fl = computeFingerpintLength(bucketSize, errorRate) const capacity = Math.ceil(size / bucketSize / 0.955) - // const capacity = utils.power2(items) return new CuckooFilter(capacity, fl, bucketSize, maxKicks) } @@ -370,7 +369,7 @@ export default class CuckooFilter * @private */ _locations(element: HashableInput) { - const hashes = hashIntAndString(element, this.seed, 16) + const hashes = hashIntAndString(element, this.seed) const hash = hashes.int if (this._fingerprintLength > hashes.string.length) { throw new Error( diff --git a/src/iblt/invertible-bloom-lookup-tables.ts b/src/iblt/invertible-bloom-lookup-tables.ts index a6cb44a..d95e165 100644 --- a/src/iblt/invertible-bloom-lookup-tables.ts +++ b/src/iblt/invertible-bloom-lookup-tables.ts @@ -28,7 +28,7 @@ import BaseFilter from '../base-filter' import WritableFilter from '../interfaces/writable-filter' import Cell from './cell' import {AutoExportable, Field, Parameter} from '../exportable' -import {allInOneHashTwice, allocateArray, getDistinctIndexes} from '../utils' +import {allocateArray, getDistinctIndexes, hashTwiceAsString} from '../utils' import {optimalFilterSize, optimalHashes} from '../formulas' /** @@ -169,18 +169,18 @@ export default class InvertibleBloomFilter * @param element - The element to insert */ add(element: Buffer): void { - const hashes = allInOneHashTwice( + const hashes = hashTwiceAsString( JSON.stringify(element.toJSON()), this.seed ) const indexes = getDistinctIndexes( - hashes.string.first, + hashes.first, this._size, this._hashCount, this.seed ) for (let i = 0; i < this._hashCount; ++i) { - this._elements[indexes[i]].add(element, Buffer.from(hashes.string.first)) + this._elements[indexes[i]].add(element, Buffer.from(hashes.first)) } } @@ -190,19 +190,19 @@ export default class InvertibleBloomFilter * @return True if the element has been removed, False otheriwse */ remove(element: Buffer): boolean { - const hashes = allInOneHashTwice( + const hashes = hashTwiceAsString( JSON.stringify(element.toJSON()), this.seed ) const indexes = getDistinctIndexes( - hashes.string.first, - this.size, + hashes.first, + this._size, this._hashCount, this.seed ) for (let i = 0; i < this._hashCount; ++i) { this._elements[indexes[i]] = this._elements[indexes[i]].xorm( - new Cell(Buffer.from(element), Buffer.from(hashes.string.first), 1) + new Cell(Buffer.from(element), Buffer.from(hashes.first), 1) ) } return true @@ -214,13 +214,13 @@ export default class InvertibleBloomFilter * @return False if the element is not in the filter, true if "may be" in the filter. */ has(element: Buffer): boolean { - const hashes = allInOneHashTwice( + const hashes = hashTwiceAsString( JSON.stringify(element.toJSON()), this.seed ) const indexes = getDistinctIndexes( - hashes.string.first, - this.size, + hashes.first, + this._size, this._hashCount, this.seed ) @@ -337,16 +337,16 @@ export default class InvertibleBloomFilter } else { throw new Error('Please report, not possible') } - const hashes = allInOneHashTwice(JSON.stringify(id.toJSON()), this.seed) + const hashes = hashTwiceAsString(JSON.stringify(id.toJSON()), this.seed) const indexes = getDistinctIndexes( - hashes.string.first, + hashes.first, this._size, this._hashCount, this.seed ) for (let i = 0; i < indexes.length; ++i) { this._elements[indexes[i]] = this._elements[indexes[i]].xorm( - new Cell(id, Buffer.from(hashes.string.first), c) + new Cell(id, Buffer.from(hashes.first), c) ) if (this._elements[indexes[i]].isPure()) { pureList.push(indexes[i]) diff --git a/src/utils.ts b/src/utils.ts index 97e6a79..f5176ff 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -44,8 +44,57 @@ export interface TwoHashes { second: number } +/** + * Templated TwoHashes type + */ +export interface TwoHashesTemplated { + first: T + second: T +} + +/** + * TwoHashes type in number and int format + */ +export interface TwoHashesIntAndString { + int: TwoHashesTemplated + string: TwoHashesTemplated +} + export type HashableInput = string | ArrayBuffer | Buffer +/** + * Internal variable for switching XXH hash function from/to 32/64 bits type. + */ +const serialize_function: XXH.HashInterface = switchSerializationType(64) + +/** + * Allow to switch the hash between 32 or 64 bits + * @param base 32 or 64 + * @returns + */ +export function switchSerializationType(base = 64): XXH.HashInterface { + switch (base) { + case 64: + return XXH.h64 + case 32: + return XXH.h32 + } + return XXH.h64 +} + +/** + * Hash an element into a 64 bits Number + * @param element + * @param seed + * @returns + */ +function serialize(element: HashableInput, seed?: number) { + if (!seed) { + seed = getDefaultSeed() + } + return Number(serialize_function(element, seed).toNumber()) +} + /** * Create a new array fill with a base value * @param size - The size of the array @@ -68,6 +117,19 @@ export function allocateArray( return array } +/** + * Return a number to its Hex format by padding zeroes if length mod 4 != 0 + * @param elem the element to transform in HEX + * @returns the HEX number padded of zeroes + */ +function numberToHex(elem: number): string { + let e = Number(elem).toString(16) + if (e.length % 4 !== 0) { + e = '0'.repeat(4 - (e.length % 4)) + e + } + return e +} + /** * (64-bits only) Hash a value into two values (in hex or integer format) * @param value - The value to hash @@ -77,53 +139,30 @@ export function allocateArray( * @memberof Utils * @author Arnaud Grall & Thomas Minier */ -export function hashTwice( - value: HashableInput, - asInt?: boolean, - seed?: number -): TwoHashes { - if (asInt === undefined) { - asInt = false - } +export function hashTwice(value: HashableInput, seed?: number): TwoHashes { if (seed === undefined) { seed = getDefaultSeed() } - const f = XXH.h64(value, seed + 1) - const l = XXH.h64(value, seed + 2) - if (asInt) { - return { - first: f.toNumber(), - second: l.toNumber(), - } - } else { - let one = f.toString(16) - if (one.length < 16) { - one = '0'.repeat(16 - one.length) + one - } - let two = l.toString(16) - if (two.length < 16) { - two = '0'.repeat(16 - two.length) + two - } - return { - first: Number(one), - second: Number(two), - } + return { + first: serialize(value, seed + 1), + second: serialize(value, seed + 2), } } -export function hashTwiceAsString(value: HashableInput, seed?: number) { - if (seed === undefined) { - seed = getDefaultSeed() - } - const f = XXH.h64(value, seed + 1) - const l = XXH.h64(value, seed + 2) - let one = f.toString(16) - if (one.length < 16) one = '0'.repeat(16 - one.length) + one - let two = l.toString(16) - if (two.length < 16) two = '0'.repeat(16 - two.length) + two +/** + * Hash twice an element into their HEX string representations + * @param value + * @param seed + * @returns TwoHashesTemplated + */ +export function hashTwiceAsString( + value: HashableInput, + seed?: number +): TwoHashesTemplated { + const {first, second} = hashTwice(value, seed) return { - first: one, - second: two, + first: numberToHex(first), + second: numberToHex(second), } } @@ -131,39 +170,62 @@ export function hashTwiceAsString(value: HashableInput, seed?: number) { * (64-bits only) Same as hashTwice but return Numbers and String equivalent * @param val the value to hash * @param seed the seed to change when hashing - * @return A object of shape {int: {first: , second: }, string: {first: , second: } + * @return TwoHashesIntAndString * @author Arnaud Grall */ -export function allInOneHashTwice(val: HashableInput, seed?: number) { +export function HashTwiceIntAndString( + val: HashableInput, + seed?: number +): TwoHashesIntAndString { if (seed === undefined) { seed = getDefaultSeed() } - const one = XXH.h64(val, seed + 1) - const two = XXH.h64(val, seed + 2) - let stringOne = one.toString(16) - if (stringOne.length < 16) - stringOne = '0'.repeat(16 - stringOne.length) + stringOne - let stringTwo = two.toString(16) - if (stringTwo.length < 16) - stringTwo = '0'.repeat(16 - stringTwo.length) + stringTwo - + const one = hashIntAndString(val, seed + 1) + const two = hashIntAndString(val, seed + 2) return { int: { - first: one.toNumber(), - second: two.toNumber(), + first: one.int, + second: two.int, }, string: { - first: stringOne, - second: stringTwo, + first: one.string, + second: two.string, }, } } +/** + * Hash an item as an unsigned int + * @param elem - Element to hash + * @param seed - The hash seed. If its is UINT32 make sure to set the length to 32 + * @param length - The length of hashes (defaults to 32 bits) + * @return The hash value as an unsigned int + * @author Arnaud Grall + */ +export function hashAsInt(elem: HashableInput, seed?: number): number { + if (seed === undefined) { + seed = getDefaultSeed() + } + return serialize(elem, seed) +} + +/** + * Hash an item and return its number and HEX string representation + * @param elem - Element to hash + * @param seed - The hash seed. If its is UINT32 make sure to set the length to 32 + * @param base - The base in which the string will be returned, default: 16 + * @param length - The length of hashes (defaults to 32 bits) + * @return The item hased as an int and a string + * @author Arnaud Grall + */ +export function hashIntAndString(elem: HashableInput, seed?: number) { + const hash = hashAsInt(elem, seed) + return {int: hash, string: numberToHex(hash)} +} + /** * Apply enhanced Double Hashing to produce a n-hash - * Originally, this implementation used directly the value produced by the two hash functions instead of the functions themselves. - * @see {@link http://citeseer.ist.psu.edu/viewdoc/download;jsessionid=4060353E67A356EF9528D2C57C064F5A?doi=10.1.1.152.579&rep=rep1&type=pdf} for more details about double hashing. - * A enhanced version (currently used) is available at: http://peterd.org/pcd-diss.pdf s6.5.4 + * @see {@link http://peterd.org/pcd-diss.pdf} s6.5.4 * @param n - The indice of the hash function we want to produce * @param hashA - The result of the first hash function applied to a value. * @param hashB - The result of the second hash function applied to a value. @@ -194,8 +256,9 @@ export function doubleHashing( * @param size - the range on which we can generate an index [0, size) = size * @param number - The number of indexes desired * @param seed - The seed used - * @return A array of indexes + * @return Array * @author Arnaud Grall + * @author Simon Woolf (SimonWoolf) */ export function getDistinctIndexes( element: HashableInput, @@ -208,20 +271,25 @@ export function getDistinctIndexes( } let n = 0 const indexes: Set = new Set() - let hashes = hashTwice(element, true, seed) - let cycle = 0 + let hashes = hashTwice(element, seed) + // let cycle = 0 while (indexes.size < number) { - const ind = doubleHashing(n, hashes.first, hashes.second, size) + const ind = hashes.first % size if (!indexes.has(ind)) { indexes.add(ind) - } else { - if (cycle > number) { - cycle = 0 - hashes = hashTwice(element, true, seed + n) - } - cycle++ } + hashes.first = (hashes.first + hashes.second) % size + hashes.second = (hashes.second + n) % size n++ + + if (n > size) { + // Enhanced double hashing stops cycles of length less than `size` in the case where + // size is coprime with the second hash. But you still get cycles of length `size`. + // So if we reach there and haven't finished, append a prime to the input and + // rehash. + seed++ + hashes = hashTwice(element, seed) + } } return [...indexes.values()] } @@ -246,7 +314,7 @@ export function getIndexes( seed = getDefaultSeed() } const arr = [] - const hashes = hashTwice(element, true, seed) + const hashes = hashTwice(element, seed) for (let i = 0; i < hashCount; i++) { arr.push(doubleHashing(i, hashes.first, hashes.second, size)) } @@ -303,8 +371,7 @@ export function xorBuffer(a: Buffer, b: Buffer): Buffer { start++ value = it.next() } - const buf2 = buffer.slice(start) - return buf2 + return buffer.slice(start) } /** @@ -323,81 +390,11 @@ export function isEmptyBuffer(buffer: Buffer | null): boolean { return true } -/** - * Hash an item as an unsigned int - * @param elem - Element to hash - * @param seed - The hash seed. If its is UINT32 make sure to set the length to 32 - * @param length - The length of hashes (defaults to 32 bits) - * @return The hash value as an unsigned int - * @author Arnaud Grall - */ -export function hashAsInt(elem: HashableInput, seed?: number): number { - if (seed === undefined) { - seed = getDefaultSeed() - } - return XXH.h64(elem, seed).toNumber() -} - -/** - * Hash an item and return its number and string (b16) representation - * @param elem - Element to hash - * @param seed - The hash seed. If its is UINT32 make sure to set the length to 32 - * @param base - The base in which the string will be returned, default: 16 - * @param length - The length of hashes (defaults to 32 bits) - * @return The item hased as an int and a string - * @author Arnaud Grall - */ -export function hashIntAndString( - elem: HashableInput, - seed?: number, - base?: number -) { - if (seed === undefined) { - seed = getDefaultSeed() - } - if (base === undefined) { - base = 16 - } - const hash = XXH.h64(elem, seed) - const plat = 64 - let result = '' - if (base === 16) { - result = hash.toString(base) - if (result.length < plat / 4) { - result = '0'.repeat(plat / 4 - result.length) + result - } - } else if (base === 2) { - result = hex2bin(hash.toString(16)) - if (result.length < plat) { - result = '0'.repeat(plat - result.length) + result - } - } - return {int: hash.toNumber(), string: result} -} - /** * Return the default seed used in the package - * @return A ssed as a floating point number + * @return A seed as a floating point number * @author Arnaud Grall */ export function getDefaultSeed(): number { return 0x1234567890 } - -/** - * Return the next power of 2 of x - * @param x - Value - * @return The next power of 2 of x - */ -export function power2(x: number): number { - return Math.ceil(Math.pow(2, Math.floor(Math.log(x) / Math.log(2)))) -} - -/** - * Convert an hex string into a binary string - * @param hex - A base 16 string - * @return A base 2 string - */ -export function hex2bin(hex: string): string { - return parseInt(hex, 16).toString(2) -} diff --git a/test/count-min-sketch-test.js b/test/count-min-sketch-test.js index a05d603..50f1c78 100644 --- a/test/count-min-sketch-test.js +++ b/test/count-min-sketch-test.js @@ -25,7 +25,9 @@ SOFTWARE. 'use strict' require('chai').should() +const { utils } = require('mocha') const {CountMinSketch} = require('../dist/api.js') +const butils = require('../dist/utils.js') describe('CountMinSketch', () => { const delta = 0.999 @@ -131,11 +133,12 @@ describe('CountMinSketch', () => { }) }) }) - describe.skip('Performance test', () => { + describe('Performance test', () => { + butils.switchSerializationType(32) // setup an finite stream of 100 000 elements between [0; 1000) - const max = 1000000 + const max = 100000 const rate = 0.00001 - const range = 10000 + const range = 1000 const random = () => { return Math.floor(Math.random() * range) } @@ -173,5 +176,6 @@ describe('CountMinSketch', () => { const errorProb = 1 - Math.pow(Math.E, -filter.rows) errorRate.should.be.at.most(errorProb) }) + butils.switchSerializationType(64) }) }) diff --git a/test/cuckoo-filter-test.js b/test/cuckoo-filter-test.js index 17a4699..a9e7680 100644 --- a/test/cuckoo-filter-test.js +++ b/test/cuckoo-filter-test.js @@ -35,13 +35,13 @@ describe('CuckooFilter', () => { it('should compute the fingerprint and indexes for an element', () => { const filter = new CuckooFilter(15, 3, 2, 1) const element = 'foo' - const hashes = utils.hashIntAndString(element, filter.seed, 16, 32) + const hashes = utils.hashIntAndString(element, filter.seed, 16) const hash = hashes.int const fingerprint = hashes.string.substring(0, 3) const firstIndex = Math.abs(hash) const secondIndex = Math.abs( - firstIndex ^ Math.abs(utils.hashAsInt(fingerprint, filter.seed, 32)) + firstIndex ^ Math.abs(utils.hashAsInt(fingerprint, filter.seed)) ) const locations = filter._locations(element) diff --git a/test/hyperloglog-test.js b/test/hyperloglog-test.js index efe6605..15a0146 100644 --- a/test/hyperloglog-test.js +++ b/test/hyperloglog-test.js @@ -25,20 +25,23 @@ SOFTWARE. 'use strict' require('chai').should() -const {HyperLogLog} = require('../dist/api.js') +const { HyperLogLog } = require('../dist/api.js') +const utils = require('../dist/utils.js') describe('HyperLogLog', () => { describe('#update', () => { it('should support update and cardinality estimations (count) operations', () => { + utils.switchSerializationType(32) const nbDistinct = 100 const sketch = new HyperLogLog(110) // populate the sketch with some values - for (let i = 0; i < 10e5; i++) { + for (let i = 0; i < 10e3; i++) { sketch.update(`${i % nbDistinct}`) } sketch .count(true) .should.be.closeTo(nbDistinct, nbDistinct * sketch.accuracy()) + utils.switchSerializationType(64) }) }) diff --git a/test/iblt-test.js b/test/iblt-test.js index ba4235e..5e7ffa0 100644 --- a/test/iblt-test.js +++ b/test/iblt-test.js @@ -29,6 +29,8 @@ require('chai').expect() const {InvertibleBloomFilter} = require('../dist/api.js') const random = require('random') const seedrandom = require('seedrandom') +const utils = require('../dist/utils.js') +utils.switchSerializationType(32) describe('Invertible Bloom Lookup Tables', () => { const keys = 1000 @@ -109,7 +111,6 @@ describe('Invertible Bloom Lookup Tables', () => { output.push(elt.value) elt = iterator.next() } - elt.value.should.equal(true) output.length.should.equals(toInsert.length) output.sort().should.eqls(toInsert.sort()) }) @@ -128,7 +129,6 @@ describe('Invertible Bloom Lookup Tables', () => { output.push(elt.value) elt = iterator.next() } - elt.value.should.equal(true) output.length.should.equals(toInsert.length) output.sort().should.eqls(toInsert.sort()) }) @@ -196,7 +196,7 @@ describe('Invertible Bloom Lookup Tables', () => { describe(`Set differences of [10 to ${d}] with ${keys} keys, ${hashCount} hash functions, [alpha = ${alpha}, d = ${d}]=${ alpha * d - } cells`, () => { + } cells`, () => { for (let i = step; i <= d; i += step) { it( 'should decodes correctly element for a set difference of ' + i, diff --git a/test/utils-test.js b/test/utils-test.js index 92f7797..af1c7d1 100644 --- a/test/utils-test.js +++ b/test/utils-test.js @@ -30,7 +30,6 @@ const {BloomFilter} = require('../dist/api.js') const XXH = require('xxhashjs') const {range} = require('lodash') const seed = utils.getDefaultSeed() -const assert = require('assert') describe('Utils', () => { describe('#allocateArray', () => { @@ -56,7 +55,7 @@ describe('Utils', () => { values.forEach(n => { utils .doubleHashing(n, hashA, hashB, size) - .should.equal((hashA + n * hashB + ((n**3 - n)/6)) % size) + .should.equal((hashA + n * hashB + (n ** 3 - n) / 6) % size) }) }) }) @@ -145,9 +144,10 @@ describe('Utils', () => { }) describe('#getDistinctIndexes', () => { + utils.switchSerializationType(32) // switch to 32 for faster execution const key = - 'da5e21f8a67c4163f1a53ef43515bd027967da305ecfc741b2c3f40f832b7f82' - const desiredIndices = 1000 + 'da5e21f8a67c4163f1a53ef43515bd027967da305ecfc741b2c3f40f832b7f82' + const desiredIndices = 10000 const result = range(0, desiredIndices, 1) it(`should return ${desiredIndices} distinct indices on the interval [0, ${desiredIndices})`, () => { try { @@ -156,10 +156,17 @@ describe('Utils', () => { .getDistinctIndexes(key, desiredIndices, desiredIndices) .sort((a, b) => a - b) indices.should.deep.equal(result) - console.log(`Generated ${indices.length} distinct indices on the interval [0, ${desiredIndices}) in ${new Date().getTime() - start} ms`) + console.log( + `Generated ${ + indices.length + } distinct indices on the interval [0, ${desiredIndices}) in ${ + new Date().getTime() - start + } ms` + ) } catch (e) { throw Error('it should not throw: ' + e) } + utils.switchSerializationType(64) // switch back to 64 }) it('should the issue be fixed', () => { try {