diff --git a/.travis.yml b/.travis.yml index 2153e12..59ba7df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ python: - "3.4" - "3.5" - "3.6" - - "3.7" + - "3.7.13" install: - pip install -r requirements.txt diff --git a/bounter/count_min_sketch.py b/bounter/count_min_sketch.py index 6641c32..307ab1b 100644 --- a/bounter/count_min_sketch.py +++ b/bounter/count_min_sketch.py @@ -7,9 +7,15 @@ # This code is distributed under the terms and conditions # from the MIT License (MIT). +import enum import bounter_cmsc as cmsc +class CellSize(enum.Enum): + BITS_32 = 32 + BITS_64 = 64 + + class CountMinSketch(object): """ Data structure used to estimate frequencies of elements in massive data sets with fixed memory footprint. @@ -28,9 +34,9 @@ class CountMinSketch(object): To calculate memory footprint: ( width * depth * cell_size ) + HLL size Cell size is - - 4B for default counting - - 2B for log1024 counting - - 1B for log8 counting + - 4B for default counting + - 2B for log1024 counting + - 1B for log8 counting HLL size is 64 KB Memory usage example: width 2^25 (33 554 432), depth 8, log1024 (2B) has 2^(25 + 3 + 1) + 64 KB = 512.06 MB @@ -47,7 +53,14 @@ class CountMinSketch(object): counting as the collision bias will already be minimal. """ - def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): + def __init__( + self, + size_mb=64, + width=None, + depth=None, + log_counting=None, + cell_size=CellSize.BITS_32, + ): """ Initialize the Count-Min Sketch structure with the given parameters @@ -64,33 +77,40 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): The more, the better, should be very large, preferably in the same order of magnitude as the cardinality of the counted set. log_counting (int): Use logarithmic approximate counter value for reduced bucket size: - - None (default): 4B, no counter error + - None (default): 4B or 8B according to `cell_size`, no counter error - 1024: 2B, value approximation error ~2% for values larger than 2048 - 8: 1B, value approximation error ~30% for values larger than 16 + cell_size (CellSize): Size of the cells when `log_counting` is None. """ - cell_size = CountMinSketch.cell_size(log_counting) - self.cell_size_v = cell_size + cell_bytes = CountMinSketch.cell_size(cell_size, log_counting) + self.cell_size_v = cell_bytes if size_mb is None or not isinstance(size_mb, int): - raise ValueError("size_mb must be an integer representing the maximum size of the structure in MB") + raise ValueError( + "size_mb must be an integer representing the maximum size of the structure in MB" + ) if width is None and depth is None: - self.width = 1 << (size_mb * (2 ** 20) // (cell_size * 8 * 2)).bit_length() - self.depth = (size_mb * (2 ** 20)) // (self.width * cell_size) + self.width = 1 << (size_mb * (2**20) // (cell_bytes * 8 * 2)).bit_length() + self.depth = (size_mb * (2**20)) // (self.width * cell_bytes) elif width is None: self.depth = depth - avail_width = (size_mb * (2 ** 20)) // (depth * cell_size) + avail_width = (size_mb * (2**20)) // (depth * cell_bytes) self.width = 1 << (avail_width.bit_length() - 1) if not self.width: - raise ValueError("Requested depth is too large for maximum memory size.") + raise ValueError( + "Requested depth is too large for maximum memory size." + ) elif depth is None: if width != 1 << (width.bit_length() - 1): raise ValueError("Requested width must be a power of 2.") self.width = width - self.depth = (size_mb * (2 ** 20)) // (width * cell_size) + self.depth = (size_mb * (2**20)) // (width * cell_bytes) if not self.depth: - raise ValueError("Requested width is too large for maximum memory size.") + raise ValueError( + "Requested width is too large for maximum memory size." + ) else: if width != 1 << (width.bit_length() - 1): raise ValueError("Requested width must be a power of 2.") @@ -102,20 +122,38 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): elif log_counting == 1024: self.cms = cmsc.CMS_Log1024(width=self.width, depth=self.depth) elif log_counting is None: - self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth) + if cell_size == CellSize.BITS_32: + self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth) + elif cell_size == CellSize.BITS_64: + self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth) + else: + raise ValueError( + "Unsupported parameter cell_size=%s. Use CellSize.BITS_32 or CellSize.BITS_64." + % (cell_size) + ) else: - raise ValueError("Unsupported parameter log_counting=%s. Use None, 8, or 1024." % log_counting) + raise ValueError( + "Unsupported parameter log_counting=%s. Use None, 8, or 1024." + % (log_counting) + ) # optimize calls by directly binding to C implementation self.increment = self.cms.increment @staticmethod - def cell_size(log_counting=None): + def cell_size(cell_size, log_counting=None): if log_counting == 8: return 1 if log_counting == 1024: return 2 - return 4 + if log_counting is None: + if cell_size == CellSize.BITS_32: + return 4 + if cell_size == CellSize.BITS_64: + return 8 + raise ValueError( + "cell_size must be one of [BITS32, BITS64] and log_counting one of [None, 8, 1024]" + ) @staticmethod def table_size(width, depth=4, log_counting=None): @@ -192,4 +230,6 @@ def __init__(self): super(CardinalityEstimator, self).__init__(width=1, depth=1) def __getitem__(self, key): - raise NotImplementedError("Individual item counting is not supported for cardinality estimator!") + raise NotImplementedError( + "Individual item counting is not supported for cardinality estimator!" + ) diff --git a/cbounter/cms64_conservative.h b/cbounter/cms64_conservative.h new file mode 100644 index 0000000..5447cdf --- /dev/null +++ b/cbounter/cms64_conservative.h @@ -0,0 +1,36 @@ +//----------------------------------------------------------------------------- +// Author: Josep Pon Farreny +// Copyright (C) 2017 Rare Technologies +// +// This code is distributed under the terms and conditions +// from the MIT License (MIT). + +#ifndef _CMS64_CONSERVATIE_H_ +#define _CMS64_CONSERVATIE_H_ + +#define CMS_TYPE CMS64_Conservative +#define CMS_TYPE_STRING "CMS64_Conservative" +#define CMS_CELL_TYPE uint64_t + +#include "cms_common.h" + +static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) +{ + return 1; +} + +static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) +{ + return value; +} + +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +{ + return v1 + v2; +} + +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE + +#endif /* _CMS64_CONSERVATIE_H_ */ \ No newline at end of file diff --git a/cbounter/cms_cmodule.c b/cbounter/cms_cmodule.c index db1ef7a..ba39123 100644 --- a/cbounter/cms_cmodule.c +++ b/cbounter/cms_cmodule.c @@ -18,7 +18,8 @@ static inline uint32_t rand_32b() return r; } -#include "cms_conservative.c" +#include "cms_conservative.h" +#include "cms64_conservative.h" #include "cms_log8.c" #include "cms_log1024.c" #include @@ -50,6 +51,7 @@ PyMODINIT_FUNC initbounter_cmsc(void) { PyObject* m; if (PyType_Ready(&CMS_ConservativeType) < 0 + || PyType_Ready(&CMS64_ConservativeType) < 0 || PyType_Ready(&CMS_Log8Type) < 0 || PyType_Ready(&CMS_Log1024Type) < 0) { @@ -76,6 +78,9 @@ PyMODINIT_FUNC initbounter_cmsc(void) Py_INCREF(&CMS_ConservativeType); PyModule_AddObject(m, "CMS_Conservative", (PyObject *)&CMS_ConservativeType); + Py_INCREF(&CMS64_ConservativeType); + PyModule_AddObject(m, "CMS64_Conservative", (PyObject *)&CMS64_ConservativeType); + srand(time(NULL)); Py_INCREF(&CMS_Log8Type); diff --git a/cbounter/cms_common.c b/cbounter/cms_common.h similarity index 99% rename from cbounter/cms_common.c rename to cbounter/cms_common.h index e58d757..2ed9f71 100644 --- a/cbounter/cms_common.c +++ b/cbounter/cms_common.h @@ -136,8 +136,9 @@ CMS_VARIANT(_increment_obj)(CMS_TYPE *self, char *data, Py_ssize_t dataLength, l } CMS_CELL_TYPE result = min_value; - for (; increment > 0; increment--) + for (; increment > 0; increment--) { result += CMS_VARIANT(should_inc)(result); + } if (result > min_value) { diff --git a/cbounter/cms_conservative.c b/cbounter/cms_conservative.h similarity index 66% rename from cbounter/cms_conservative.c rename to cbounter/cms_conservative.h index d2a2253..de21232 100644 --- a/cbounter/cms_conservative.c +++ b/cbounter/cms_conservative.h @@ -5,11 +5,14 @@ // This code is distributed under the terms and conditions // from the MIT License (MIT). +#ifndef _CMS_CONSERVATIE_H_ +#define _CMS_CONSERVATIE_H_ + #define CMS_TYPE CMS_Conservative #define CMS_TYPE_STRING "CMS_Conservative" #define CMS_CELL_TYPE uint32_t -#include "cms_common.c" +#include "cms_common.h" static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { @@ -21,7 +24,13 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) return value; } -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { return v1 + v2; } + +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE + +#endif /* _CMS_CONSERVATIE_H_ */ \ No newline at end of file diff --git a/cbounter/cms_log1024.c b/cbounter/cms_log1024.c index e6f7929..d4e456f 100644 --- a/cbounter/cms_log1024.c +++ b/cbounter/cms_log1024.c @@ -9,7 +9,7 @@ #define CMS_TYPE_STRING "CMS_Log1024" #define CMS_CELL_TYPE uint16_t -#include "cms_common.c" +#include "cms_common.h" static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { @@ -17,7 +17,8 @@ static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { uint8_t shift = 33 - (value >> 10); uint32_t mask = 0xFFFFFFFF >> shift; - if (mask & rand_32b()) return 0; + if (mask & rand_32b()) + return 0; } return 1; } @@ -30,7 +31,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) return (1024 + (value & 1023)) << ((value >> 10) - 1); } -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { long long decoded = CMS_VARIANT(decode)(v1); decoded += CMS_VARIANT(decode)(v2); @@ -55,8 +56,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL uint32_t mask = 0xFFFFFFFF >> shift; uint32_t r; - MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r); + MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r); uint32_t remainder = mask & decoded; return (log_result << 10) + (h & 1023) + ((mask & r) < remainder); } + +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE \ No newline at end of file diff --git a/cbounter/cms_log8.c b/cbounter/cms_log8.c index 33c47fa..be9d247 100644 --- a/cbounter/cms_log8.c +++ b/cbounter/cms_log8.c @@ -9,7 +9,7 @@ #define CMS_TYPE_STRING "CMS_Log8" #define CMS_CELL_TYPE uint8_t -#include "cms_common.c" +#include "cms_common.h" static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { @@ -17,7 +17,8 @@ static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { uint8_t shift = 33 - (value >> 3); uint32_t mask = 0xFFFFFFFF >> shift; - if (mask & rand_32b()) return 0; + if (mask & rand_32b()) + return 0; } return 1; } @@ -32,7 +33,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) #include -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { long long decoded = CMS_VARIANT(decode)(v1); decoded += CMS_VARIANT(decode)(v2); @@ -57,8 +58,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL uint32_t mask = 0xFFFFFFFF >> shift; uint32_t r; - MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r); + MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r); uint32_t remainder = mask & decoded; return (log_result << 3) + (h & 7) + ((mask & r) < remainder); } + +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE \ No newline at end of file