diff --git a/hash_chunker/hash_chunker.py b/hash_chunker/hash_chunker.py index 93b6a67..5c00419 100644 --- a/hash_chunker/hash_chunker.py +++ b/hash_chunker/hash_chunker.py @@ -1,38 +1,26 @@ """Hash Chunker helper to provide hash ranges for distributed data processing.""" import math -from typing import List, Tuple +from dataclasses import dataclass +from typing import Tuple, Generator +@dataclass class HashChunker(object): """Main module class.""" - hex_base = 16 - hex_zero = "0" - hex_f = "f" - hex_format = "x" - - def __init__( - self, - chunk_hash_length: int = 10, - hash_ranges_accuracy: int = 5, - hash_max_length: int = 32, - ): - """ - Init HashChunker. - - :param chunk_hash_length: length of results chunks hashes - :param hash_ranges_accuracy: accuracy for chunks calculation - :param hash_max_length: hash string length in data - """ - self.chunk_hash_length = chunk_hash_length - self.hash_ranges_accuracy = hash_ranges_accuracy - self.hash_max_length = hash_max_length + chunk_hash_length: int = 10 + hash_ranges_accuracy: int = 5 + hash_max_length: int = 32 + hex_base: int = 16 + hex_zero: str = "0" + hex_f: str = "f" + hex_format: str = "x" def get_chunks( self, chunk_size: int, all_items_count: int, - ) -> List[Tuple[str, str]]: + ) -> Generator[Tuple[str, str], None, None]: """ Return hash ranges. 
@@ -41,14 +29,14 @@ def get_chunks( - :return: list of chunks + :return: generator of chunks """ if all_items_count == 0 or chunk_size == 0: - return [] + return ( all_items_count, chunk_size, current_position, previous_position, ) = self._get_positions(all_items_count, chunk_size) - return self._add_ranges( + yield from self._add_ranges( all_items_count, chunk_size, current_position, @@ -61,32 +49,23 @@ def _add_ranges( batch: int, current_position: int, previous_position: int, - ) -> List[Tuple[str, str]]: - ranges = [] + ) -> Generator[Tuple[str, str], None, None]: while current_position < all_items_count: start = self._position_to_hex(previous_position) stop = self._position_to_hex(current_position) - ranges.append( - ( - start[: self.chunk_hash_length], - stop[: self.chunk_hash_length], - ), - ) + yield start, stop previous_position = current_position current_position += batch start = self._position_to_hex(previous_position) - stop = self.hex_f * self.hash_max_length - ranges.append( - (start[: self.chunk_hash_length], stop[: self.chunk_hash_length]), - ) - return ranges + stop = self.hex_f * self.chunk_hash_length + yield start, stop def _get_positions( self, all_items_count: int, batch_limit: int, ) -> Tuple[int, int, int, int]: - scale = self.hex_base**self.hash_ranges_accuracy / all_items_count + scale = self.hex_base ** self.hash_ranges_accuracy / all_items_count batch_limit = math.ceil(batch_limit * scale) all_items_count *= scale previous_position = 0 @@ -104,4 +83,4 @@ def _position_to_hex(self, position: int) -> str: zeros_count = self.hash_ranges_accuracy - len(hexed) hexed = self.hex_zero * zeros_count + hexed hexed += self.hex_zero * (self.hash_max_length - len(hexed)) - return hexed + return hexed[: self.chunk_hash_length] diff --git a/pyproject.toml b/pyproject.toml index a6a5a1a..f697c7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,8 +2,8 @@ name = "hash_chunker" homepage = "https://github.com/whysage/hash_chunker" repository = 
"https://github.com/whysage/hash_chunker" -version = "0.1.0" -description = "Helper that generates hash chunks for distributed data processing." +version = "0.1.1" +description = "Generator that yields hash chunks for distributed data processing." authors = ["Volodymyr Kochetkov "] license = "MIT" readme = "README.md" diff --git a/tests/test_hash_chunker.py b/tests/test_hash_chunker.py index 2aa3b26..a4014ce 100644 --- a/tests/test_hash_chunker.py +++ b/tests/test_hash_chunker.py @@ -24,7 +24,7 @@ def test_default_usage( :param all_items_count: count aff all data elements :param expected: expected chunks """ - assert HashChunker().get_chunks(chunk_size, all_items_count) == expected + assert list(HashChunker().get_chunks(chunk_size, all_items_count)) == expected @pytest.mark.parametrize( @@ -33,14 +33,14 @@ def test_default_usage( (1, 2, 5, [("00000", "80000"), ("80000", "fffff")]), ], ) -def test_hash_length( +def test_chunk_hash_length( chunk_size: int, all_items_count: int, chunk_hash_length: int, expected: List[Tuple[str, str]], ) -> None: """ - Simple test. + Test chunk_hash_length option. :param chunk_size: chunk elements limit :param all_items_count: count aff all data elements @@ -48,4 +48,4 @@ def test_hash_length( :param expected: expected chunks """ hash_chunker = HashChunker(chunk_hash_length=chunk_hash_length) - assert hash_chunker.get_chunks(chunk_size, all_items_count) == expected + assert list(hash_chunker.get_chunks(chunk_size, all_items_count)) == expected