Skip to content

Commit

Permalink
Update logic to generator and add dataclass
Browse files Browse the repository at this point in the history
  • Loading branch information
whysage committed Jul 28, 2022
1 parent 444f536 commit 7af40b2
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 46 deletions.
59 changes: 19 additions & 40 deletions hash_chunker/hash_chunker.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,26 @@
"""Hash Chunker helper to provide hash ranges for distributed data processing."""
import math
from typing import List, Tuple
from dataclasses import dataclass
from typing import Tuple, Generator


@dataclass
class HashChunker(object):
"""Main module class."""

hex_base = 16
hex_zero = "0"
hex_f = "f"
hex_format = "x"

def __init__(
self,
chunk_hash_length: int = 10,
hash_ranges_accuracy: int = 5,
hash_max_length: int = 32,
):
"""
Init HashChunker.
:param chunk_hash_length: length of results chunks hashes
:param hash_ranges_accuracy: accuracy for chunks calculation
:param hash_max_length: hash string length in data
"""
self.chunk_hash_length = chunk_hash_length
self.hash_ranges_accuracy = hash_ranges_accuracy
self.hash_max_length = hash_max_length
chunk_hash_length: int = 10
hash_ranges_accuracy: int = 5
hash_max_length: int = 32
hex_base: int = 16
hex_zero: str = "0"
hex_f: str = "f"
hex_format: str = "x"

def get_chunks(
self,
chunk_size: int,
all_items_count: int,
) -> List[Tuple[str, str]]:
) -> Generator[Tuple[str, str], None, None]:
"""
Return hash ranges.
Expand All @@ -41,14 +29,14 @@ def get_chunks(
:return: generator yielding (start, stop) hash range tuples
"""
if all_items_count == 0 or chunk_size == 0:
return []
return
(
all_items_count,
chunk_size,
current_position,
previous_position,
) = self._get_positions(all_items_count, chunk_size)
return self._add_ranges(
yield from self._add_ranges(
all_items_count,
chunk_size,
current_position,
Expand All @@ -61,32 +49,23 @@ def _add_ranges(
batch: int,
current_position: int,
previous_position: int,
) -> List[Tuple[str, str]]:
ranges = []
) -> Generator[Tuple[str, str], None, None]:
while current_position < all_items_count:
start = self._position_to_hex(previous_position)
stop = self._position_to_hex(current_position)
ranges.append(
(
start[: self.chunk_hash_length],
stop[: self.chunk_hash_length],
),
)
yield start, stop
previous_position = current_position
current_position += batch
start = self._position_to_hex(previous_position)
stop = self.hex_f * self.hash_max_length
ranges.append(
(start[: self.chunk_hash_length], stop[: self.chunk_hash_length]),
)
return ranges
stop = self.hex_f * self.chunk_hash_length
yield start, stop

def _get_positions(
self,
all_items_count: int,
batch_limit: int,
) -> Tuple[int, int, int, int]:
scale = self.hex_base**self.hash_ranges_accuracy / all_items_count
scale = self.hex_base ** self.hash_ranges_accuracy / all_items_count
batch_limit = math.ceil(batch_limit * scale)
all_items_count *= scale
previous_position = 0
Expand All @@ -104,4 +83,4 @@ def _position_to_hex(self, position: int) -> str:
zeros_count = self.hash_ranges_accuracy - len(hexed)
hexed = self.hex_zero * zeros_count + hexed
hexed += self.hex_zero * (self.hash_max_length - len(hexed))
return hexed
return hexed[: self.chunk_hash_length]
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
name = "hash_chunker"
homepage = "https://github.com/whysage/hash_chunker"
repository = "https://github.com/whysage/hash_chunker"
version = "0.1.0"
description = "Helper that generates hash chunks for distributed data processing."
version = "0.1.1"
description = "Generator that yields hash chunks for distributed data processing."
authors = ["Volodymyr Kochetkov <[email protected]>"]
license = "MIT"
readme = "README.md"
Expand Down
8 changes: 4 additions & 4 deletions tests/test_hash_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_default_usage(
:param all_items_count: count of all data elements
:param expected: expected chunks
"""
assert HashChunker().get_chunks(chunk_size, all_items_count) == expected
assert list(HashChunker().get_chunks(chunk_size, all_items_count)) == expected


@pytest.mark.parametrize(
Expand All @@ -33,19 +33,19 @@ def test_default_usage(
(1, 2, 5, [("00000", "80000"), ("80000", "fffff")]),
],
)
def test_hash_length(
def test_chunk_hash_length(
chunk_size: int,
all_items_count: int,
chunk_hash_length: int,
expected: List[Tuple[str, str]],
) -> None:
"""
Simple test.
Test chunk_hash_length option.
:param chunk_size: chunk elements limit
:param all_items_count: count of all data elements
:param chunk_hash_length: chunks hash length
:param expected: expected chunks
"""
hash_chunker = HashChunker(chunk_hash_length=chunk_hash_length)
assert hash_chunker.get_chunks(chunk_size, all_items_count) == expected
assert list(hash_chunker.get_chunks(chunk_size, all_items_count)) == expected

0 comments on commit 7af40b2

Please sign in to comment.