Skip to content

Commit

Permalink
Merge pull request #1 from whysage/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
whysage authored Jul 28, 2022
2 parents 554ba7d + 7af40b2 commit 4e2376e
Show file tree
Hide file tree
Showing 11 changed files with 1,534 additions and 1 deletion.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
__pycache__
.idea
.cache
.mypy_cache
.coverage
venv
.venv
coverage.xml
report.xml
*.pytest_cache
61 changes: 61 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
fail_fast: true
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace

- repo: https://github.com/asottile/add-trailing-comma
rev: v2.2.1
hooks:
- id: add-trailing-comma

- repo: local

hooks:

- id: black
name: black
entry: black
language: system
types: [ python ]
args: [ --line-length=88, --target-version=py39 ]

- id: autoflake
name: autoflake
entry: autoflake
language: system
types: [ python ]
args: [
--in-place,
--remove-all-unused-imports,
--remove-duplicate-keys,
--ignore-init-module-imports,
]

- id: isort
name: isort
entry: isort
language: system
types: [ python ]

- id: flake8
name: flake8
entry: flake8
language: system
types: [ python ]

- id: pylint
name: pylint
entry: pylint
language: system
types: [ python ]

- id: mypy
name: mypy
entry: bash -c 'mypy hash_chunker'
language: system
types: [ python ]
verbose: true
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2022 whysage
Copyright (c) 2022 Volodymyr Kochetkov (whysage)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Hash Chunker
2 changes: 2 additions & 0 deletions hash_chunker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Hash Chunker helper to provide hash ranges for distributed data processing."""
from .hash_chunker import HashChunker # noqa: WPS300,F401
86 changes: 86 additions & 0 deletions hash_chunker/hash_chunker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Hash Chunker helper to provide hash ranges for distributed data processing."""
import math
from dataclasses import dataclass
from typing import Tuple, Generator


@dataclass
class HashChunker(object):
    """Split a hexadecimal hash space into consecutive ``(start, stop)`` ranges.

    The space holds ``hex_base ** hash_ranges_accuracy`` positions. It is cut
    into steps proportional to ``chunk_size / all_items_count`` and every
    boundary is rendered as a fixed-width hex string.
    """

    # Length of every hex string emitted as a range boundary.
    chunk_hash_length: int = 10
    # Number of hex digits used to place boundaries inside the hash space.
    hash_ranges_accuracy: int = 5
    # Width a boundary is right-padded to (with hex_zero) before truncation.
    hash_max_length: int = 32
    # Numeric base of the hash alphabet.
    hex_base: int = 16
    # Padding character for boundaries.
    hex_zero: str = "0"
    # Character repeated to build the upper bound of the final range.
    hex_f: str = "f"
    # Format spec handed to ``format`` when rendering a position.
    hex_format: str = "x"

    def get_chunks(
        self,
        chunk_size: int,
        all_items_count: int,
    ) -> Generator[Tuple[str, str], None, None]:
        """
        Yield hash ranges sized for roughly ``chunk_size`` items each.

        Each range is a ``(start, stop)`` pair of hex strings of length
        ``chunk_hash_length``; the ``stop`` of one range is the ``start`` of
        the next, and the last range stops at ``hex_f`` repeated.

        :param chunk_size: chunk elements limit
        :param all_items_count: count of all data elements
        :return: generator of ``(start, stop)`` hex string pairs; nothing is
            yielded when either argument is zero or negative
        """
        # A non-positive value gives a non-positive step, and the range loop
        # would then never reach the end of the hash space.
        if all_items_count <= 0 or chunk_size <= 0:
            return
        (
            hash_space,
            step,
            current_position,
            previous_position,
        ) = self._get_positions(all_items_count, chunk_size)
        yield from self._add_ranges(
            hash_space,
            step,
            current_position,
            previous_position,
        )

    def _add_ranges(
        self,
        all_items_count: int,
        batch: int,
        current_position: int,
        previous_position: int,
    ) -> Generator[Tuple[str, str], None, None]:
        """
        Yield ranges by walking positions in steps of ``batch``.

        :param all_items_count: upper limit of the positions (scaled space size)
        :param batch: distance between two consecutive boundaries
        :param current_position: first upper boundary
        :param previous_position: first lower boundary
        :return: generator of ``(start, stop)`` hex string pairs
        """
        while current_position < all_items_count:
            yield (
                self._position_to_hex(previous_position),
                self._position_to_hex(current_position),
            )
            previous_position = current_position
            current_position += batch
        # The closing range always runs up to the largest hash value.
        yield (
            self._position_to_hex(previous_position),
            self.hex_f * self.chunk_hash_length,
        )

    def _get_positions(
        self,
        all_items_count: int,
        batch_limit: int,
    ) -> Tuple[int, int, int, int]:
        """
        Scale item counts to positions inside the hash space.

        :param all_items_count: count of all data elements (must not be zero)
        :param batch_limit: chunk elements limit
        :return: ``(space size, step, first upper boundary, first lower boundary)``
        """
        # Scaling all_items_count to the hash space always gives the space
        # size itself, so use the exact integer instead of a float product.
        hash_space = self.hex_base ** self.hash_ranges_accuracy
        # One division of the exact integer product avoids the double
        # rounding of ``batch_limit * (hash_space / all_items_count)``.
        step = math.ceil(batch_limit * hash_space / all_items_count)
        previous_position = 0
        current_position = step
        return (
            hash_space,
            step,
            current_position,
            previous_position,
        )

    def _position_to_hex(self, position: int) -> str:
        """
        Render a position as a fixed-width hex boundary.

        :param position: position inside the hash space
        :return: hex string of length ``chunk_hash_length``
        """
        hexed = format(position, self.hex_format)
        # Left-pad to the accuracy width; a non-positive repeat count gives "".
        hexed = self.hex_zero * (self.hash_ranges_accuracy - len(hexed)) + hexed
        # Right-pad to the full hash width, then keep the leading part only.
        hexed += self.hex_zero * (self.hash_max_length - len(hexed))
        return hexed[: self.chunk_hash_length]
Loading

0 comments on commit 4e2376e

Please sign in to comment.