Merge pull request #1 from whysage/develop

Develop
whysage · Jul 28, 2022 · 4e2376e · 4e2376e
2 parents 554ba7d + 7af40b2
commit 4e2376e
Show file tree

Hide file tree

Showing 11 changed files with 1,534 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,10 @@
+__pycache__
+.idea
+.cache
+.mypy_cache
+.coverage
+venv
+.venv
+coverage.xml
+report.xml
+*.pytest_cache
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,61 @@
+fail_fast: true
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v2.3.0
+  hooks:
+    - id: check-yaml
+    - id: end-of-file-fixer
+    - id: trailing-whitespace
+
+- repo: https://github.com/asottile/add-trailing-comma
+  rev: v2.2.1
+  hooks:
+    - id: add-trailing-comma
+
+- repo: local
+
+  hooks:
+
+    - id: black
+      name: black
+      entry: black
+      language: system
+      types: [ python ]
+      args: [ --line-length=88, --target-version=py39 ]
+
+    - id: autoflake
+      name: autoflake
+      entry: autoflake
+      language: system
+      types: [ python ]
+      args: [
+        --in-place,
+        --remove-all-unused-imports,
+        --remove-duplicate-keys,
+        --ignore-init-module-imports,
+      ]
+
+    - id: isort
+      name: isort
+      entry: isort
+      language: system
+      types: [ python ]
+
+    - id: flake8
+      name: flake8
+      entry: flake8
+      language: system
+      types: [ python ]
+
+    - id: pylint
+      name: pylint
+      entry: pylint
+      language: system
+      types: [ python ]
+
+    - id: mypy
+      name: mypy
+      entry: bash -c 'mypy hash_chunker'
+      language: system
+      types: [ python ]
+      verbose: true
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2022 whysage
+Copyright (c) 2022 Volodymyr Kochetkov (whysage)
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+# Hash Chunker
diff --git a/hash_chunker/__init__.py b/hash_chunker/__init__.py
@@ -0,0 +1,2 @@
+"""Hash Chunker helper to provide hash ranges for distributed data processing."""
+from .hash_chunker import HashChunker  # noqa: WPS300,F401
diff --git a/hash_chunker/hash_chunker.py b/hash_chunker/hash_chunker.py
@@ -0,0 +1,86 @@
+"""Hash Chunker helper to provide hash ranges for distributed data processing."""
+import math
+from dataclasses import dataclass
+from typing import Tuple, Generator
+
+
+@dataclass
+class HashChunker(object):
+    """Main module class."""
+
+    chunk_hash_length: int = 10
+    hash_ranges_accuracy: int = 5
+    hash_max_length: int = 32
+    hex_base: int = 16
+    hex_zero: str = "0"
+    hex_f: str = "f"
+    hex_format: str = "x"
+
+    def get_chunks(
+        self,
+        chunk_size: int,
+        all_items_count: int,
+    ) -> Generator[Tuple[str, str], None, None]:
+        """
+        Return hash ranges.
+
+        :param chunk_size: chunk elements limit
+        :param all_items_count: total count of all data elements
+        :return: generator of (start, stop) hash range tuples
+        """
+        if all_items_count == 0 or chunk_size == 0:
+            return
+        (
+            all_items_count,
+            chunk_size,
+            current_position,
+            previous_position,
+        ) = self._get_positions(all_items_count, chunk_size)
+        yield from self._add_ranges(
+            all_items_count,
+            chunk_size,
+            current_position,
+            previous_position,
+        )
+
+    def _add_ranges(
+        self,
+        all_items_count: int,
+        batch: int,
+        current_position: int,
+        previous_position: int,
+    ) -> Generator[Tuple[str, str], None, None]:
+        while current_position < all_items_count:
+            start = self._position_to_hex(previous_position)
+            stop = self._position_to_hex(current_position)
+            yield start, stop
+            previous_position = current_position
+            current_position += batch
+        start = self._position_to_hex(previous_position)
+        stop = self.hex_f * self.chunk_hash_length
+        yield start, stop
+
+    def _get_positions(
+        self,
+        all_items_count: int,
+        batch_limit: int,
+    ) -> Tuple[int, int, int, int]:
+        scale = self.hex_base ** self.hash_ranges_accuracy / all_items_count
+        batch_limit = math.ceil(batch_limit * scale)
+        all_items_count *= scale
+        previous_position = 0
+        current_position = batch_limit
+        return (
+            all_items_count,
+            batch_limit,
+            current_position,
+            previous_position,
+        )
+
+    def _position_to_hex(self, position: int) -> str:
+        hexed = format(position, self.hex_format)
+        if len(hexed) < self.hash_ranges_accuracy:
+            zeros_count = self.hash_ranges_accuracy - len(hexed)
+            hexed = self.hex_zero * zeros_count + hexed
+        hexed += self.hex_zero * (self.hash_max_length - len(hexed))
+        return hexed[: self.chunk_hash_length]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		"""Hash Chunker helper to provide hash ranges for distributed data processing."""
		from .hash_chunker import HashChunker # noqa: WPS300,F401