diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index be1304f..cbf9c23 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -10,8 +10,10 @@ jobs:
   validate_code:
     name: Validate code in spark container
     runs-on: ubuntu-latest
+    if: ${{ github.ref != 'refs/heads/main' }}
+
     container:
-      image: esdcrproduction.azurecr.io/spark:v1.1.13-bitnami-3.1.2-python-3.9.6-0
+      image: esdcrproduction.azurecr.io/spark:v1.3.2-bitnami-3.2.0-python-3.9.7-0
       credentials:
         username: ${{ secrets.AZCR_PROD_USER }}
         password: ${{ secrets.AZCR_PROD_TOKEN }}
@@ -19,76 +21,34 @@ jobs:
     steps:
       - uses: actions/checkout@v2
-      - name: Prepare venv
+      - name: Install Poetry
         run: |
           set -e
-          python -m virtualenv hadoopwrapper
-          . hadoopwrapper/bin/activate
-          pip install -r ./requirements.txt
-          pip install -r ./requirements-dev.txt
-      - name: Lint
+          curl -sSL https://install.python-poetry.org | python3 - --preview
+      - name: Install Dependencies
         run: |
           set -e
-          pypath=$(pwd)
-          export PYTHONPATH="$pypath/src:$PYTHONPATH"
-
-          . hadoopwrapper/bin/activate
-          find ./src/hadoop_fs_wrapper -type f -name "*.py" | xargs pylint
-      - name: Unit test
+          /github/home/.local/bin/poetry install
+      - name: Lint
         run: |
           set -e
-
-          pypath=$(pwd)
-          export PYTHONPATH="$pypath/src:$PYTHONPATH"
-
-          . hadoopwrapper/bin/activate
-          pytest ./test
-
-  create_release:
-    name: Create Release
-    runs-on: ubuntu-latest
-    needs: [ validate_code ]
-    if: ${{ github.ref == 'refs/heads/main' }}
-
-    steps:
-      - uses: actions/checkout@v2
-      - run: git fetch --prune --unshallow
-      - name: Create Release
-        uses: SneaksAndData/github-actions/semver_release@v0.0.2
-        with:
-          major_v: 0
-          minor_v: 4
-  release_to_pypi_test:
-    name: Release distribution to test.pypi.org
-    runs-on: ubuntu-latest
-    needs: [ create_release ]
+          pypath=$(pwd)
+          export PYTHONPATH="$pypath:$PYTHONPATH"
 
-    steps:
-      - uses: actions/checkout@v2
-      - run: git fetch --prune --unshallow
-      - uses: actions/setup-python@v2
-        with:
-          python-version: '3.8.x'
-      - name: Build wheel
+          find ./hadoop_fs_wrapper -type f -name "*.py" | xargs /github/home/.local/bin/poetry run pylint
+      - name: Unit test
         run: |
           set -e
-
-          version=$(git describe --tags --abbrev=7)
-
-          pip install virtualenv
-          python -m virtualenv hadoopwrapper
-
-          . hadoopwrapper/bin/activate
-          pip install --upgrade twine build
-
-          echo "__version__ = '$version'" > ./src/hadoop_fs_wrapper/_version.py
-
-          python -m build --sdist --wheel
-      - name: Publish distribution 📦 to Test PyPI
-        uses: pypa/gh-action-pypi-publish@master
+
+          pypath=$(pwd)
+          export PYTHONPATH="$pypath:$PYTHONPATH"
+
+          /github/home/.local/bin/poetry run pytest ./test --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-report=term-missing:skip-covered | tee pytest-coverage.txt
+      - name: Publish Code Coverage
+        uses: MishaKav/pytest-coverage-comment@main
         with:
-          password: ${{ secrets.PYPI_TEST_API_TOKEN }}
-          repository_url: https://test.pypi.org/legacy/
+          pytest-coverage-path: ./pytest-coverage.txt
+          junitxml-path: ./junit/test-results.xml
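Note that the `PYTHONPATH` export changes from `$pypath/src` to `$pypath` because the package moves from a `src/` layout to a flat layout later in this diff. A minimal sanity check of what the Lint and Unit test steps now rely on, assuming the repository root is the working directory (illustrative sketch, not part of the workflow):

```python
# Sketch: with the repo root on PYTHONPATH, the package must import directly
# from ./hadoop_fs_wrapper (no ./src prefix). The check itself is hypothetical.
import importlib

module = importlib.import_module("hadoop_fs_wrapper.wrappers.file_system")
assert hasattr(module, "FileSystem"), "flat-layout import failed"
```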
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
deleted file mode 100644
index eae702e..0000000
--- a/.github/workflows/deploy.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: Release to PyPi
-
-on: workflow_dispatch
-jobs:
-  release_to_pypi:
-    name: Release distribution to pypi.org
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v2
-      - run: git fetch --prune --unshallow
-      - uses: actions/setup-python@v2
-        with:
-          python-version: '3.8.x'
-      - name: Build wheel
-        if: startsWith(github.ref, 'refs/tags')
-        run: |
-          set -e
-
-          version=$(git describe --tags --abbrev=7)
-
-          pip install virtualenv
-          python -m virtualenv hadoopwrapper
-
-          . hadoopwrapper/bin/activate
-          pip install --upgrade twine build
-
-          echo "__version__ = '$version'" > ./src/hadoop_fs_wrapper/_version.py
-
-          python -m build --sdist --wheel
-      - name: Publish distribution 📦 to PyPI
-        uses: pypa/gh-action-pypi-publish@master
-        with:
-          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/prepare_release.yaml b/.github/workflows/prepare_release.yaml
new file mode 100644
index 0000000..1d157e4
--- /dev/null
+++ b/.github/workflows/prepare_release.yaml
@@ -0,0 +1,19 @@
+name: Prepare GH Release
+
+on: workflow_dispatch
+
+jobs:
+  create_release:
+    name: Create Release
+    runs-on: ubuntu-latest
+    if: ${{ github.ref == 'refs/heads/main' }}
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - name: Create Release
+        uses: SneaksAndData/github-actions/semver_release@v0.0.2
+        with:
+          major_v: 0
+          minor_v: 4
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
new file mode 100644
index 0000000..4b66406
--- /dev/null
+++ b/.github/workflows/release.yaml
@@ -0,0 +1,51 @@
+name: Release a new version
+
+on: workflow_dispatch
+jobs:
+  release_to_pypi:
+    name: Release distribution to PyPi
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.8.x'
+          architecture: 'x64'
+      - name: Install Poetry and prepare version
+        run: |
+          set -e
+
+          curl -sSL https://install.python-poetry.org | python3 - --preview
+
+          version=$(git describe --tags --abbrev=7)
+          sed -i "s/version = \"0.0.0\"/version = \"${version:1}\"/" pyproject.toml
+          echo "__version__ = '${version:1}'" > ./hadoop_fs_wrapper/_version.py
+
+      - name: Configure Test PyPi
+        if: ${{ github.ref == 'refs/heads/main' }}
+        env:
+          PYPI_TEST_TOKEN: ${{ secrets.PYPI_TEST_API_TOKEN }}
+        run: |
+          set -e
+
+          poetry config repositories.test-pypi https://test.pypi.org/legacy/
+          poetry config pypi-token.test-pypi $PYPI_TEST_TOKEN
+
+      - name: Publish distribution 📦 to test PyPI
+        if: ${{ github.ref == 'refs/heads/main' }}
+        run: |
+          set -e
+
+          poetry build && poetry publish -r test-pypi
+
+      - name: Publish distribution 📦 to PyPI
+        env:
+          POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_API_TOKEN }}
+        if: ${{ startsWith(github.ref, 'refs/tags') }}
+        run: |
+          set -e
+
+          poetry build && poetry publish
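The release workflow stamps the version twice: `sed` patches the `0.0.0` placeholder in `pyproject.toml`, and `_version.py` is regenerated, with `${version:1}` dropping the leading `v` from the `git describe` output. A rough Python equivalent of those two shell lines, sketched under the assumption that tags look like `v0.4.x`:

```python
import subprocess

def stamp_version() -> None:
    """Mirror the two shell lines in release.yaml: derive the version from
    `git describe --tags --abbrev=7`, strip the leading 'v', then patch
    pyproject.toml and hadoop_fs_wrapper/_version.py."""
    raw = subprocess.check_output(
        ["git", "describe", "--tags", "--abbrev=7"], text=True
    ).strip()
    version = raw[1:]  # equivalent of ${version:1}

    with open("pyproject.toml", encoding="utf-8") as handle:
        content = handle.read()
    with open("pyproject.toml", "w", encoding="utf-8") as handle:
        handle.write(content.replace('version = "0.0.0"', f'version = "{version}"'))

    with open("hadoop_fs_wrapper/_version.py", "w", encoding="utf-8") as handle:
        handle.write(f"__version__ = '{version}'\n")
```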
diff --git a/README.md b/README.md
index f4ff53b..f40b89b 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ Select a version that matches hadoop version you are using:
 ## Usage
 Common use case is accessing Hadoop FileSystem from Spark session object:
+
 ```python
 from hadoop_fs_wrapper.wrappers.file_system import FileSystem
 
@@ -24,6 +25,8 @@ file_system = FileSystem.from_spark_session(spark=spark_session)
 Then, for example, one can check if there are any files under specified path:
 ```python
+from hadoop_fs_wrapper.wrappers.file_system import FileSystem
+
 def is_valid_source_path(file_system: FileSystem, path: str) -> bool:
     """
      Checks whether a regexp path refers to a valid set of paths
diff --git a/src/hadoop_fs_wrapper/__init__.py b/hadoop_fs_wrapper/__init__.py
similarity index 100%
rename from src/hadoop_fs_wrapper/__init__.py
rename to hadoop_fs_wrapper/__init__.py
diff --git a/src/hadoop_fs_wrapper/_version.py b/hadoop_fs_wrapper/_version.py
similarity index 100%
rename from src/hadoop_fs_wrapper/_version.py
rename to hadoop_fs_wrapper/_version.py
diff --git a/src/hadoop_fs_wrapper/models/__init__.py b/hadoop_fs_wrapper/models/__init__.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/__init__.py
rename to hadoop_fs_wrapper/models/__init__.py
diff --git a/src/hadoop_fs_wrapper/models/buffered_input_stream.py b/hadoop_fs_wrapper/models/buffered_input_stream.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/buffered_input_stream.py
rename to hadoop_fs_wrapper/models/buffered_input_stream.py
diff --git a/src/hadoop_fs_wrapper/models/buffered_output_stream.py b/hadoop_fs_wrapper/models/buffered_output_stream.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/buffered_output_stream.py
rename to hadoop_fs_wrapper/models/buffered_output_stream.py
diff --git a/src/hadoop_fs_wrapper/models/buffered_reader.py b/hadoop_fs_wrapper/models/buffered_reader.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/buffered_reader.py
rename to hadoop_fs_wrapper/models/buffered_reader.py
diff --git a/src/hadoop_fs_wrapper/models/file_status.py b/hadoop_fs_wrapper/models/file_status.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/file_status.py
rename to hadoop_fs_wrapper/models/file_status.py
diff --git a/src/hadoop_fs_wrapper/models/fs_data_input_stream.py b/hadoop_fs_wrapper/models/fs_data_input_stream.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/fs_data_input_stream.py
rename to hadoop_fs_wrapper/models/fs_data_input_stream.py
diff --git a/src/hadoop_fs_wrapper/models/fs_data_output_stream.py b/hadoop_fs_wrapper/models/fs_data_output_stream.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/fs_data_output_stream.py
rename to hadoop_fs_wrapper/models/fs_data_output_stream.py
diff --git a/src/hadoop_fs_wrapper/models/glob_filter.py b/hadoop_fs_wrapper/models/glob_filter.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/glob_filter.py
rename to hadoop_fs_wrapper/models/glob_filter.py
diff --git a/src/hadoop_fs_wrapper/models/hadoop_file_status.py b/hadoop_fs_wrapper/models/hadoop_file_status.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/hadoop_file_status.py
rename to hadoop_fs_wrapper/models/hadoop_file_status.py
diff --git a/src/hadoop_fs_wrapper/models/hadoop_fs_path.py b/hadoop_fs_wrapper/models/hadoop_fs_path.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/hadoop_fs_path.py
rename to hadoop_fs_wrapper/models/hadoop_fs_path.py
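The renames above move every module from `src/hadoop_fs_wrapper/` to `hadoop_fs_wrapper/`, so import paths are unchanged for consumers. A compact usage sketch stitched from the two README fragments, assuming a local Spark session with Hadoop libraries on the classpath:

```python
# End-to-end sketch based on the README excerpts in this diff.
from pyspark.sql import SparkSession
from hadoop_fs_wrapper.wrappers.file_system import FileSystem

spark_session = SparkSession.builder.master("local[1]").getOrCreate()

# Entry point documented in the README: wrap the session's Hadoop FileSystem.
file_system = FileSystem.from_spark_session(spark=spark_session)
```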
diff --git a/src/hadoop_fs_wrapper/models/input_stream_reader.py b/hadoop_fs_wrapper/models/input_stream_reader.py
similarity index 89%
rename from src/hadoop_fs_wrapper/models/input_stream_reader.py
rename to hadoop_fs_wrapper/models/input_stream_reader.py
index 45aaedd..edabc9f 100644
--- a/src/hadoop_fs_wrapper/models/input_stream_reader.py
+++ b/hadoop_fs_wrapper/models/input_stream_reader.py
@@ -38,20 +38,20 @@ def __init__(self, underlying):
     @classmethod
     def from_input_stream_and_charset(cls, jvm, input_stream, charset_name):
         """
-        Wraps constructor java.io.InputStreamReader​(InputStream in, String charsetName)
+        Wraps constructor java.io.InputStreamReader(InputStream in, String charsetName)
         Creates an InputStreamReader that uses the named charset.
         :param input_stream: An InputStream
         :param charset_name: The name of a supported charset
-        :return: InputStreamReader​
+        :return: InputStreamReader
         """
         return cls(jvm.java.io.InputStreamReader(input_stream,charset_name))
 
     @classmethod
     def from_input_stream(cls, jvm, input_stream):
         """
-        Wraps constructor java.io.InputStreamReader​(InputStream in)
+        Wraps constructor java.io.InputStreamReader(InputStream in)
         Creates an InputStreamReader that uses the default charset.
         :param input_stream: An InputStream
-        :return: InputStreamReader​
+        :return: InputStreamReader
        """
         return cls(jvm.java.io.InputStreamReader(input_stream))
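The docstring change above removes invisible zero-width characters (apparently U+200B, pasted in from the Java API docs) after `InputStreamReader`; the visible text is otherwise identical. For context, a hedged sketch of how these two classmethods are called through py4j, assuming the `jvm` handle comes from an active Spark session and the stream argument is any `java.io.InputStream`:

```python
from hadoop_fs_wrapper.models.input_stream_reader import InputStreamReader

def open_reader(jvm, java_input_stream, charset_name=None):
    # Dispatch mirrors the two wrapped constructors shown in the diff:
    # java.io.InputStreamReader(InputStream) and
    # java.io.InputStreamReader(InputStream, String charsetName).
    if charset_name:
        return InputStreamReader.from_input_stream_and_charset(
            jvm, java_input_stream, charset_name
        )
    return InputStreamReader.from_input_stream(jvm, java_input_stream)
```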
diff --git a/src/hadoop_fs_wrapper/models/remote_iterator.py b/hadoop_fs_wrapper/models/remote_iterator.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/remote_iterator.py
rename to hadoop_fs_wrapper/models/remote_iterator.py
diff --git a/src/hadoop_fs_wrapper/models/uri.py b/hadoop_fs_wrapper/models/uri.py
similarity index 100%
rename from src/hadoop_fs_wrapper/models/uri.py
rename to hadoop_fs_wrapper/models/uri.py
diff --git a/src/hadoop_fs_wrapper/wrappers/__init__.py b/hadoop_fs_wrapper/wrappers/__init__.py
similarity index 100%
rename from src/hadoop_fs_wrapper/wrappers/__init__.py
rename to hadoop_fs_wrapper/wrappers/__init__.py
diff --git a/src/hadoop_fs_wrapper/wrappers/file_system.py b/hadoop_fs_wrapper/wrappers/file_system.py
similarity index 100%
rename from src/hadoop_fs_wrapper/wrappers/file_system.py
rename to hadoop_fs_wrapper/wrappers/file_system.py
diff --git a/src/hadoop_fs_wrapper/wrappers/hadoop_fs_wrapper.py b/hadoop_fs_wrapper/wrappers/hadoop_fs_wrapper.py
similarity index 99%
rename from src/hadoop_fs_wrapper/wrappers/hadoop_fs_wrapper.py
rename to hadoop_fs_wrapper/wrappers/hadoop_fs_wrapper.py
index cf79459..105d8a1 100644
--- a/src/hadoop_fs_wrapper/wrappers/hadoop_fs_wrapper.py
+++ b/hadoop_fs_wrapper/wrappers/hadoop_fs_wrapper.py
@@ -250,12 +250,12 @@ def input_stream_reader(self,
                             input_stream: InputStreamReader,
                             charset_name: str = None) -> InputStreamReader:
         """
-        Wraps constructor java.io.InputStreamReader​(InputStream in)
+        Wraps constructor java.io.InputStreamReader(InputStream in)
         Creates an InputStreamReader that uses the named charset.
 
         :param input_stream: An InputStream
         :param charset_name: The name of a supported charset
-        :return: InputStreamReader​
+        :return: InputStreamReader
         """
         if not charset_name:
             return InputStreamReader.from_input_stream(self._jvm, input_stream.underlying)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b5096f6
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,22 @@
+[tool.poetry]
+name = "hadoop-fs-wrapper"
+version = "0.0.0"
+description = "Python Wrapper for Hadoop Java API"
+authors = ["ECCO Sneaks & Data <esdsupport@ecco.com>"]
+maintainers = ['GZU ', 'JRB ']
+license = 'MIT'
+readme = "README.md"
+repository = 'https://github.com/SneaksAndData/hadoop-fs-wrapper'
+
+[tool.poetry.dependencies]
+python = "^3.8"
+pyspark = "~3.2"
+
+[tool.poetry.dev-dependencies]
+pytest = "^7.0"
+pytest-cov = "^2.12"
+pylint = "^2.12"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/requirements-dev.txt b/requirements-dev.txt
deleted file mode 100644
index 9674b65..0000000
--- a/requirements-dev.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pylint==2.12.2
-pytest==7.1.1
-pytest-mock==3.7.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 53129bb..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pyspark==3.2.1
diff --git a/setup.py b/setup.py
deleted file mode 100644
index f6f8737..0000000
--- a/setup.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import subprocess
-import setuptools
-
-from pathlib import Path
-
-this_directory = Path(__file__).parent
-long_description = (this_directory / "README.md").read_text()
-
-
-def get_version():
-    base_version = subprocess.check_output(["git", "describe", "--tags", "--abbrev=7"]).strip().decode("utf-8")
-    # have to follow PEP440 religious laws here
-    parts = base_version.split('-')
-    if len(parts) == 1:
-        return parts[0]
-    else:
-        (semantic, commit_number, commit_id) = parts
-        return f"{semantic}+{commit_number}.{commit_id}"
-
-
-setuptools.setup(name='hadoop-fs-wrapper',
-                 version=get_version(),
-                 description='Python Wrapper for Hadoop Java API',
-                 long_description=long_description,
-                 long_description_content_type='text/markdown',
-                 author='ECCO Sneaks & Data',
-                 author_email='esdsupport@ecco.com',
-                 classifiers=[
-                     "Programming Language :: Python :: 3",
-                     "License :: OSI Approved :: MIT License",
-                     "Operating System :: OS Independent",
-                 ],
-                 install_requires=[
-                     'pyspark~=3.2.0'
-                 ],
-                 python_requires='>=3.8',
-                 package_dir={"": "src"},
-                 packages=setuptools.find_packages(where="src"), )
diff --git a/test/test_file_system.py b/test/test_file_system.py
index c37bf16..ca42a80 100644
--- a/test/test_file_system.py
+++ b/test/test_file_system.py
@@ -26,7 +26,7 @@
 import pytest
 from pyspark.sql import SparkSession
 
-from hadoop_fs_wrapper.wrappers.file_system import FileSystem
+from hadoop_fs_wrapper.wrappers import FileSystem
 
 
 @pytest.fixture
diff --git a/test/test_parse_hadoop_filestatus.py b/test/test_parse_hadoop_filestatus.py
index e94737a..aae8381 100644
--- a/test/test_parse_hadoop_filestatus.py
+++ b/test/test_parse_hadoop_filestatus.py
@@ -22,8 +22,8 @@
 
 from datetime import datetime
 
-from hadoop_fs_wrapper.models.hadoop_file_status import HadoopFileStatus
-from hadoop_fs_wrapper.models.file_status import FileStatus
+from hadoop_fs_wrapper.models import HadoopFileStatus
+from hadoop_fs_wrapper.models import FileStatus
 
 
 class MockPath:
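The updated tests import `FileSystem`, `HadoopFileStatus`, and `FileStatus` from package level (`hadoop_fs_wrapper.wrappers`, `hadoop_fs_wrapper.models`) rather than from their defining modules. Those imports only resolve if the package `__init__.py` files re-export the names; the `__init__.py` contents are not shown in this diff, so the sketch below is an assumption of what they would need to contain:

```python
# hadoop_fs_wrapper/wrappers/__init__.py -- hypothetical re-export
from hadoop_fs_wrapper.wrappers.file_system import FileSystem

# hadoop_fs_wrapper/models/__init__.py -- hypothetical re-export
from hadoop_fs_wrapper.models.hadoop_file_status import HadoopFileStatus
from hadoop_fs_wrapper.models.file_status import FileStatus
```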