diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c122535..dca0762 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -37,11 +37,11 @@ jobs: - name: Check coverage report if: github.ref != 'refs/heads/main' - uses: orgoro/coverage@v3.1 + uses: orgoro/coverage@v3.2 with: coverageFile: coverage.xml token: ${{ secrets.GITHUB_TOKEN }} - thresholdAll: .99 + thresholdAll: 1 thresholdNew: 1 thresholdModified: 1 @@ -51,29 +51,9 @@ jobs: - uses: actions/checkout@v4 - name: Install linters - # black is synced with the .pre-commit-hooks version run: | python -m pip install --upgrade pip - python -m pip install .[dev] bandit[toml] pycodestyle pylint + python -m pip install .[dev] - - name: Run pycodestyle - # E203: pycodestyle is a little too rigid about slices & whitespace - # See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#slices - # W503: a default ignore that we are restoring - run: | - pycodestyle --max-line-length=100 --ignore=E203,W503 . - - - name: Run pylint - if: success() || failure() # still run pylint if above checks fail - run: | - pylint cumulus_fhir_support tests - - - name: Run bandit - if: success() || failure() # still run bandit if above checks fail - run: | - bandit -c pyproject.toml -r . - - - name: Run black - if: success() || failure() # still run black if above checks fails - run: | - black --check --verbose . + - name: Run ruff + run: ruff check --output-format=github . 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ddb1ff..4065597 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,10 @@ repos: - - repo: https://github.com/psf/black - #this version is synced with the black mentioned in .github/workflows/ci.yml - rev: 24.4.2 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.8.3 # keep in rough sync with pyproject.toml hooks: - - id: black - entry: bash -c 'black "$@"; git add -u' -- + - name: Ruff formatting + id: ruff-format + entry: bash -c 'ruff format --force-exclude "$@"; git add -u' -- + - name: Ruff linting + id: ruff + stages: [pre-push] diff --git a/README.md b/README.md index 6bcfc58..33a8f06 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,106 @@ This library holds FHIR support code for the Cumulus project as a whole. pip install cumulus-fhir-support ``` -## Examples +## API + +### list_multiline_json_in_dir + +Lists available multiline JSON files in the target directory +(allowing filtering by FHIR resource). + +```python3 +import cumulus_fhir_support + +cumulus_fhir_support.list_multiline_json_in_dir("/") +# { +# "/random.jsonl": None, +# "/con1.ndjson": "Condition", +# "/pat1.jsonl": "Patient", +# } + +cumulus_fhir_support.list_multiline_json_in_dir("/", "Patient") +# { +# "/pat1.jsonl": "Patient", +# } + +cumulus_fhir_support.list_multiline_json_in_dir("/", ["Condition", "Patient"]) +# { +# "/con1.ndjson": "Condition", +# "/pat1.jsonl": "Patient", +# } + +cumulus_fhir_support.list_multiline_json_in_dir("/does-not-exist/") +# {} + +cumulus_fhir_support.list_multiline_json_in_dir("s3://mybucket/", fsspec_fs=s3_fs) +# { +# "/mybucket/procs.ndjson": "Procedure", +# } +``` + +### read_multiline_json + +Iterates over a single multiline JSON file. 
+ +```python3 +import cumulus_fhir_support + +list(cumulus_fhir_support.read_multiline_json("/pat1.jsonl")) +# [ +# {"resourceType": "Patient", "id": "pat1", "birthDate": "2020-10-16"}, +# {"resourceType": "Patient", "id": "pat2", "birthDate": "2013-04-18"}, +# ] + +list(cumulus_fhir_support.read_multiline_json("/does-not-exist.ndjson")) +# [] + +list(cumulus_fhir_support.read_multiline_json("/mybucket/procs.ndjson", fsspec_fs=s3_fs)) +# [ +# {"resourceType": "Procedure", "id": "proc1", "status": "stopped"}, +# ] +``` + +### read_multiline_json_from_dir + +Iterates over every JSON object in a directory +(allowing filtering by FHIR resource). + +```python3 +import cumulus_fhir_support + +list(cumulus_fhir_support.read_multiline_json_from_dir("/")) +# [ +# {"description": "not a fhir object"}, +# {"resourceType": "Condition", "id": "con1", "onsetDateTime": "2011-11-24"}, +# {"resourceType": "Patient", "id": "pat1", "birthDate": "2020-10-16"}, +# {"resourceType": "Patient", "id": "pat2", "birthDate": "2013-04-18"}, +# ] + +list(cumulus_fhir_support.read_multiline_json_from_dir("/", "Condition")) +# [ +# {"resourceType": "Condition", "id": "con1", "onsetDateTime": "2011-11-24"}, +# ] + +list(cumulus_fhir_support.read_multiline_json_from_dir("/", ["Condition", "Patient"])) +# [ +# {"resourceType": "Condition", "id": "con1", "onsetDateTime": "2011-11-24"}, +# {"resourceType": "Patient", "id": "pat1", "birthDate": "2020-10-16"}, +# {"resourceType": "Patient", "id": "pat2", "birthDate": "2013-04-18"}, +# ] + +list(cumulus_fhir_support.read_multiline_json_from_dir("/does-not-exist/")) +# [] + +list(cumulus_fhir_support.read_multiline_json_from_dir("/mybucket/", fsspec_fs=s3_fs)) +# [ +# {"resourceType": "Procedure", "id": "proc1", "status": "stopped"}, +# ] +``` ### pyarrow_schema_from_rows +Calculates a schema that can cover a given collection of FHIR objects. 
+ ```python3 import cumulus_fhir_support @@ -27,9 +123,9 @@ rows = [ "code": "2135-2", "display": "Hispanic or Latino", "system": "urn:oid:2.16.840.1.113883.6.238", - } + }, }], - }] + }], }, ] diff --git a/cumulus_fhir_support/json.py b/cumulus_fhir_support/json.py index 04f0984..6eadbe1 100644 --- a/cumulus_fhir_support/json.py +++ b/cumulus_fhir_support/json.py @@ -41,7 +41,8 @@ import logging import os import pathlib -from typing import TYPE_CHECKING, Any, Iterable, Optional, Union +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any, Optional, Union if TYPE_CHECKING: import fsspec # pragma: no cover diff --git a/cumulus_fhir_support/schemas.py b/cumulus_fhir_support/schemas.py index 1b25a52..f4a654b 100644 --- a/cumulus_fhir_support/schemas.py +++ b/cumulus_fhir_support/schemas.py @@ -1,7 +1,8 @@ """Detect FHIR resource schemas""" from collections import namedtuple -from typing import Any, Iterable, Optional +from collections.abc import Iterable +from typing import Any, Optional import pyarrow from fhirclient.models import ( @@ -14,7 +15,6 @@ fhirelementfactory, ) - FhirProperty = namedtuple( "FhirProperty", ["name", "json_name", "pytype", "is_list", "of_many", "required"] ) @@ -24,7 +24,9 @@ LEVEL_INCLUSION = 1 -def pyarrow_schema_from_rows(resource_type: str, rows: Iterable[dict] = None) -> pyarrow.Schema: +def pyarrow_schema_from_rows( + resource_type: str, rows: Optional[Iterable[dict]] = None +) -> pyarrow.Schema: """ Creates a PyArrow schema based off the named resource (like 'Observation') and row contents. 
@@ -175,7 +177,7 @@ def _fhir_to_pyarrow_property( prop: FhirProperty, *, base_obj: Optional[fhirabstractbase.FHIRAbstractBase] = None, - batch_shape: dict = None, + batch_shape: Optional[dict] = None, level: int, ) -> Optional[pyarrow.Field]: """Converts a single FhirProperty to a PyArrow Field, or None if this field should be skipped""" diff --git a/pyproject.toml b/pyproject.toml index bf3677e..62c5def 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ dependencies = [ authors = [ { name="Michael Terry", email="michael.terry@childrens.harvard.edu" }, ] -description = "FHIR schema support code for the Cumulus project" +description = "FHIR support code for the Cumulus project" readme = "README.md" license = { text="Apache License 2.0" } classifiers = [ @@ -33,12 +33,6 @@ include = [ "*.md", ] -[tool.bandit] -exclude_dirs = ["tests"] - -[tool.black] -line-length = 100 - [project.optional-dependencies] tests = [ "ddt", @@ -46,6 +40,28 @@ tests = [ "pytest-cov", ] dev = [ - "black >= 24, < 25", "pre-commit", -] \ No newline at end of file + # Ruff is using minor versions for breaking changes until their 1.0 release. + # See https://docs.astral.sh/ruff/versioning/ + "ruff < 0.9", # keep in rough sync with .pre-commit-config.yaml +] + +[tool.ruff] +line-length = 100 + +[tool.ruff.lint] +allowed-confusables = ["’"] # allow proper apostrophes +select = [ + "A", # prevent using names that shadow python builtins + "E", # pycodestyle + "F", # pyflakes + "I", # isort + "PLE", # pylint errors + "PLW", # pylint warnings + "RUF", # the ruff developers' own rules + "S", # bandit security warnings + "UP", # alert you when better syntax is available in your python version +] + +[tool.ruff.lint.per-file-ignores] +"**/__init__.py" = ["F401"] # init files hold API, so not using imports is intentional