diff --git a/.github/workflows/branch_ci.yml b/.github/workflows/branch_ci.yml new file mode 100644 index 0000000..3fffb33 --- /dev/null +++ b/.github/workflows/branch_ci.yml @@ -0,0 +1,151 @@ +# Workflow that runs on pushes to non-default branches + +name: Non-Default Branch Push CI (Python) + +on: + push: + branches-ignore: ['main'] + paths-ignore: ['README.md'] + +# Specify concurrency such that only one workflow can run at a time +# * Different workflow files are not affected +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +# Registry for storing Container images +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +# Ensure the GitHub token can remove packages +permissions: + packages: write + + +jobs: + + # Job to run a linter and typechecker against the codebase + lint-typecheck: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install editable package and required dependencies + run: uv sync + + - name: Lint package + run: uv run ruff check --output-format=github . + + - name: Typecheck package + run: uv run mypy . 
+ # TODO: GitHub output when https://github.com/python/mypy/pull/17771 merged + + # Job to run unittests + # * Produces a JUnit XML report that can be displayed in the GitHub UI + test-unit: + runs-on: ubuntu-latest + needs: lint-typecheck + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + cache-dependency-glob: "pyproject.toml" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: "pyproject.toml" + + - name: Install editable package and required dependencies + run: uv sync + + # Run unittests + # * Produce JUnit XML report + - name: Run unit tests + env: + CDSAPI_URL: "https://ads.atmosphere.copernicus.eu/api/v2" + CDSAPI_KEY: "fake1:fake33-bogus44-falsehood-key55" + SS_API_KEY: ${{ secrets.SS_API_KEY }} + SS_USER_ID: ${{ secrets.SS_USER_ID }} + run: uv run python -m pytest --junitxml=ut-report.xml tests + + # Create test summary to be visualised on the job summary screen on GitHub + # * Runs even if previous steps fail + - name: Create test summary + uses: test-summary/action@v2 + with: + paths: "*t-report.xml" + show: "fail, skip" + if: always() + + # Job for building container image + # * Builds and pushes an OCI Container image to the registry defined in the environment variables + # * Only runs if test and lint jobs pass + build-container: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + needs: ["lint-typecheck", "test-unit"] + + steps: + # Do a non-shallow clone of the repo to ensure tags are present + # * This allows setuptools-git-versioning to automatically set the version + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ 
secrets.GITHUB_TOKEN }} + + # Tag the built image according to the event type + # The event is a branch push, so tag the image with the branch name + - name: Extract metadata (tags, labels) for Container + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: type=ref,event=branch + + # Build and push the Container image to the registry + # * Creates a multiplatform-aware image + # * Pulls build cache from the registry and pushes new cache back + - name: Build and push container image + uses: docker/build-push-action@v6 + with: + context: . + file: Containerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + platforms: linux/amd64 + cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 85f27d9..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Python CI - -on: - push: - branches: [] - paths-ignore: - - 'README.md' - pull_request: - branches: [] - paths-ignore: - - 'README.md' - -# Specify concurrency such that only one workflow can run at a time -# * Different workflow files are not affected -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - # Define a dependencies job that runs on all branches and PRs - # * Installs dependencies and caches them - build-venv: - runs-on: ubuntu-latest - container: quay.io/condaforge/miniforge3:latest - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Restore cached virtualenv, if available - # * The pyproject.toml hash is part of the cache key, invalidating - # the cache if the file changes - - name: Restore cached virtualenv - id: restore-cache - uses: actions/cache/restore@v3 - with: - path: ./venv - key: ${{ runner.os }}-venv-${{ 
hashFiles('**/pyproject.toml') }} - - # Should mirror the build-venv stage in the Containerfile - - name: Build venv - run: | - apt -qq update && apt -qq install -y build-essential - conda create -p ./venv python=3.11 - ./venv/bin/python -m pip install --upgrade -q pip wheel setuptools - if: steps.restore-cache.outputs.cache-hit != 'true' - - # Should mirror the build-reqs stage in the Containerfile - # * Except this installs the dev dependencies and binaries as well - - name: Install all dependencies - run: | - conda install -p ./venv -q -y eccodes zarr - ./venv/bin/python -m pip install -q .[dev] - if: steps.restore-cache.outputs.cache-hit != 'true' - - # Cache the virtualenv for future runs - - name: Cache virtualenv - uses: actions/cache/save@v3 - with: - path: ./venv - key: ${{ steps.restore-cache.outputs.cache-primary-key }} - if: steps.restore-cache.outputs.cache-hit != 'true' - - # Define a unittest job that runs on all branches and PRs - test-unit: - runs-on: ubuntu-latest - needs: build-venv - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - - # Restore cached virtualenv - - name: Restore cached virtualenv - uses: actions/cache/restore@v3 - with: - path: ./venv - key: ${{ runner.os }}-venv-${{ hashFiles('**/pyproject.toml') }} - - # Run unittests - # * Produce JUnit XML report - - name: Run unit tests - env: - CDSAPI_URL: "https://ads.atmosphere.copernicus.eu/api/v2" - CDSAPI_KEY: "fake1:fake33-bogus44-falsehood-key55" - SS_API_KEY: ${{ secrets.SS_API_KEY }} - SS_USER_ID: ${{ secrets.SS_USER_ID }} - run: ./venv/bin/python -m pytest --junitxml=ut-report.xml tests - - # Create test summary to be visualised on the job summary screen on GitHub - # * Runs even if previous steps fail - - name: Create test summary - uses: test-summary/action@v2 - with: - paths: "*t-report.xml" - show: "fail, skip" - if: always() - diff --git a/.github/workflows/merged_ci.yml b/.github/workflows/merged_ci.yml new file mode 100644 index 0000000..947d5ba --- 
/dev/null +++ b/.github/workflows/merged_ci.yml @@ -0,0 +1,102 @@ +# Workflow that runs on closed PRs to the default branch + +name: Default Branch PR Merged CI (Python) + +on: + pull_request: + types: ["closed"] + branches: ["main"] + +# Specify concurrency such that only one workflow can run at a time +# * Different workflow files are not affected +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: false + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + + +jobs: + + # Define an autotagger job that creates tags on changes to master + # Use #major #minor in merge commit messages to bump version beyond patch + bump-tag: + runs-on: ubuntu-latest + if: | + github.event_name == 'pull_request' && + github.event.action == 'closed' && + github.event.pull_request.merged == true + permissions: + contents: write + outputs: + tag: ${{ steps.tag.outputs.tag }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Bump version and push tag + uses: anothrNick/github-tag-action@1.67.0 + id: tag + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_BRANCHES: main + WITH_V: true + DEFAULT_BUMP: patch + GIT_API_TAGGING: false + + # Job for building container image + # * Builds and pushes an OCI Container image to the registry defined in the environment variables + build-container: + runs-on: ubuntu-latest + needs: bump-tag + permissions: + contents: read + packages: write + + steps: + # Do a non-shallow clone of the repo to ensure tags are present + # * This allows setuptools-git-versioning to automatically set the version + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Tag the built 
image according to the event type + # The event is a semver release, so use the version + - name: Extract metadata (tags, labels) for Container + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: type=semver,pattern={{version}},value=${{ needs.bump-tag.outputs.tag }} + + # Build and push the Container image to the registry + # * Creates a multiplatform-aware image + # * Pulls build cache from the registry and pushes new cache back + - name: Build and push container image + uses: docker/build-push-action@v6 + with: + context: . + file: Containerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + platforms: linux/amd64 + cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max + diff --git a/Containerfile b/Containerfile new file mode 100644 index 0000000..bbc1182 --- /dev/null +++ b/Containerfile @@ -0,0 +1,23 @@ +FROM python:3.12-slim-bookworm +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +ENV DAGSTER_HOME=/opt/dagster/home + +# Add repository code +WORKDIR /opt/dagster/app +COPY src /opt/dagster/app +COPY pyproject.toml /opt/dagster/app + +# Checkout and install dagster libraries needed to run the gRPC server by exposing +# your code location to dagster-webserver and dagster-daemon, and loading the +# DagsterInstance. +RUN uv sync + +EXPOSE 4266 + +# Using CMD rather than RUN allows the command to be overridden in +# run launchers or executors to run other commands using this image. +# This is important as runs are executed inside this container. +ENTRYPOINT ["uv", "run"] +CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4266", "-m", "dagster_dags"] + diff --git a/README.md b/README.md index 17b9a03..1146fb9 100644 --- a/README.md +++ b/README.md @@ -1,90 +1,141 @@ -

-Dagster Dags -
-
-Dagster defintions for OCF's archival datasets -

+# Dagster Dags -
+**Orchestrate data pipelines for ML dataset creation** - - - - GitHub Workflow Status (with branch) - - -
+[![tags badge](https://img.shields.io/github/v/tag/openclimatefix/dagster-dags?include_prereleases&sort=semver&color=7BCDF3)](https://github.com/openclimatefix/dagster-dags/tags) +[![contributors badge](https://img.shields.io/github/contributors/openclimatefix/dagster-dags?color=FFFFFF)](https://github.com/openclimatefix/dagster-dags/graphs/contributors) +[![workflows badge](https://img.shields.io/github/actions/workflow/status/openclimatefix/dagster-dags/branch_ci.yml?branch=main&color=FFD053)](https://github.com/openclimatefix/dagster-dags/actions/workflows/branch_ci.yml) +[![ease of contribution: easy](https://img.shields.io/badge/ease%20of%20contribution:%20easy-32bd50)](https://github.com/openclimatefix/ocf-meta-repo?tab=readme-ov-file#overview-of-ocfs-nowcasting-repositories) -
+In order to train and evaluate an ML model, datasets must be created consistently and reproducibly. +Forecasting renewable energy generation depends on large-timescale weather data: +Numerical Weather Prediction (NWP) data; satellite imagery; +atmospheric quality data. Dagster helps to keep these datasets organised and up to date. -## Ubiquitous language +This repository contains the Dagster definitions that orchestrate the creation of these datasets. -The following terms are used throughout the codebase and documentation. They are defined here to avoid ambiguity. +## Installation - - *InitTime* - The time at which a forecast is initialized. For example, a forecast initialized at 12:00 on 1st January. - - *TargetTime* - The time at which a predicted value is valid. For example, a forecast with InitTime 12:00 on 1st January predicts that the temperature at TargetTime 12:00 on 2nd January at position x will be 10 degrees. +The repository is packaged as a Docker image that can be used as a Dagster +[code server](https://docs.dagster.io/concepts/code-locations/workspace-files#running-your-own-grpc-server) +```bash +$ docker pull ghcr.io/openclimatefix/dagster-dags +``` + +## Example Usage + +**To add as a code location in an existing Dagster setup:** + +```bash +$ docker run -d \ + -p 4266:4266 \ + -e DAGSTER_CURRENT_IMAGE=ghcr.io/openclimatefix/dagster-dags \ + ghcr.io/openclimatefix/dagster-dags +``` -## Repository structure +```yaml +# $DAGSTER_HOME/workspace.yaml -Produced by `eza`: -```sh -eza --tree --git-ignore -F -I "*init*|test*.*|build" +load_from: + - grpc_server: + host: localhost + port: 4266 + location_name: "dagster-dags" # Name of the module ``` -```sh -./ -├── cloud_archives/ # Dagster definitions for cloud-stored archival datasets -│ └── nwp/ # Specifications for Numerical Weather Predication data sources -│ └── icon/ -├── constants.py # Values used across the project -├── dags_tests/ # Tests for the project -├── local_archives/ # Dagster defintions for 
locally-stored archival datasets -│ ├── nwp/ # Specifications for Numerical Weather Prediction data source -│ │ ├── cams/ -│ │ └── ecmwf/ -│ └── sat/ # Specifications for Satellite image data sources -├── managers/ # IO Managers for use across the project -├── pyproject.toml # The build configuration for the service -└── README.md +> [!Note] +> Setting the `DAGSTER_CURRENT_IMAGE` environment variable is necessary to tell Dagster +> to spawn jobs using the set container image. Since the Containerfile has all the +> required dependencies for the user code, it makes sense to set it to itself. + +**To deploy the entire Dagster multi-container stack:** + +```bash +$ docker compose -f infrastructure/docker-compose.yml up ``` -## Conventions +> [!Note] +> This will start a full Dagster setup with a web UI, a Postgres database, +> and a QueuedRunCoordinator. This might be overkill for some setups. + +## Documentation + +The repository is split into folders covering the basic concepts of Dagster: + +- Top-level [Definitions](https://docs.dagster.io/concepts/code-locations) defining the code location are defined in `src/dagster_dags/definitions.py` +- [Assets](https://docs.dagster.io/concepts/assets/software-defined-assets) are in `src/dagster_dags/assets` +- [Resources](https://docs.dagster.io/concepts/resources#resources) are in `src/dagster_dags/resources` + +They are then subdivided by module into data-type-specific folders. -The storage of data is handled automatically into locations defined by features of the data in question. The only configurable -part of the storage is the *Base Path* - the root point from which dagster will then handle the subpaths. 
The full storage paths -then take into account the following features: - - The *flavor* of the data (NWP, Satellite etc) - - The *Provider* of the data (CEDA, ECMWF etc) - - The *Region* the data covers (UK, EU etc) - - The *InitTime* the data refers to +## Development -Paths are then generated via`base/flavor/provider/region/inittime`. See managers for an example implementation. -For this to work, each asset must have an asset key prefix conforming to this structure `[flavor, provider, region]`. -The *Base Paths* are defined in `constants.py`. +To run a development Dagster server, install the required dependencies in a virtual environment, +activate it, and run the server: +```bash +$ cd src && dagster dev --module-name=dagster_dags +``` + +This should spawn a UI at `localhost:3000` where you can interact with the Dagster webserver. + +### Linting and static type checking -## Local Development +This project uses [MyPy](https://mypy.readthedocs.io/en/stable/) for static type checking +and [Ruff](https://docs.astral.sh/ruff/) for linting. +Installing the development dependencies makes them available in your virtual environment. -First, install your Dagster code location as a Python package. By using the --editable flag, pip will install your Python package in ["editable mode"](https://pip.pypa.io/en/latest/topics/local-project-installs/#editable-installs) so that as you develop, local code changes will automatically apply. +Use them via: ```bash -pip install -e ".[dev]" +$ python -m mypy . +$ python -m ruff check . ``` -Then, start the Dagster UI web server: +Be sure to do this periodically while developing to catch any errors early +and prevent headaches with the CI pipeline. It may seem like a hassle at first, +but it prevents accidental creation of a whole suite of bugs. 
+ +### Running the test suite + +Run the unittests with: ```bash -dagster dev --module-name=local_archives +$ python -m unittest discover -s tests ``` -Open [http://localhost:3000](http://localhost:3000) with your browser to see the project. +## Further Reading + +On running your own GRPC code server as a code location in Dagster: +- Dagster guide on [running a GRPC server](https://docs.dagster.io/concepts/code-locations/workspace-files#running-your-own-grpc-server). +- Creating a GRPC code server container as part of a [multi-container Dagster stack](https://docs.dagster.io/deployment/guides/docker#multi-container-docker-deployment). + + +--- + +## Contributing and community + +[![issues badge](https://img.shields.io/github/issues/openclimatefix/dagster-dags?color=FFAC5F)](https://github.com/openclimatefix/dagster-dags/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc) + +- PRs are welcome! See the [Organisation Profile](https://github.com/openclimatefix) for details on contributing +- Find out about our other projects in the [OCF Meta Repo](https://github.com/openclimatefix/ocf-meta-repo) +- Check out the [OCF blog](https://openclimatefix.org/blog) for updates +- Follow OCF on [LinkedIn](https://uk.linkedin.com/company/open-climate-fix) + + + + + + + -Add your assets to the relevant code location. See [Repository Structure](#repository-structure) for details. 
+ +--- -## Useful links +*Part of the [Open Climate Fix](https://github.com/orgs/openclimatefix/people) community.* -- [Detecting existing assets](https://github.com/dagster-io/dagster/discussions/17847) +[![OCF Logo](https://cdn.prod.website-files.com/62d92550f6774db58d441cca/6324a2038936ecda71599a8b_OCF_Logo_black_trans.png)](https://openclimatefix.org) diff --git a/cloud_archives/__init__.py b/cloud_archives/__init__.py deleted file mode 100644 index e6bd862..0000000 --- a/cloud_archives/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -import os - -import dagster as dg - -from .nwp import icon -from .pv import passiv - -defs = dg.Definitions( - assets=[*icon.all_assets, *passiv.all_assets], - jobs=[*icon.all_jobs], - schedules=[dg.build_schedule_from_partitioned_job(job) for job in icon.all_jobs], -) diff --git a/cloud_archives/nwp/icon/__init__.py b/cloud_archives/nwp/icon/__init__.py deleted file mode 100644 index b7eb142..0000000 --- a/cloud_archives/nwp/icon/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -import dagster as dg - -from . 
import icon_eu, icon_global - -global_assets = dg.load_assets_from_modules( - modules=[icon_global], - group_name="icon_global", -) - -eu_assets = dg.load_assets_from_modules( - modules=[icon_eu], - group_name="icon_eu", -) - -all_assets: list[dg.AssetsDefinition] = [*global_assets, *eu_assets] - -all_jobs: list[dg.JobDefinition] = [ - icon_global.archive_icon_global_sl_job, - icon_global.archive_icon_global_ml_job, - icon_eu.archive_icon_europe_sl_job, - icon_eu.archive_icon_europe_ml_job, -] diff --git a/cloud_archives/nwp/icon/_ops.py b/cloud_archives/nwp/icon/_ops.py deleted file mode 100644 index 3905432..0000000 --- a/cloud_archives/nwp/icon/_ops.py +++ /dev/null @@ -1,69 +0,0 @@ -import dagster as dg - -from cloud_archives.ops.generic import ( - AssetMaterializationConfig, - log_asset_materialization, - raise_exception, -) -from cloud_archives.ops.huggingface import ( - HFFileConfig, - get_hf_zarr_file_metadata, -) -from cloud_archives.ops.kbatch import ( - NWPConsumerConfig, - define_kbatch_consumer_job, - kbatch_consumer_graph, -) - - -def create_kbatch_huggingface_graph_config( - nwp_config: NWPConsumerConfig, - hf_config: HFFileConfig, - am_config: AssetMaterializationConfig, -) -> dg.RunConfig: - """Mapping from Config to RunConfig for the corresponding graph. - - Args: - nwp_config: Configuration for the nwp consumer. - hf_config: Configuration for huggingface. - am_config: Configuration for asset materialisation. - - Returns: - The RunConfig for the graph. - """ - return dg.RunConfig( - ops={ - kbatch_consumer_graph.name: { - "ops": {define_kbatch_consumer_job.name: nwp_config}, - }, - get_hf_zarr_file_metadata.name: hf_config, - get_hf_zarr_file_metadata.name + "_2": hf_config, - log_asset_materialization.name: am_config, - log_asset_materialization.name + "_2": am_config, - }, - ) - - -@dg.graph -def kbatch_huggingface_graph() -> dict[str, dg.MetadataValue]: - """Op graph for archiving to huggingface using nwp-consumer in kbatch. 
- - Note: Some of the ops within the graphs require the defining of - run configuration. - - Returns: - The file metadata for the zarr file that was archived. - """ - # First check to see if the file in question already exists - file_metadata, no_file_at_start = get_hf_zarr_file_metadata() - # If the file exists, log the materialization - log_asset_materialization(file_metadata) - # If the file does not exist, create a kbatch job to archive it - job_name = kbatch_consumer_graph(no_file_at_start) - file_metadata, no_file_after_job = get_hf_zarr_file_metadata(job_name) - # Now the file should exist, so log the materialization - log_asset_materialization(file_metadata) - # Raise an exception if it doesn't exist at this point - raise_exception(no_file_after_job) - - return file_metadata diff --git a/cloud_archives/nwp/icon/icon_eu.py b/cloud_archives/nwp/icon/icon_eu.py deleted file mode 100644 index 928ade6..0000000 --- a/cloud_archives/nwp/icon/icon_eu.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Pipeline for the processing of eu ICON data.""" -import os - -import dagster as dg - -from cloud_archives.ops.huggingface import ( - HFFileConfig, -) -from cloud_archives.ops.kbatch import ( - NWPConsumerConfig, -) - -from ._ops import ( - AssetMaterializationConfig, - create_kbatch_huggingface_graph_config, - kbatch_huggingface_graph, -) - -# Define the ICON europe zarr archive as a source asset -icon_europe_zarr_archive = dg.SourceAsset( - key=["nwp", "icon", "europe", "zarr_archive"], - partitions_def=dg.TimeWindowPartitionsDefinition( - fmt="%Y-%m-%d|%H:%M", - start="2024-01-31|00:00", - cron_schedule="0 0/6 * * *", - ), -) - -# Define the job to materialize the ICON europe zarr archive -archive_icon_europe_sl_job = kbatch_huggingface_graph.to_job( - name="archive_icon_europe_sl_job", - partitions_def=icon_europe_zarr_archive.partitions_def, - config=create_kbatch_huggingface_graph_config( - nwp_config=NWPConsumerConfig( - source="icon", - sink="huggingface", - 
docker_tag="refactor-service-loop", - zdir="single-level/data", - env={ - "ICON_MODEL": "europe", - "ICON_PARAMETER_GROUP": "single-level", - "HUGGINGFACE_TOKEN": os.getenv("HUGGINGFACE_TOKEN", default="not-set"), - "HUGGINGFACE_REPO_ID": "sol-ocf/test-dwd-europe", - }, - ), - hf_config=HFFileConfig(hf_repo_id="sol-ocf/test-dwd-europe"), - am_config=AssetMaterializationConfig( - asset_key=list(icon_europe_zarr_archive.key.path), - asset_description="Europe ICON Zarr Archive stored in huggingface.", - ), - ), -) - - -archive_icon_europe_ml_job = kbatch_huggingface_graph.to_job( - name="archive_icon_europe_ml_job", - partitions_def=icon_europe_zarr_archive.partitions_def, - config=create_kbatch_huggingface_graph_config( - nwp_config=NWPConsumerConfig( - source="icon", - sink="huggingface", - docker_tag="main", - zdir="multi-level/data", - env={ - "ICON_MODEL": "europe", - "ICON_PARAMETER_GROUP": "multi-level", - "HUGGINGFACE_TOKEN": os.getenv("HUGGINGFACE_TOKEN", default="not-set"), - "HUGGINGFACE_REPO_ID": "sol-ocf/test-dwd-europe", - }, - ), - hf_config=HFFileConfig(hf_repo_id="sol-ocf/test-dwd-europe"), - am_config=AssetMaterializationConfig( - asset_key=list(icon_europe_zarr_archive.key.path), - asset_description="Europe ICON Zarr Archive stored in huggingface.", - ), - ), -) - - - diff --git a/cloud_archives/nwp/icon/icon_global.py b/cloud_archives/nwp/icon/icon_global.py deleted file mode 100644 index 6bd1313..0000000 --- a/cloud_archives/nwp/icon/icon_global.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Pipeline for the processing of global ICON data.""" -import os - -import dagster as dg - -from cloud_archives.ops.huggingface import ( - HFFileConfig, -) -from cloud_archives.ops.kbatch import ( - NWPConsumerConfig, -) - -from ._ops import ( - AssetMaterializationConfig, - create_kbatch_huggingface_graph_config, - kbatch_huggingface_graph, -) - -# Define the ICON global zarr archive as a source asset -icon_global_zarr_archive = dg.SourceAsset( - key=["nwp", "icon", 
"global", "zarr_archive"], - partitions_def=dg.TimeWindowPartitionsDefinition( - fmt="%Y-%m-%d|%H:%M", - start="2024-01-31|00:00", - cron_schedule="0 0/6 * * *", - ), -) - -archive_icon_global_sl_job = kbatch_huggingface_graph.to_job( - name="archive_icon_global_sl_job", - partitions_def=icon_global_zarr_archive.partitions_def, - config=create_kbatch_huggingface_graph_config( - nwp_config=NWPConsumerConfig( - source="icon", - sink="huggingface", - docker_tag="refactor-service-loop", - zdir="single-level/data", - env={ - "ICON_MODEL": "global", - "ICON_PARAMETER_GROUP": "single-level", - "HUGGINGFACE_TOKEN": os.getenv("HUGGINGFACE_TOKEN", default="not-set"), - "HUGGINGFACE_REPO_ID": "sol-ocf/test-dwd-global", - }, - ), - hf_config=HFFileConfig(hf_repo_id="sol-ocf/test-dwd-global"), - am_config=AssetMaterializationConfig( - asset_key=list(icon_global_zarr_archive.key.path), - asset_description="Global ICON Zarr Archive stored in huggingface.", - ), - ), -) - - -archive_icon_global_ml_job = kbatch_huggingface_graph.to_job( - name="archive_icon_global_ml_job", - partitions_def=icon_global_zarr_archive.partitions_def, - config=create_kbatch_huggingface_graph_config( - nwp_config=NWPConsumerConfig( - source="icon", - sink="huggingface", - docker_tag="main", - zdir="multi-level/data", - env={ - "ICON_MODEL": "global", - "ICON_PARAMETER_GROUP": "multi-level", - "HUGGINGFACE_TOKEN": os.getenv("HUGGINGFACE_TOKEN", default="not-set"), - "HUGGINGFACE_REPO_ID": "sol-ocf/test-dwd-global", - }, - ), - hf_config=HFFileConfig(hf_repo_id="sol-ocf/test-dwd-global"), - am_config=AssetMaterializationConfig( - asset_key=list(icon_global_zarr_archive.key.path), - asset_description="Global ICON Zarr Archive stored in huggingface.", - ), - ), -) diff --git a/cloud_archives/ops/__init__.py b/cloud_archives/ops/__init__.py deleted file mode 100644 index 4b2bc6d..0000000 --- a/cloud_archives/ops/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from . 
import ( - generic, - huggingface, - kbatch, -) - -__all__ = [ - "generic", - "huggingface", - "kbatch", -] diff --git a/cloud_archives/ops/generic.py b/cloud_archives/ops/generic.py deleted file mode 100644 index 83ab871..0000000 --- a/cloud_archives/ops/generic.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Defines implementation-agnostic ops for generic graph-building.""" - - -import dagster as dg -from pydantic import Field - - -class AssetMaterializationConfig(dg.Config): - """Configuration for asset materialisation. - - Builds upon the dagster Config type, allowing for the configuration to be - passed to an Op in a dagster pipeline. Default values of an ellipsis (...) - are used to indicate that the value must be provided. - """ - - asset_key: list[str] = Field( - description="The key of the asset to materialise.", - default=..., - ) - asset_description: str | None = Field( - description="A description of the asset.", - default=None, - ) - - -@dg.op -def log_asset_materialization( - context: dg.OpExecutionContext, - config: AssetMaterializationConfig, - metadata: dict[str, dg.MetadataValue], -) -> None: - """Materialises an asset according to the config.""" - context.log_event( - dg.AssetMaterialization( - asset_key=config.asset_key, - description=config.asset_description, - partition=context.partition_key if context.has_partition_key else None, - metadata=metadata, - ), - ) - - -@dg.op( - ins={"depends_on": dg.In(dg.Nothing)}, -) -def raise_exception() -> None: - """Dagster Op that raises an exception. - - This Op is used to mark a branch in a graph as being undesirable. - Defines a "Nothing" input to allow for the op to have upstream dependencies - in a graph without the passing of data. 
- """ - raise Exception("Reached exception Op.") diff --git a/cloud_archives/ops/huggingface.py b/cloud_archives/ops/huggingface.py deleted file mode 100644 index 92c3131..0000000 --- a/cloud_archives/ops/huggingface.py +++ /dev/null @@ -1,123 +0,0 @@ -"""Dagster ops and resources for interacting with huggingface datasets.""" -import datetime as dt -import os - -import dagster as dg -from huggingface_hub import hf_hub_url -from huggingface_hub.hf_api import HfApi, RepoFile -from pydantic import Field - - -class HFFileConfig(dg.Config): - """Configuration for huggingface. - - Builds upon the dagster Config type, allowing for the configuration to be - passed to an Op in a dagster pipeline. - - Default values of an ellipsis (...) are used to indicate that the value - must be provided when the configuration object is instantiated. - """ - - hf_repo_id: str = Field( - description="The id of the huggingface repo to archive to.", - default=..., - ) - file_init_time: str = Field( - description="The initialisation time of the data of interest.", - default=dt.datetime.now(dt.UTC) - .replace( - hour=0, - minute=0, - second=0, - microsecond=0, - ) - .strftime("%Y-%m-%d|%H:%M"), - ) - - -@dg.op( - ins={"depends_on": dg.In(dg.Nothing)}, - out={ - "file_metadata": dg.Out(dict[str, dg.MetadataValue], is_required=False), - "no_such_file": dg.Out(bool, is_required=False), - }, -) -def get_hf_zarr_file_metadata( - context: dg.OpExecutionContext, - config: HFFileConfig, -) -> tuple[dict[str, dg.MetadataValue], bool]: - """Dagster op to get metadata for a zarr file in a huggingface dataset. - - Assumes the zarr files are stored in a folder structure of the form: - data/{year}/{month}/{day} - and that the names of the zarr files contain the initialisation time: - {year}{month}{day}T{hour}{minute}.zarr.zip - where the time parts correspond to the initialisation time of the file. 
- - Defines a "Nothing" input to allow for the op to have upstream dependencies - in a graph without the passing of data, as well as two outputs, - - file_metadata: The metadata for the zarr file that was found. - - no_such_file: A signal that no file was found for the given init time. - This is done instead of simply raising an error when no files are found, - as it allows for a branching configuration: downstream Ops can decide - how to handle the case where either one or none files are found. - - Args: - context: The dagster context. - config: Configuration for where to look on huggingface. - - Returns: - Either the metadata for the zarr file that was found, or a signal that - no file was found for the given init time. - """ - # Get the init time from the config or the partition key - itstring: str = config.file_init_time - if context.has_partition_key: - itstring = context.partition_key - it: dt.datetime = dt.datetime.strptime(itstring, "%Y-%m-%d|%H:%M").replace(tzinfo=dt.UTC) - - api = HfApi(token=os.getenv("HUGGINGFACE_TOKEN", default=None)) - # Check if there is an init time folder - if ( - len( - api.get_paths_info( - repo_id=config.hf_repo_id, - repo_type="dataset", - paths=f"data/{it.strftime('%Y/%m/%d')}", - ), - ) - == 0 - ): - files: list[RepoFile] = [] - else: - # List all files in the repo folder for the given init time's date - # and filter for zarr files named according to the init time - files: list[RepoFile] = [ - p - for p in api.list_repo_tree( - repo_id=config.hf_repo_id, - repo_type="dataset", - path_in_repo=f"data/{it.strftime('%Y/%m/%d')}", - ) - if isinstance(p, RepoFile) - and p.path.endswith(".zarr.zip") - and f"{it.strftime('%Y%m%dT%H%M')}" in p.path - ] - - if len(files) == 0: - context.log.info("No files found in the repo for the given init time.") - yield dg.Output(True, "no_such_file") - else: - rf: RepoFile = next(iter(files)) - context.log.info(f"Found file {rf} in repo {config.hf_repo_id}.") - # Map RepoFile object to a dagster 
metadata dict - metadata: dict[str, dg.MetadataValue] = { - "file": dg.MetadataValue.path(rf.path), - "url": dg.MetadataValue.url( - hf_hub_url(repo_id=config.hf_repo_id, repo_type="dataset", filename=rf.path), - ), - "size (bytes)": dg.MetadataValue.int(rf.size), - "blob ID": dg.MetadataValue.text(rf.blob_id), - } - yield dg.Output(metadata, "file_metadata") - diff --git a/cloud_archives/ops/kbatch.py b/cloud_archives/ops/kbatch.py deleted file mode 100644 index 973d492..0000000 --- a/cloud_archives/ops/kbatch.py +++ /dev/null @@ -1,406 +0,0 @@ -"""Dagster operations for running kbatch jobs. - -Define operations and helper functions for running the nwp-consumer -as a kbatch job on a kubernetes cluster. The operations are designed -to be run as part of a dagster pipeline. - -The key method is `kbatch_consumer_graph`, which combines a selection -of operations into a graph that configures, runs, and tracks a kbatch -nwp-consumer job, streaming logs back to stdout and cleaning up -resources on error or success. -""" - -import datetime as dt -import time -from types import GeneratorType - -import dagster as dg -import httpx -import kbatch._core as kbc -from kbatch._types import Job -from pydantic import Field - -# --- CONSTANTS --- # - -# Set the kbatch url and token arguments to none in all calls to kbatch -# * Don't ask me why, but setting them as one would expect manually -# (through env vars) in these parameters doesn't work. Instead, force -# the kbatch core to find them from the environment by setting them -# to None. -KBATCH_DICT = { - "kbatch_url": None, - "token": None, -} - - -# --- CLASSES AND METHODS --- # - - -class KbatchJobException(Exception): - """Exception raised when a kbatch job fails. - - Contains the name of the job that failed alongside the message. - Useful for enabling further handling of the job failure, e.g. - cleaning up of resources. 
- """ - - def __init__(self, message: str, job_name: str): - super().__init__(message) - self.job_name = job_name - - -@dg.failure_hook -def kbatch_job_failure_hook(context: dg.HookContext) -> None: - """Failure hook that deletes a kbatch job on exception. - - Can be applied to individual ops via - some_kbatch_op.with_failure_hook(kbatch_job_failure_hook)() - or to all ops in a job via - @dg.job(hooks={kbatch_job_failure_hook}) - - Args: - context: The dagster context within which the hook is operating. - """ - op_exception = context.op_exception - - if isinstance(op_exception, KbatchJobException): - job_name = op_exception.job_name - dg.get_dagster_logger().info(f"Deleting kbatch job {job_name}.") - kbc.delete_job(resource_name=job_name, **KBATCH_DICT) - - -def wait_for_status_change(old_status: str, job_name: str, timeout: int = 60 * 20) -> str: - """Wait for the status of a kbatch job to change from old_status. - - The amount of time to wait is modified by the timeout parameter. - - Args: - old_status: The status to wait for the job to change from. - job_name: The name of the job to check. - timeout: The maximum time to wait for the status to change. - - Returns: - The new status of the job. - """ - time_spent: int = 0 - while time_spent < timeout: - increment_secs: int = 30 - time.sleep(increment_secs) - time_spent += increment_secs - - # Get the status of the pod in the job - # * This can fail and be retried within the timeout limit so - # catch a number of recoverable errors. 
- try: - pods_info: list[dict] = kbc.list_pods(job_name=job_name, **KBATCH_DICT)["items"] - except httpx.ConnectError as e: - if "Temporary failure in name resolution" in str(e): - dg.get_dagster_logger().debug(f"Name resolution error, retrying: {e}") - continue - else: - raise e - except (httpx.ReadTimeout, httpx.ConnectTimeout) as e: - dg.get_dagster_logger().debug(f"Timed out listing pods, retrying: {e}") - continue - except httpx.HTTPStatusError as e: - if "503" in str(e): - dg.get_dagster_logger().debug(f"Service unavailable, retrying: {e}") - continue - else: - raise e - except Exception as e: - raise e - - if len(pods_info) == 0: - continue - - new_status: str = pods_info[0]["status"]["phase"] - - # Exit if status has changed - if new_status != old_status: - dg.get_dagster_logger().info( - f"Job {job_name} is no longer {old_status}, status: {new_status}.", - ) - if new_status == "Failed": - condition: str = pods_info[0]["status"]["container_statuses"][0]["state"] - dg.get_dagster_logger().error(f"Condition: {condition}") - return new_status - - # Log if still waiting every 10 minutes - if time_spent % (10 * 60) == 0: - dg.get_dagster_logger().debug( - f"Kbatch job {job_name} still {old_status} after {int(time_spent / 60)} mins.", - ) - - # Raise exception if timed out - if time_spent >= timeout: - dg.get_dagster_logger().info(pods_info[0]["status"]) - raise KbatchJobException( - message=f"Timed out waiting for status '{old_status}' to change.", - job_name=job_name, - ) - - return new_status - - -# --- OPS --- # - - -class NWPConsumerConfig(dg.Config): - """Configuration object for the nwp consumer. - - Defines the configuration for the running of the nwp-consumer docker image. - Builds upon the dagster Config type, allowing for the configuration to be - passed to an Op in a dagster pipeline. - - Default values of an ellipsis (...) are used to indicate that the value - must be provided when the configuration object is instantiated. 
- """ - - docker_tag: str = Field( - description="The tag of the nwp-consumer docker image to use.", - default="0.2.1", - ) - source: str = Field( - description="The source of the data to consume.", - default=..., - ) - sink: str = Field( - description="The sink to write the data to.", - default=..., - ) - zdir: str = Field( - description="The directory to write the data to.", - default="data", - ) - env: dict[str, str] = Field( - description="Environment variables to pass to the nwp-consumer.", - default_factory=lambda: {}, - ) - inittime: str = Field( - description="The initialisation time of the nwp data to consume.", - default=dt.datetime.now(dt.UTC) - .replace(hour=0, minute=0, second=0, microsecond=0) - .strftime("%Y-%m-%d|%H:%M"), - pattern=r"^\d{4}-\d{2}-\d{2}\|\d{2}:\d{2}$", - ) - no_rename_vars: bool = Field( - description="Don't rename variables.", - default=True, - ) - no_variable_dimension: bool = Field( - description="Don't specify variable dimensions.", - default=True, - ) - - -@dg.op( - ins={"depends_on": dg.In(dg.Nothing)}, -) -def define_kbatch_consumer_job( - context: dg.OpExecutionContext, - config: NWPConsumerConfig, -) -> Job: - """Define a kbatch job object to run the nwp-consumer. - - Builds a kbatch job object specifying the parameters required - to run the nwp-consumer docker image according to the - input configuration object. - - Args: - context: The dagster context. - config: Configuration for the nwp-consumer. - - Returns: - The kbatch job definition object. 
- """ - # Get the init time either from config or partition key - itstring = config.inittime - if context.has_partition_key: - itstring = context.partition_key - it = dt.datetime.strptime(itstring, "%Y-%m-%d|%H:%M").replace(tzinfo=dt.UTC) - - args = [ - "consume", - f"--source={config.source}", - f"--sink={config.sink}", - "--rsink=local", - "--rdir=/tmp/nwpc/raw", - f"--zdir={config.zdir}", - f"--from={it.strftime('%Y-%m-%dT%H:%M')}", - ] - - if config.no_rename_vars: - args = [*args, "--no-rename-vars"] - if config.no_variable_dimension: - args = [*args, "--no-variable-dim"] - - context.log.info(f"Running nwp-consumer with command: {args}") - - job = Job( - name=f"{config.source}-{config.sink}-backfill", - image=f"ghcr.io/openclimatefix/nwp-consumer:{config.docker_tag}", - env=config.env, - args=args, - ) - - return job - - -@dg.op -def submit_kbatch_job(context: dg.OpExecutionContext, job: Job) -> str: - """Submit a kbatch job object to the kbatch server. - - Requires one of the two following configurations set: - - - the appropriate kbatch token and url set in the environment variables - KBATCH_URL and JUPYTERHUB_API_TOKEN - - a `~/.config/kbatch/config.json file containing a dictionary with the - keys "kbatch_url" and "token". - - This can be generated using the kbatch CLI with the command: - kbatch configure --kbatch_url --token - - Defines a "Nothing" input to allow for the op to have upstream dependencies - in a graph without the passing of data. - - Args: - context: The dagster context. - job: A kbatch Job object defining the job to submit. - - Returns: - The name of the created job. 
- """ - # Request large pod sizes - profile: dict = { - "resources": { - "limits": { - "cpu": "8", - "memory": "64G", - }, - "requests": { - "cpu": "7.0", - "memory": "56G", - }, - }, - } - # Submit the job using kbatch core - result = kbc.submit_job(job=job, profile=profile, **KBATCH_DICT) - # Extract the job name from the result - job_name: str = result["metadata"]["name"] - context.log.info(f"Kbatch job {job_name} requested.") - - return job_name - - -@dg.op -def follow_kbatch_job( - context: dg.OpExecutionContext, - job_name: str, -) -> str: - """Blocking function that follows the status of a kbatch job. - - Waits for a job to start running, then follows the logs, streaming - back to stdout. Checks for failures within the logs and raises an - exception if the job fails. - - This function assumes the job is only running on a single pod. - On a partial read error the function will re-try the read. - - Args: - context: The dagster context. - job_name: The name of the job. - - Returns: - The name of the job. - """ - context.log.info("Assessing status of kbatch job.") - - # Pods take a short while to be provisioned - status: str = wait_for_status_change(old_status="Pending", job_name=job_name) - # If the pod fails to be provisioned there will be no logs to view. - # The condition will be printed to the logs (e.g. 
ImagePullBackoff) - if status == "Failed": - raise KbatchJobException( - message=f"Job {job_name} failed, see logs.", - job_name=job_name, - ) - - # Otherwise, wait up to timout for the pod to finish running - pod_name: str = kbc.list_pods(job_name=job_name, **KBATCH_DICT)["items"][0]["metadata"]["name"] - status = wait_for_status_change(old_status="Running", job_name=job_name, timeout=60 * 60 * 24) - - # Get the logs from the pod - - total_attempts: int = 0 - while total_attempts < 5: - try: - logs: str = kbc._logs( - pod_name=pod_name, - stream=False, - read_timeout=60 * 6, - **KBATCH_DICT, - ) - # Kbatch/Httpx seem keen to return generators even when "stream" is False - if isinstance(logs, GeneratorType): - for log in logs: - print(log) # noqa: T201 - else: - for line in logs.split("\n"): - print(line) # noqa: T201 - break - except (httpx.RemoteProtocolError, httpx.HTTPStatusError): - time.sleep(20) - total_attempts += 1 - continue - - context.log().warn("Failed to read logs after 3 attempts.") - - pods_info: list[dict] = kbc.list_pods(job_name=job_name, **KBATCH_DICT)["items"] - pod_status = pods_info[0]["status"]["phase"] - context.log.info(f"Captured all logs for job {job_name}; status '{pod_status}'.") - - if status == "Failed": - raise KbatchJobException( - message=f"Job {job_name} failed, see logs.", - job_name=job_name, - ) - - return job_name - - -@dg.op( - out={"job_name": dg.Out(str)}, -) -def delete_kbatch_job(job_name: str) -> str: - """Deletes a kbatch job. - - Args: - job_name: The name of the job. Must be a dagster op output. - """ - dg.get_dagster_logger().info(f"Deleting kbatch job {job_name}.") - kbc.delete_job(resource_name=job_name, **KBATCH_DICT) - return job_name - - -# --- GRAPHS --- # - - -@dg.graph -def kbatch_consumer_graph(depends_on: dg.Nothing) -> str: - """Graph for running the nwp-consumer as a kbatch job. 
- - Defines the set of operations that configure, run, and track a kbatch - nwp-consumer job, streaming logs back to stdout and deleting the job - upon completion. Any ops that manage or interact with a running kbatch - job also include a hook that deletes the job on exceptions in the graph. - - Implements a Nothing input to allow for the graph to have upstream - dependencies in a pipeline without the passing of data. - """ - job: Job = define_kbatch_consumer_job(depends_on=depends_on) - job_name: str = submit_kbatch_job.with_hooks({kbatch_job_failure_hook})(job=job) - job_name = follow_kbatch_job.with_hooks({kbatch_job_failure_hook})(job_name=job_name) - job_name = delete_kbatch_job(job_name=job_name) - - return job_name diff --git a/cloud_archives/pv/passiv/__init__.py b/cloud_archives/pv/passiv/__init__.py deleted file mode 100644 index dc1f2f5..0000000 --- a/cloud_archives/pv/passiv/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -import dagster as dg - -from . import passiv_monthly, passiv_year - -global_assets = dg.load_assets_from_modules( - modules=[passiv_monthly, passiv_year], - group_name="pv_passiv", -) - -all_assets: list[dg.AssetsDefinition] = [*global_assets] - -# TODO do we need to define jobs for these assets? 
- diff --git a/cloud_archives/pv/passiv/filenames.py b/cloud_archives/pv/passiv/filenames.py deleted file mode 100644 index ece464a..0000000 --- a/cloud_archives/pv/passiv/filenames.py +++ /dev/null @@ -1,9 +0,0 @@ -from datetime import datetime - - -def get_monthly_hf_file_name(date: datetime, period: int = 5): - return f"data/{date.strftime('%Y/%m')}/{date.strftime('%Y%m')}_{period}min.parquet" - - -def get_yearly_hf_file_name(date: datetime, period: int = 5): - return f"data/{date.strftime('%Y')}/{date.strftime('%Y')}_{period}min.parquet" diff --git a/constants.py b/constants.py deleted file mode 100644 index c214662..0000000 --- a/constants.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Defines constant values for the nwp deployment.""" -import dataclasses as dc - - -@dc.dataclass -class StorageLocations: - """Defines the storage locations for a given environment.""" - - RAW_FOLDER: str - NWP_ZARR_FOLDER: str - STATIC_ZARR_FOLDER: str - POINT_ZARR_FOLDER: str - SAT_ZARR_FOLDER: str - EPHEMERAL_FOLDER: str - -# Defines the storage locations for each environment -LOCATIONS_BY_ENVIRONMENT: dict[str, StorageLocations] = { - "leo": StorageLocations( - RAW_FOLDER="/mnt/storage_c/raw", - NWP_ZARR_FOLDER="/mnt/storage_b", - STATIC_ZARR_FOLDER="/mnt/storage_a", - POINT_ZARR_FOLDER="/mnt/storage_a", - SAT_ZARR_FOLDER="/mnt/storage_a", - EPHEMERAL_FOLDER="/mnt/storage_c/ephemeral", - ), - "local": StorageLocations( - RAW_FOLDER="/tmp/raw", - NWP_ZARR_FOLDER="/tmp/zarr", - STATIC_ZARR_FOLDER="/tmp/zarr", - POINT_ZARR_FOLDER="/tmp/zarr", - SAT_ZARR_FOLDER="/tmp/zarr", - EPHEMERAL_FOLDER="/tmp/ephemeral", - ), -} diff --git a/containers/gfs/download_combine_gfs.py b/containers/gfs/download_combine_gfs.py index f4504e3..91d81dd 100644 --- a/containers/gfs/download_combine_gfs.py +++ b/containers/gfs/download_combine_gfs.py @@ -1,7 +1,8 @@ +"""Download and combine GFS data for a given date and run.""" + import argparse import dataclasses import datetime as dt -import functools import 
logging import os import pathlib @@ -22,7 +23,12 @@ logging.basicConfig( level=logging.DEBUG, stream=sys.stdout, - format='{"time": "%(asctime)s", "name": "%(name)s", "level": "%(levelname)s", "message": "%(message)s"}', + format=" ".join(( + '{"time": "%(asctime)s", ', + '"name": "%(name)s", ', + '"level": "%(levelname)s", ', + '"message": "%(message)s"}', + )), ) for logger in ["requests", "urllib3", "cfgrib.dataset"]: logging.getLogger(logger).setLevel(logging.WARNING) @@ -49,7 +55,7 @@ def download_url(url: str, folder: str) -> str | None: attempts: int = 1 while attempts < 6: try: - r = requests.get(url.strip(), allow_redirects=True, stream=True) + r = requests.get(url.strip(), allow_redirects=True, stream=True, timeout=60*60) if r.status_code == requests.codes.ok: with open(filename, "wb") as dest: for chunk in r.iter_content(chunk_size=1024): @@ -62,6 +68,7 @@ def download_url(url: str, folder: str) -> str | None: except Exception as e: log.error(f"Failed to download {url}: {e}") return None + return None def find_file_names(it: dt.datetime, config: Config) -> list[str]: """Find file names for the given init time.""" @@ -108,7 +115,9 @@ def convert_file(file: str, outfolder: str) -> str | None: # Update name of each data variable based off the attribute GRIB_stepType for i, d in enumerate(surface): for variable in d.data_vars: - d = d.rename({variable: f"{variable}_surface_{d[f'{variable}'].attrs['GRIB_stepType']}"}) + d = d.rename({ + variable: f"{variable}_surface_{d[f'{variable}'].attrs['GRIB_stepType']}", + }) surface[i] = d for i, d in enumerate(heightAboveGround): for variable in d.data_vars: @@ -168,7 +177,7 @@ def run(path: str, config: Config, date: dt.date, run: str) -> str: if len(urls) > cpu_count(): pool = Pool(cpu_count()) results = pool.starmap( - download_url, + download_url, # type: ignore [(url, f"{path}/{date:%Y%m%d}/{run}/") for url in urls], ) pool.close() @@ -195,7 +204,7 @@ def run(path: str, config: Config, date: dt.date, run: str) -> 
str: if len(run_files) > cpu_count(): pool = Pool(cpu_count()) dataset_paths = pool.starmap( - convert_file, + convert_file, # type: ignore [(file, path + "/.work") for file in run_files], ) pool.close() @@ -238,7 +247,7 @@ def run(path: str, config: Config, date: dt.date, run: str) -> str: ) parser.add_argument( "--path", - default="/tmp/gfs", + default="/tmp/gfs", # noqa: S108 help="Path to save the data", ) parser.add_argument( diff --git a/containers/icon/download_combine_upload_icon.py b/containers/icon/download_combine_upload_icon.py index a64a299..b7421f2 100644 --- a/containers/icon/download_combine_upload_icon.py +++ b/containers/icon/download_combine_upload_icon.py @@ -9,8 +9,10 @@ For ease the script is also packaged as a docker container: - $ docker run -e HF_TOKEN= -v /some/path:/tmp/nwp ghcr.io/openclimatefix/icon-etl:main --help - + $ docker run \ + -e HF_TOKEN= \ + -v /some/path:/tmp/nwp \ + ghcr.io/openclimatefix/icon-etl:main --help Datasets ======== @@ -26,11 +28,14 @@ * step (step) timedelta64[ns] 00:00:00 ... 5 days 00:00:00 time datetime64[ns] ... valid_time (step) datetime64[ns] dask.array - Data variables: (3/60) - alb_rad (step, latitude, longitude) float32 dask.array + Data variables: (3/60) + alb_rad (step, latitude, longitude) + float32 dask.array ... ... 
- v (step, isobaricInhPa, latitude, longitude) float32 dask.array - z0 (step, latitude, longitude) float32 dask.array + v (step, isobaricInhPa, latitude, longitude) + float32 dask.array + z0 (step, latitude, longitude) + float32 dask.array """ import argparse @@ -55,12 +60,15 @@ logging.basicConfig( level=logging.DEBUG, stream=sys.stdout, - format="{" +\ - '"message": "%(message)s", ' +\ - '"severity": "%(levelname)s", "timestamp": "%(asctime)s.%(msecs)03dZ", ' +\ - '"logging.googleapis.com/labels": {"python_logger": "%(name)s"}, ' +\ - '"logging.googleapis.com/sourceLocation": {"file": "%(filename)s", "line": %(lineno)d, "function": "%(funcName)s"}' +\ + format="".join(( + "{", + '"message": "%(message)s", ', + '"severity": "%(levelname)s", "timestamp": "%(asctime)s.%(msecs)03dZ", ', + '"logging.googleapis.com/labels": {"python_logger": "%(name)s"}, ', + '"logging.googleapis.com/sourceLocation": ', + '{"file": "%(filename)s", "line": %(lineno)d, "function": "%(funcName)s"}', "}", + )), datefmt="%Y-%m-%dT%H:%M:%S", ) logging.getLogger("requests").setLevel(logging.WARNING) @@ -331,7 +339,8 @@ def find_file_name( to the download_extract_files function if the file does not exist it will simply not be downloaded. 
""" - # New data comes in 3 ish hours after the run time, ensure the script is running with a decent buffer + # New data comes in 3 ish hours after the run time, + # ensure the script is running with a decent buffer date_string = date.strftime("%Y%m%d") + run_string if (len(config.vars_2d) == 0) and (len(config.vars_3d) == 0): raise ValueError("You need to specify at least one 2D or one 3D variable") @@ -379,7 +388,7 @@ def download_extract_url(url: str, folder: str) -> str | None: return filename # If the file does not exist, attempt to download and extract it else: - r = requests.get(url, stream=True) + r = requests.get(url, stream=True, timeout=60*60) if r.status_code == requests.codes.ok: with r.raw as source, open(filename, "wb") as dest: dest.write(bz2.decompress(source.read())) @@ -582,8 +591,13 @@ def run(path: str, config: Config, run: str, date: dt.date) -> None: formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("area", choices=["eu", "global"], help="Area to download data for") - parser.add_argument("--path", default="/tmp/nwp", help="Folder in which to save files") - parser.add_argument("--run", default="all", choices=["00", "06", "12", "18", "all"], help="Run time to download") + parser.add_argument("--path", default="/tmp/nwp", help="Folder in which to save files") # noqa: S108 + parser.add_argument( + "--run", + default="all", + choices=["00", "06", "12", "18", "all"], + help="Run time to download", + ) parser.add_argument("--rm", action="store_true", help="Remove files on completion") parser.add_argument( "--date", @@ -602,14 +616,14 @@ def run(path: str, config: Config, run: str, date: dt.date) -> None: "The script is set to remove downloaded files. " "If all your files are in the same 'run' folder, " "you will lose data before it has a chance to be processed. " - "Consider running the script without the --rm flag." 
+ "Consider running the script without the --rm flag.", ) path: str = f"{args.path}/{args.area}" if args.run == "all": runs: list[str] = ["00", "06", "12", "18"] else: - runs: list[str] = [args.run] + runs = [args.run] # Cleanup any leftover files in path for hour in runs: if args.rm: diff --git a/containers/sat/download_process_sat.py b/containers/sat/download_process_sat.py index deaa67b..a9e8bb8 100644 --- a/containers/sat/download_process_sat.py +++ b/containers/sat/download_process_sat.py @@ -13,22 +13,20 @@ import sys import traceback from collections.abc import Iterator -from typing import Literal +from typing import Any, Literal -import dask.delayed -import dask.diagnostics -import dask.distributed import eumdac import eumdac.cli import eumdac.product import numpy as np import pandas as pd import pyproj +import pyresample import xarray as xr +import yaml from ocf_blosc2 import Blosc2 from satpy import Scene from tqdm import tqdm -import zarr if sys.stdout.isatty(): # Simple logging for terminals @@ -66,7 +64,7 @@ "urllib3", ]: logging.getLogger(logger).setLevel(logging.ERROR) -np.seterr(divide="ignore") +np.seterr(divide="ignore") log = logging.getLogger("sat-etl") @@ -237,7 +235,6 @@ def download_nat( return None def process_nat( - sat_config: Config, path: pathlib.Path, dstype: Literal["hrv", "nonhrv"], ) -> xr.DataArray | None: @@ -286,10 +283,10 @@ def write_to_zarr( If a Zarr store already exists at the given path, the DataArray will be appended to it. - Any attributes on the dataarray object are serialized to json-compatible strings. + Any attributes on the dataarray object are serialized to json-compatible strings. 
""" mode = "a" if zarr_path.exists() else "w" - extra_kwargs = { + extra_kwargs: dict[str, Any] = { "append_dim": "time", } if mode == "a" else { "encoding": { @@ -297,8 +294,8 @@ def write_to_zarr( "time": {"units": "nanoseconds since 1970-01-01"}, }, } - # Convert attributes to be json serializable - for key, value in da.attrs.items(): + # Convert attributes to be json serializable + for key, value in da.attrs.items(): if isinstance(value, dict): # Convert np.float32 to Python floats (otherwise yaml.dump complains) for inner_key in value: @@ -315,7 +312,7 @@ def write_to_zarr( da.attrs[key] = value.isoformat() try: - write_job = da.chunk({ + _ = da.chunk({ "time": 1, "x_geostationary": -1, "y_geostationary": -1, @@ -410,7 +407,7 @@ def _fname_to_scantime(fname: str) -> dt.datetime: `MSGX-SEVI-MSG15-0100-NA-20230910221240.874000000Z-NA.nat` So determine the time from the first element split by '.'. """ - return dt.datetime.strptime(fname.split(".")[0][-14:], "%Y%m%d%H%M%S") + return dt.datetime.strptime(fname.split(".")[0][-14:], "%Y%m%d%H%M%S").replace(tzinfo=dt.UTC) #def process_scans( # sat_config: Config, @@ -450,10 +447,14 @@ def _fname_to_scantime(fname: str) -> dt.datetime: # f: pathlib.Path # for i, f in enumerate(wanted_files): # try: -# # TODO: This method of passing the zarr times to the open function leaves a lot to be desired -# # Firstly, if the times are not passed in sorted order then the created 12-dataset chunks -# # may have missed times in them. Secondly, determining the time still requires opening and -# # converting the file which is probably slow. Better to skip search for files whose times +# # TODO: This method of passing the zarr times to the open function +# # leaves a lot to be desired +# # Firstly, if the times are not passed in sorted order then the created +# # 12-dataset chunks +# # may have missed times in them. +# # Secondly, determining the time still requires opening and +# # converting the file which is probably slow. 
Better to skip search for +# # files whose times # # are already in the Zarr store in the first place and bypass the entire pipeline. # dataset: xr.Dataset | None = _open_and_scale_data(zarr_times, f.as_posix(), dstype) # except Exception as e: @@ -504,9 +505,9 @@ def _gen_token() -> eumdac.AccessToken: return token -def _get_attrs_from_scene(scene: Scene) -> dict[str, str]: +def _serialize_attrs(attrs: dict[str, Any]) -> dict[str, str]: """Get the attributes from a Scene object.""" - for key, value in attrs.items(): + for key, value in attrs.items(): # Convert Dicts if isinstance(value, dict): # Convert np.float32 to Python floats (otherwise yaml.dump complains) @@ -524,6 +525,7 @@ def _get_attrs_from_scene(scene: Scene) -> dict[str, str]: # Convert datetimes if isinstance(value, dt.datetime): attrs[key] = value.isoformat() + return attrs def _convert_scene_to_dataarray( @@ -835,9 +837,9 @@ def check_data_quality(ds: xr.Dataset) -> None: Looks for the number of NaNs in the data over important regions. 
""" - def _calc_null_percentage(data: np.ndarray): + def _calc_null_percentage(data: np.ndarray) -> float: nulls = np.isnan(data) - return nulls.sum() / len(nulls) + return float(nulls.sum() / len(nulls)) result = xr.apply_ufunc( _calc_null_percentage, @@ -866,9 +868,9 @@ def run(args: argparse.Namespace) -> None: # Get values from args folder: pathlib.Path = args.path sat_config = CONFIGS[args.sat] - start: dt.datetime = dt.datetime.strptime(args.month, "%Y-%m") + start: dt.datetime = dt.datetime.strptime(args.month, "%Y-%m").replace(tzinfo=dt.UTC) end: dt.datetime = (start + pd.DateOffset(months=1, minutes=-1)).to_pydatetime() - dstype: str = "hrv" if args.hrv else "nonhrv" + dstype: Literal["hrv", "nonhrv"] = "hrv" if args.hrv else "nonhrv" product_iter, total = get_products_iterator( sat_config=sat_config, @@ -883,8 +885,9 @@ def run(args: argparse.Namespace) -> None: if zarr_path.exists(): log.info(f"Using existing zarr store at '{zarr_path}'") ds = xr.open_zarr(zarr_path, consolidated=True) - + # Iterate through all products in search + nat_filepaths: list[pathlib.Path] = [] for product in tqdm(product_iter, total=total, miniters=50): # Skip products already present in store @@ -892,7 +895,7 @@ def run(args: argparse.Namespace) -> None: product_time: dt.datetime = product.sensing_start.replace(second=0, microsecond=0) if np.datetime64(product_time, "ns") in ds.coords["time"].values: log.debug( - f"Skipping entry '{product!s}' as '{product_time}' already in store" + f"Skipping entry '{product!s}' as '{product_time}' already in store", ) continue @@ -903,8 +906,9 @@ def run(args: argparse.Namespace) -> None: ) if nat_filepath is None: raise OSError(f"Failed to download product '{product}'") - da = process_nat(sat_config, nat_filepath, dstype) + da = process_nat(nat_filepath, dstype) write_to_zarr(da=da, zarr_path=zarr_path) + nat_filepaths.append(nat_filepath) runtime = dt.datetime.now(tz=dt.UTC) - prog_start log.info(f"Completed archive for args: {args} in 
{runtime!s}.") @@ -946,7 +950,9 @@ def run(args: argparse.Namespace) -> None: #new_average_secs_per_scan: int = int( # (secs_per_scan + (runtime.total_seconds() / len(scan_times))) / 2, #) - #log.info(f"Completed archive for args: {args}. ({new_average_secs_per_scan} seconds per scan).") + #log.info( + # f"Completed archive for args: {args}. ({new_average_secs_per_scan} seconds per scan)." + #) if args.validate: ds = xr.open_zarr(zarr_path, consolidated=True) @@ -954,8 +960,8 @@ def run(args: argparse.Namespace) -> None: # Delete raw files, if desired if args.delete_raw: - log.info(f"Deleting {len(raw_paths)} raw files in {folder.as_posix()}.") - for f in raw_paths: + log.info(f"Deleting {len(nat_filepaths)} raw files in {folder.as_posix()}.") + for f in nat_filepaths: f.unlink() diff --git a/containers/sat/test_download_process_sat.py b/containers/sat/test_download_process_sat.py index fde8578..fe7436d 100644 --- a/containers/sat/test_download_process_sat.py +++ b/containers/sat/test_download_process_sat.py @@ -28,11 +28,16 @@ def setUpClass(cls) -> None: attrs: dict = { "end_time": TIMESTAMP + pd.Timedelta("15m"), "modifiers": (), - "orbital_parameters": {"projection_longitude": 45.5, "projection_latitude": 0.0, - "projection_altitude": 35785831.0, "satellite_nominal_longitude": 45.5, - "satellite_nominal_latitude": 0.0, "satellite_actual_longitude": 45.703605543834364, - "satellite_actual_latitude": 7.281469039541501, - "satellite_actual_altitude": 35788121.627292305}, + "orbital_parameters": { + "projection_longitude": 45.5, + "projection_latitude": 0.0, + "projection_altitude": 35785831.0, + "satellite_nominal_longitude": 45.5, + "satellite_nominal_latitude": 0.0, + "satellite_actual_longitude": 45.703605543834364, + "satellite_actual_latitude": 7.281469039541501, + "satellite_actual_altitude": 35788121.627292305, + }, "reader": "seviri_l1b_native", "sensor": "seviri", "resolution": 3000.403165817, @@ -68,12 +73,14 @@ def setUpClass(cls) -> None: def 
test_get_products_iterator(self) -> None: """Test that the iterator returns the correct number of products.""" token = dps._gen_token() - for config in dps.CONFIGS: - with self.subTest as t: - products_iter, total = dps._get_products_iterator( + for config in dps.CONFIGS.values(): + with self.subTest as t: # type: ignore + products_iter, total = dps.get_products_iterator( sat_config=config, start=pd.Timestamp("2024-01-01").to_pydatetime(), - end=(pd.Timestamp("2024-01-01") + pd.Timedelta(sat_config["cadence"])).to_pydatetime(), + end=( + pd.Timestamp("2024-01-01") + pd.Timedelta(config.cadence) + ).to_pydatetime(), token=token, ) t.assertEqual(total, 1) @@ -97,43 +104,23 @@ def test_convert_scene_to_dataarray(self) -> None: self.assertIn("end_time", da.attrs) def test_rescale(self) -> None: - da: xr.DataArray = dps._rescale(self.test_dataarrays["nonhrv"], channels=dps.CHANNELS["nonhrv"]) + da: xr.DataArray = dps._rescale( + da=self.test_dataarrays["nonhrv"], + channels=dps.CHANNELS["nonhrv"], + ) self.assertGreater(da.values.max(), 0) self.assertLess(da.values.min(), 1) self.assertEqual(da.attrs, self.test_dataarrays["nonhrv"].attrs) - def test_open_and_scale_data(self) -> None: - ds: xr.Dataset | None = dps._open_and_scale_data([], self.paths[0].as_posix(), "nonhrv") - - if ds is None: - self.fail("Dataset is None") - - ds.to_zarr("/tmp/test_sat_data/test.zarr", mode="w", consolidated=True) - ds2 = xr.open_zarr("/tmp/test_sat_data/test.zarr") - self.assertDictEqual(dict(ds.sizes), dict(ds2.sizes)) - self.assertNotEqual(dict(ds.attrs), {}) - def test_process_nat(self) -> None: - out: str = dps.process_nat( - dps.CONFIGS["iodc"], - pathlib.Path("/tmp/test_sat_data"), - pd.Timestamp("2024-01-01"), - pd.Timestamp("2024-01-02"), "nonhrv", - ) - - self.assertTrue(False) - - def test_process_scans(self) -> None: - - out: str = dps.process_scans( - dps.CONFIGS["iodc"], - pathlib.Path("/tmp/test_sat_data"), - pd.Timestamp("2024-01-01"), - pd.Timestamp("2024-01-02"), 
"nonhrv", + _ = dps.process_nat( + path=pathlib.Path("/tmp/test_sat_data"), # noqa: S108 + dstype="nonhrv", ) self.assertTrue(False) if __name__ == "__main__": unittest.main() + diff --git a/infrastructure/docker-compose.yaml b/infrastructure/docker-compose.yaml new file mode 100644 index 0000000..308b8e5 --- /dev/null +++ b/infrastructure/docker-compose.yaml @@ -0,0 +1,159 @@ +name: dagster + +x-postgres-variables: &postgres-variables + POSTGRES_USER: ${POSTGRES_USER:-dagster_user} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-dagster_password} + POSTGRES_DB: ${POSTGRES_DB:-dagster_db} + POSTGRES_HOST: "dagster-postgres" + +x-dagster-configs: &dagster-configs + - source: dagster.yaml + target: /opt/dagster/home/dagster.yaml + - source: workspace.yaml + target: /opt/dagster/home/workspace.yaml + + +services: + # This service runs the postgres DB used by dagster for run storage, schedule storage, + # and event log storage. Depending on the hardware you run this Compose on, you may be able + # to reduce the interval and timeout in the healthcheck to speed up your `docker-compose up` times. + dagster-postgres: + image: postgres:16 + container_name: dagster-postgres + environment: + <<: *postgres-variables + PGDATA: "/var/lib/postgresql/data" + volumes: + - dagster-pgdata-vol:/var/lib/postgresql/data + networks: ["dagster-network"] + healthcheck: + test: pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB + interval: 10s + timeout: 8s + retries: 5 + + # This service runs the gRPC server that loads user code, used by both dagster-webserver + # and dagster-daemon. By setting DAGSTER_CURRENT_IMAGE to its own image, we tell the + # run launcher to use this same image when launching runs in a new container as well. 
+ dagster-codeserver: + container_name: dagster-codeserver + image: ghcr.io/openclimatefix/dagster-dags:devsjc-code-container + restart: always + environment: + <<: *postgres-variables + DAGSTER_CURRENT_IMAGE: "ghcr.io/openclimatefix/dagster-dags" + DAGSTER_HOME: "/opt/dagster/home" + configs: *dagster-configs + networks: ["dagster-network"] + + # This service runs dagster-webserver, which loads your user code from the user code container. + # Since our instance uses the QueuedRunCoordinator, any runs submitted from the webserver will be put on + # a queue and later dequeued and launched by dagster-daemon. + dagster-webserver: + container_name: dagster-webserver + image: dagster/dagster-k8s:latest + command: ["dagster-webserver", "-h", "0.0.0.0", "-p", "3008", "-w", "/opt/dagster/home/workspace.yaml"] + ports: + - "3008:3008" + environment: + <<: *postgres-variables + DAGSTER_HOME: "/opt/dagster/home" + configs: *dagster-configs + volumes: + # Enable termination of runs from the webserver + - /var/run/docker.sock:/var/run/docker.sock + - /tmp/io_manager_storage:/tmp/io_manager_storage + networks: ["dagster-network"] + depends_on: + dagster-postgres: + condition: service_healthy + dagster-codeserver: + condition: service_started + + # This service runs the dagster-daemon process, which is responsible for taking runs + # off of the queue and launching them, as well as creating runs from schedules or sensors. 
+ dagster-daemon: + container_name: dagster-daemon + image: dagster/dagster-k8s:latest + command: ["dagster-daemon", "run", "-w", "/opt/dagster/home/workspace.yaml"] + restart: on-failure + environment: + <<: *postgres-variables + DAGSTER_HOME: "/opt/dagster/home" + configs: *dagster-configs + volumes: + # Enable kicking off of runs from the daemon + - /var/run/docker.sock:/var/run/docker.sock + - /tmp/io_manager_storage:/tmp/io_manager_storage + networks: ["dagster-network"] + depends_on: + dagster-postgres: + condition: service_healthy + dagster-codeserver: + condition: service_started + +networks: + dagster-network: + driver: bridge + name: dagster-network + +volumes: + # Volume for the postgres data directory + dagster-pgdata-vol: + name: dagster-pgdata-vol + +configs: + workspace.yaml: + content: | + load_from: + - grpc_server: + host: "dagster-codeserver" + port: 4266 + location_name: "dagster_dags" + + dagster.yaml: + content: | + storage: + postgres: + postgres_db: + username: {"env": "POSTGRES_USER"} + password: {"env": "POSTGRES_PASSWORD"} + hostname: {"env": "POSTGRES_HOST"} + db_name: {"env": "POSTGRES_DB"} + port: 5432 + + local_artifact_storage: + module: dagster.core.storage.root + class: LocalArtifactStorage + config: + base_dir: "/opt/dagster/local/" + + run_coordinator: + module: dagster.core.run_coordinator + class: QueuedRunCoordinator + config: + max_concurrent_runs: 30 + tag_concurrency_limits: + - key: "dagster/backfill" + limit: 15 + - key: "nwp-consumer" + limit: 1 + + run_launcher: + module: dagster_docker + class: DockerRunLauncher + config: + env_vars: + - POSTGRES_USER + - POSTGRES_PASSWORD + - POSTGRES_DB + + retention: + schedule: + purge_after_days: 90 + sensor: + purge_after_days: + skipped: 7 + failure: 30 + success: -1 + diff --git a/local_archives/__init__.py b/local_archives/__init__.py deleted file mode 100644 index 45265eb..0000000 --- a/local_archives/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -import os - 
-import dagster as dg -from dagster_docker import PipesDockerClient - -import managers -import resources -from constants import LOCATIONS_BY_ENVIRONMENT - -from . import nwp, sat - -resources_by_env = { - "leo": { - "nwp_xr_zarr_io": managers.LocalFilesystemXarrayZarrManager( - base_path=LOCATIONS_BY_ENVIRONMENT["leo"].NWP_ZARR_FOLDER, - ), - "meteomatics_api": resources.MeteomaticsAPIResource( - username=dg.EnvVar("METEOMATICS_USERNAME"), - password=dg.EnvVar("METEOMATICS_PASSWORD"), - ), - "pipes_subprocess_client": dg.PipesSubprocessClient(), - "pipes_docker_client": PipesDockerClient(), - }, - "local": { - "nwp_xr_zarr_io": managers.LocalFilesystemXarrayZarrManager( - base_path=LOCATIONS_BY_ENVIRONMENT["local"].NWP_ZARR_FOLDER, - ), - "meteomatics_api": resources.MeteomaticsAPIResource( - username=dg.EnvVar("METEOMATICS_USERNAME"), - password=dg.EnvVar("METEOMATICS_PASSWORD"), - ), - "pipes_subprocess_client": dg.PipesSubprocessClient(), - "pipes_docker_client": PipesDockerClient(), - }, -} - -all_assets: list[dg.AssetsDefinition] = [ - *nwp.all_assets, - *sat.all_assets, -] - -all_jobs: list[dg.JobDefinition] = [ - *nwp.all_jobs, - *sat.all_jobs, -] - -all_schedules: list[dg.ScheduleDefinition] = [ - *nwp.all_schedules, - *sat.all_schedules, -] - -defs = dg.Definitions( - assets=all_assets, - resources=resources_by_env[os.getenv("ENVIRONMENT", "local")], - jobs=all_jobs, - schedules=all_schedules, -) diff --git a/local_archives/nwp/__init__.py b/local_archives/nwp/__init__.py deleted file mode 100644 index d86d55d..0000000 --- a/local_archives/nwp/__init__.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Definitions for the NWP dagster code location.""" - -import dagster as dg - -from . 
import cams, ceda, ecmwf, jobs, meteomatics, gfs - -all_assets: list[dg.AssetsDefinition] = [ - *ceda.all_assets, - *ecmwf.all_assets, - *cams.all_assets, - *meteomatics.all_assets, - *gfs.all_assets, -] - -all_jobs: list[dg.JobDefinition] = [ - jobs.scan_nwp_raw_archive, - jobs.scan_nwp_zarr_archive, -] - - -@dg.schedule( - job=jobs.scan_nwp_raw_archive, - cron_schedule="0 3 * * *", - default_status=dg.DefaultScheduleStatus.RUNNING, -) -def scan_nwp_raw_archives_schedule(context: dg.ScheduleEvaluationContext) -> dg.RunRequest: - """Scan the raw archives. - - Yields a RunRequest for the scan_nwp_raw_archive job for each raw archive. - """ - raw_assets: list[dg.AssetsDefinition] = [a for a in all_assets if "raw_archive" in a.key.path] - for a in raw_assets: - yield dg.RunRequest( - run_key=f"scan_nwp_{a.key.path[1]}_{a.key.path[2]}_{a.key.path[3]}", - run_config=jobs.gen_run_config(a.key), - ) - - -@dg.schedule( - job=jobs.scan_nwp_zarr_archive, - cron_schedule="15 3 * * *", - default_status=dg.DefaultScheduleStatus.RUNNING, -) -def scan_nwp_zarr_archives_schedule(context: dg.ScheduleEvaluationContext) -> dg.RunRequest: - """Scan the zarr archives. - - Yields a RunRequest for the scan_nwp_zarr_archive job for each zarr archive. 
- """ - zarr_assets: list[dg.AssetsDefinition] = [a for a in all_assets if "zarr_archive" in a.key.path] - for a in zarr_assets: - yield dg.RunRequest( - run_key=f"scan_nwp_{a.key.path[1]}_{a.key.path[2]}_{a.key.path[3]}", - run_config=jobs.gen_run_config(a.key), - ) - - -all_schedules: list[dg.ScheduleDefinition] = [ - scan_nwp_raw_archives_schedule, - scan_nwp_zarr_archives_schedule, -] diff --git a/local_archives/nwp/_generic_definitions_factory.py b/local_archives/nwp/_generic_definitions_factory.py deleted file mode 100644 index b3a2a5e..0000000 --- a/local_archives/nwp/_generic_definitions_factory.py +++ /dev/null @@ -1,210 +0,0 @@ -"""Defines a factory for creating nwp-consumer-backed assets and jobs.""" - -import dataclasses as dc -import datetime as dt -import os -import pathlib -import shutil -from typing import Literal - -import dagster as dg -import numpy as np -import xarray as xr -from nwp_consumer.internal import IT_FOLDER_STRUCTURE_RAW, FetcherInterface, FileInfoModel - -from constants import LOCATIONS_BY_ENVIRONMENT - -env = os.getenv("ENVIRONMENT", "local") -RAW_FOLDER = LOCATIONS_BY_ENVIRONMENT[env].RAW_FOLDER -ZARR_FOLDER = LOCATIONS_BY_ENVIRONMENT[env].NWP_ZARR_FOLDER - - -@dc.dataclass -class MakeDefinitionsOptions: - """Typesafe options for the make_asset_definitions function.""" - - area: str - fetcher: FetcherInterface - source: Literal["ecmwf", "ceda", "cams"] - partitions: dg.TimeWindowPartitionsDefinition - - def key_prefix(self) -> list[str]: - """Generate an asset key prefix based on the area. - - The prefix is important as it defines the folder structure under which - assets are stored. 
- """ - return ["nwp", self.source, self.area] - - -@dc.dataclass -class MakeDefinitionsOutputs: - """Typesafe outputs for the make_definitions function.""" - - raw_asset: dg.AssetsDefinition - zarr_asset: dg.AssetsDefinition - - -def make_definitions( - opts: MakeDefinitionsOptions, -) -> MakeDefinitionsOutputs: - """Generates assets and associated jobs for NWP-consumer data.""" - - # The Raw Archive asset has the following properties: - # * Key Prefix: nwp/{source}/{area} - defines part of the storage folder structure - # * Auto Materialize Policy: Eagerly materialize the asset when the raw archive is updated - # ** This is checked on a cron schedule every tuesday and saturday at midnight, and up - # ** to 10 materializations are allowed per check. - # * Partitions: Defines the partitioning scheme for the asset - # * Check Specs: Defines the checks that should be performed on the asset - @dg.asset( - name="raw_archive", - key_prefix=opts.key_prefix(), - automation_condition=dg.AutomationCondition.eager(), - partitions_def=opts.partitions, - check_specs=[ - dg.AssetCheckSpec( - name="num_local_is_num_remote", - asset=[*opts.key_prefix(), "raw_archive"], - ), - dg.AssetCheckSpec(name="nonzero_local_size", asset=[*opts.key_prefix(), "raw_archive"]), - ], - metadata={ - "archive_folder": dg.MetadataValue.text(f"{RAW_FOLDER}/{'/'.join(opts.key_prefix())}"), - "area": dg.MetadataValue.text(opts.area), - "source": dg.MetadataValue.text(opts.source), - }, - compute_kind="download", - op_tags={"dagster/max_runtime": int(60 * 100)}, - ) - def _raw_archive( - context: dg.AssetExecutionContext, - ) -> dg.Output[list[pathlib.Path]]: - """Locally stored archive of raw data.""" - execution_start = dt.datetime.now(tz=dt.UTC) - - # List all available source files for this partition - # TODO: Enable single run backfills - it = context.partition_time_window.start - context.log.info( - f"Listing files for init time {it.strftime('%Y-%m-%d %H:%M')} from {opts.source}.", - ) - 
fileinfos: list[FileInfoModel] = opts.fetcher.listRawFilesForInitTime(it=it) - - if len(fileinfos) == 0: - raise ValueError("No files found for this partition. See error logs.") - - context.log.info(f"Found {len(fileinfos)} files for this partition.") - - # For each file in the remote archive, download and store it - stored_paths: list[pathlib.Path] = [] - sizes: list[int] = [] - - # Store the file based on the asset key prefix and the init time of the file - loc = "/".join(context.asset_key.path[:-1]) - for fi in fileinfos: - dst = pathlib.Path( - f"{RAW_FOLDER}/{loc}/{fi.it().strftime(IT_FOLDER_STRUCTURE_RAW)}/{fi.filename()}", - ) - - # If the file already exists, don't re download it - if dst.exists() and dst.stat().st_size > 0: - context.log.info( - f"File {fi.filename()} already exists at {dst.as_posix()}. Skipping download.", - ) - stored_paths.append(dst) - sizes.append(dst.stat().st_size) - continue - - # Otherwise, download it and store it - if dst.exists() and dst.stat().st_size == 0: - dst.unlink() - context.log.info( - f"Downloading file {fi.filename()} to {dst.as_posix()}", - ) - # Download to temp fails soft, so we need to check the src - # to see if it is an empty path. - src = opts.fetcher.downloadToCache(fi=fi) - if src is None or src == pathlib.Path(): - raise ValueError( - f"Error downloading file {fi.filename()}. 
See stdout logs for details.", - ) - context.log.info(f"Moving file {src.as_posix()} to {dst.as_posix()}") - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.move(src=src, dst=dst) - - stored_paths.append(dst) - sizes.append(dst.stat().st_size) - - elapsed_time = dt.datetime.now(tz=dt.UTC) - execution_start - - yield dg.Output( - stored_paths, - metadata={ - "inittime": dg.MetadataValue.text(context.asset_partition_key_for_output()), - "partition_num_files": dg.MetadataValue.int(len(stored_paths)), - "file_paths": dg.MetadataValue.text(str([f.as_posix() for f in stored_paths])), - "partition_size": dg.MetadataValue.int(sum(sizes)), - "area": dg.MetadataValue.text(opts.area), - "elapsed_time_mins": dg.MetadataValue.float(elapsed_time / dt.timedelta(minutes=1)), - }, - ) - - # Perform the checks defined in the check_specs above - yield dg.AssetCheckResult( - check_name="num_local_is_num_remote", - passed=bool(len(stored_paths) == len(fileinfos)), - ) - yield dg.AssetCheckResult( - check_name="nonzero_local_size", - passed=bool(np.all(sizes)), - ) - - # The Zarr Archive asset has the following properties: - # * Key Prefix: nwp/{source}/{area} - defines part of the storage folder structure - # * Auto Materialize Policy: Eagerly materialize the asset when the raw archive is updated - # * Partitions: Defines the partitioning scheme for the asset - @dg.asset( - name="zarr_archive", - key_prefix=opts.key_prefix(), - partitions_def=opts.partitions, - automation_condition=dg.AutomationCondition.eager(), - ins={"raw_paths": dg.AssetIn(key=_raw_archive.key)}, - io_manager_key="nwp_xr_zarr_io", - compute_kind="process", - metadata={ - "archive_folder": dg.MetadataValue.text(f"{ZARR_FOLDER}/{'/'.join(opts.key_prefix())}"), - "area": dg.MetadataValue.text(opts.area), - "source": dg.MetadataValue.text(opts.source), - }, - op_tags={"dagster/max_runtime": 60 * 10}, - ) - def _zarr_archive( - context: dg.AssetExecutionContext, - raw_paths: list[pathlib.Path], - ) -> 
dg.Output[xr.Dataset]: - """Locally stored archive of zarr-formatted xarray data.""" - execution_start = dt.datetime.now(tz=dt.UTC) - # Convert each file to an xarray dataset and merge - datasets: list[xr.Dataset] = [] - for path in raw_paths: - context.log.info(f"Converting raw file at {path.as_posix()} to xarray dataset.") - datasets.append(opts.fetcher.mapCachedRaw(p=path)) - context.log.info(f"Merging {len(datasets)} datasets into one.") - ds = xr.merge(datasets, combine_attrs="drop_conflicts") - - elapsed_time = dt.datetime.now(tz=dt.UTC) - execution_start - - return dg.Output( - ds, - metadata={ - "inittime": dg.MetadataValue.text(context.asset_partition_key_for_output()), - "dataset": dg.MetadataValue.md(str(ds)), - "elapsed_time_mins": dg.MetadataValue.float(elapsed_time / dt.timedelta(minutes=1)), - }, - ) - - return MakeDefinitionsOutputs( - raw_asset=_raw_archive, - zarr_asset=_zarr_archive, - ) diff --git a/local_archives/nwp/cams/__init__.py b/local_archives/nwp/cams/__init__.py deleted file mode 100644 index 8f58e31..0000000 --- a/local_archives/nwp/cams/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import dagster as dg - -from . 
import cams_eu, cams_global - -eu_assets = dg.load_assets_from_modules( - modules=[cams_eu], - group_name="cams_eu", -) - -global_assets = dg.load_assets_from_modules( - modules=[cams_global], - group_name="cams_global", -) - -all_assets: list[dg.AssetsDefinition] = [*eu_assets, *global_assets] diff --git a/local_archives/nwp/cams/_definitions_factory.py b/local_archives/nwp/cams/_definitions_factory.py deleted file mode 100644 index 0cd60ad..0000000 --- a/local_archives/nwp/cams/_definitions_factory.py +++ /dev/null @@ -1,257 +0,0 @@ -import dataclasses as dc -import datetime as dt -import os -import pathlib -from typing import Any, Literal - -import cdsapi -import dagster as dg - -from constants import LOCATIONS_BY_ENVIRONMENT - -env = os.getenv("ENVIRONMENT", "local") -RAW_FOLDER = LOCATIONS_BY_ENVIRONMENT[env].RAW_FOLDER -IT_FOLDER_FMTSTR = "%Y/%m/%d/%H%M" - -@dc.dataclass -class VariableSelection: - """Defines the variables to request from CAMS.""" - - # The slow variables are those that are only available from tape - # The dictionary maps the variable group name to a list of variables - # to pull within that groups' request. This can be one to [one]. - slow: dict[str, list[str]] = dc.field(default_factory=dict) - # The fast variables are those that are available from disk - # The dictionary maps the variable group name to a list of variables - # to pull within that groups' request. This can be one to [one]. 
- fast: dict[str, list[str]] = dc.field(default_factory=dict) - # The hours to pull - hours: list[str] = dc.field(default_factory=list) - -@dc.dataclass -class MakeDefinitionsOptions: - """Typesafe options for the make_asset_definitions function.""" - - area: str - file_format: Literal["grib", "netcdf"] - partitions: dg.TimeWindowPartitionsDefinition - client: cdsapi.Client - multilevel_vars: VariableSelection | None = None - multilevel_levels: list[str] | None = None - singlelevel_vars: VariableSelection | None = None - - def key_prefix(self) -> list[str]: - """Generate an asset key prefix based on the area. - - The prefix is important as it defines the folder structure under which - assets are stored. - """ - return ["nwp", "cams", self.area] - - def dataset_name(self) -> str: - """Generate a dataset name based on the area.""" - match self.area: - case "eu": - return "cams-europe-air-quality-forecasts" - case "global": - return "cams-global-atmospheric-composition-forecasts" - case _: - raise ValueError(f"Area {self.area} not supported") - - -@dc.dataclass -class CamsFileInfo: - """Information about a remote file from the CAMS CDS API. - - Mirrors the structure of the cdsapi.api.Result.toJSON() method: - https://github.com/ecmwf/cdsapi/blob/master/cdsapi/api.py - Also adds in a field to hold the variable name and inittime. 
- """ - - resultType: str - contentType: str - contentLength: int - location: str - var: str - inittime: dt.datetime - - -@dc.dataclass -class MakeDefinitionsOutputs: - """Outputs from the make_asset_definitions function.""" - - raw_asset: dg.AssetsDefinition - - -def make_definitions( - opts: MakeDefinitionsOptions, -) -> MakeDefinitionsOutputs: - """Generate the assets for a CAMS datset.""" - - @dg.asset( - name="raw_archive", - key_prefix=opts.key_prefix(), - partitions_def=opts.partitions, - compute_kind="download", - op_tags={ - "expected_runtime": "5hrs", - "MAX_RUNTIME_SECONDS_TAG": 20 * 60 * 60, - }, - ) - def _cams_raw_archive(context: dg.AssetExecutionContext) -> dg.Output[list[pathlib.Path]]: - """Asset detailing all wanted remote files from CAMS.""" - execution_start = dt.datetime.now(tz=dt.UTC) - - stored_files: list[pathlib.Path] = [] - sizes: list[int] = [] - - # Check if partition is targeting a time more than 30 days old - # * CAMS data older than 30 days is only available from tape - # * These variables are slower to collect - it = context.partition_time_window.start - use_slow: bool = False - if (dt.datetime.now(tz=dt.UTC) - it) > dt.timedelta(days=30): - context.log.info( - f"Partition {context.partition_key} is targeting a time more than 30 days old. 
" - + "Pulling variables from tape, this may take a while.", - ) - use_slow = True - - # First handle single level variables - if opts.singlelevel_vars is not None: - for name, varlist in ( - opts.singlelevel_vars.slow if use_slow else opts.singlelevel_vars.fast - ).items(): - - # Create the target file path for the current set of vars - loc = "/".join(context.asset_key.path[:-1]) - ext = ".grib" if opts.file_format == "grib" else ".nc" - dst = pathlib.Path( - f"{RAW_FOLDER}/{loc}/{it.strftime(IT_FOLDER_FMTSTR)}/" - + f"{it.strftime('%Y%m%d%H')}_{name}{ext}", - ) - # If the file already exists, don't redownload it - if dst.exists(): - stored_files.append(dst) - sizes.append(dst.stat().st_size) - context.log.info(f"File {dst.as_posix()} already exists, skipping", extra={ - "file": dst.as_posix(), - "size": dst.stat().st_size, - }) - continue - - dst.parent.mkdir(parents=True, exist_ok=True) - dst.touch() - - # Othrwise, build the request - sl_var_request: dict[str, Any] = { - "date": it.strftime("%Y-%m-%d/%Y-%m-%d"), - "type": "forecast", - "format": opts.file_format, - "variable": varlist, - "leadtime_hour": opts.singlelevel_vars.hours, - "time": it.strftime("%H:%M"), - } - if opts.area == "eu": - sl_var_request["model"] = "ensemble" - - # Request the file and download it to the target - context.log.info(f"Reqesting file {dst.as_posix()} from CDS API", extra={ - "request": sl_var_request, - "target": dst.as_posix(), - }) - result = opts.client.retrieve( - name=opts.dataset_name(), - request=sl_var_request, - target=dst.as_posix(), - ) - stored_files.append(dst) - sizes.append(dst.stat().st_size) - context.log.info(f"File {dst.as_posix()} downloaded from CDS API", extra={ - "file": dst.as_posix(), - "size": dst.stat().st_size, - }) - - # TODO: Split up multi-variables stored files into a single file per variable - # using grib_filter - - # Then handle multilevel variables - if opts.multilevel_vars is not None: - for name, varlist in ( - opts.multilevel_vars.slow if 
use_slow else opts.multilevel_vars.fast - ).items(): - - # Create the target file path for the current set of vars - loc = "/".join(context.asset_key.path[:-1]) - ext = ".grib" if opts.file_format == "grib" else ".nc" - dst = pathlib.Path( - f"{RAW_FOLDER}/{loc}/{it.strftime(IT_FOLDER_FMTSTR)}/" - + f"{it.strftime('%Y%m%d%H')}_{name}{ext}", - ) - - # If the file already exists, don't redownload it - if dst.exists(): - stored_files.append(dst) - sizes.append(dst.stat().st_size) - context.log.info(f"File {dst.as_posix()} already exists, skipping", extra={ - "file": dst.as_posix(), - "size": dst.stat().st_size, - }) - continue - - dst.parent.mkdir(parents=True, exist_ok=True) - dst.touch() - - # Othrwise, build the request - ml_var_request: dict[str, Any] = { - "date": it.strftime("%Y-%m-%d/%Y-%m-%d"), - "type": "forecast", - "format": opts.file_format, - "variable": varlist, - "leadtime_hour": opts.multilevel_vars.hours, - "time": it.strftime("%H:%M"), - } - if opts.area == "eu": - ml_var_request["level"] = opts.multilevel_levels - ml_var_request["model"] = "ensemble" - else: - ml_var_request["pressure_level"] = opts.multilevel_levels - - # Request the file and download it to the target - context.log.info(f"Reqesting file {dst.as_posix()} from CDS API", extra={ - "request": ml_var_request, - "target": dst.as_posix(), - }) - result = opts.client.retrieve( - name=opts.dataset_name(), - request=ml_var_request, - target=dst.as_posix(), - ) - stored_files.append(dst) - sizes.append(dst.stat().st_size) - context.log.info(f"File {dst.as_posix()} downloaded from CDS API", extra={ - "file": dst.as_posix(), - "size": dst.stat().st_size, - }) - - - if len(stored_files) == 0: - raise Exception( - "No remote files found for this partition key. 
See logs for more details.", - ) - - elapsed_time: dt.timedelta = dt.datetime.now(tz=dt.UTC) - execution_start - - return dg.Output( - stored_files, - metadata={ - "inittime": dg.MetadataValue.text(context.asset_partition_key_for_output()), - "num_files": dg.MetadataValue.int(len(stored_files)), - "partition_size": dg.MetadataValue.int(sum(sizes)), - "elapsed_time_mins": dg.MetadataValue.float(elapsed_time / dt.timedelta(minutes=1)), - }, - ) - - return MakeDefinitionsOutputs( - raw_asset=_cams_raw_archive, - ) diff --git a/local_archives/nwp/cams/cams_eu.py b/local_archives/nwp/cams/cams_eu.py deleted file mode 100644 index dabb970..0000000 --- a/local_archives/nwp/cams/cams_eu.py +++ /dev/null @@ -1,57 +0,0 @@ -import datetime as dt - -import dagster as dg -from cdsapi import Client - -from ._definitions_factory import ( - MakeDefinitionsOptions, - MakeDefinitionsOutputs, - VariableSelection, - make_definitions, -) - -# CAMS data is only available from 3 years ago onwards -start_date: dt.datetime = dt.datetime.now(tz=dt.UTC) - dt.timedelta(days=3 * 365) -cams_eu_partitions: dg.TimeWindowPartitionsDefinition = dg.TimeWindowPartitionsDefinition( - start=start_date.strftime("%Y-%m-%dT%H:%M"), - cron_schedule="0 0 * * *", # Daily at midnight - fmt="%Y-%m-%dT%H:%M", -) - -VARIABLES = [ - "alder_pollen", - "ammonia", - "birch_pollen", - "carbon_monoxide", - "dust", - "grass_pollen", - "nitrogen_dioxide", - "nitrogen_monoxide", - "non_methane_vocs", - "olive_pollen", - "ozone", - "particulate_matter_10um", - "particulate_matter_2.5um", - "peroxyacyl_nitrates", - "pm10_wildfires", - "ragweed_pollen", - "secondary_inorganic_aerosol", - "sulphur_dioxide", -] - -opts: MakeDefinitionsOptions = MakeDefinitionsOptions( - area="eu", - file_format="netcdf", - multilevel_vars=VariableSelection( - slow={v: [v] for v in VARIABLES}, - fast={v: [v] for v in VARIABLES}, - hours=[str(x) for x in range(0, 97)], - ), - multilevel_levels=["0", "1000", "2000", "250", "3000", "50", "500", 
"5000"], - partitions=cams_eu_partitions, - client=Client(), -) - -defs: MakeDefinitionsOutputs = make_definitions(opts=opts) - -cams_eu_raw_archive = defs.raw_asset diff --git a/local_archives/nwp/cams/cams_global.py b/local_archives/nwp/cams/cams_global.py deleted file mode 100644 index 2c2bac7..0000000 --- a/local_archives/nwp/cams/cams_global.py +++ /dev/null @@ -1,300 +0,0 @@ -import dagster as dg -from cdsapi import Client - -from ._definitions_factory import ( - MakeDefinitionsOptions, - MakeDefinitionsOutputs, - VariableSelection, - make_definitions, -) - -cams_global_partitions = dg.TimeWindowPartitionsDefinition( - start="2015-01-01T00:00", - cron_schedule="0 0,12 * * *", - fmt="%Y-%m-%dT%H:%M", -) - -singlelevel_fast_vars: list[str] = [ - "ammonium_aerosol_optical_depth_550nm", - "black_carbon_aerosol_optical_depth_550nm", - "dust_aerosol_optical_depth_550nm", - "nitrate_aerosol_optical_depth_550nm", - "organic_matter_aerosol_optical_depth_550nm", - "particulate_matter_10um", - "particulate_matter_1um", - "particulate_matter_2.5um", - "sea_salt_aerosol_optical_depth_550nm", - "secondary_organic_aerosol_optical_depth_550nm", - "sulphate_aerosol_optical_depth_550nm", - "total_aerosol_optical_depth_1240nm", - "total_aerosol_optical_depth_469nm", - "total_aerosol_optical_depth_550nm", - "total_aerosol_optical_depth_670nm", - "total_aerosol_optical_depth_865nm", - "total_column_carbon_monoxide", - "total_column_chlorine_monoxide", - "total_column_chlorine_nitrate", - "total_column_ethane", - "total_column_formaldehyde", - "total_column_hydrogen_chloride", - "total_column_hydrogen_cyanide", - "total_column_hydrogen_peroxide", - "total_column_hydroxyl_radical", - "total_column_isoprene", - "total_column_methane", - "total_column_nitric_acid", - "total_column_nitrogen_dioxide", - "total_column_nitrogen_monoxide", - "total_column_ozone", - "total_column_peroxyacetyl_nitrate", - "total_column_propane", - "total_column_sulphur_dioxide", -] - -multilevel_fast_vars: 
list[str] = [ - "ammonium_aerosol_mass_mixing_ratio", - "anthropogenic_secondary_organic_aerosol_mass_mixing_ratio", - "biogenic_secondary_organic_aerosol_mass_mixing_ratio", - "carbon_monoxide", - "chlorine_monoxide", - "chlorine_nitrate", - "dust_aerosol_0.03-0.55um_mixing_ratio", - "dust_aerosol_0.55-0.9um_mixing_ratio", - "dust_aerosol_0.9-20um_mixing_ratio", - "ethane", - "formaldehyde", - "hydrogen_chloride", - "hydrogen_cyanide", - "hydrogen_peroxide", - "hydrophilic_black_carbon_aerosol_mixing_ratio", - "hydrophilic_organic_matter_aerosol_mixing_ratio", - "hydrophobic_black_carbon_aerosol_mixing_ratio", - "hydrophobic_organic_matter_aerosol_mixing_ratio", - "hydroxyl_radical", - "isoprene", - "methane", - "nitrate_coarse_mode_aerosol_mass_mixing_ratio", - "nitrate_fine_mode_aerosol_mass_mixing_ratio", - "nitric_acid", - "nitrogen_dioxide", - "nitrogen_monoxide", - "ozone", - "peroxyacetyl_nitrate", - "propane", - "sea_salt_aerosol_0.03-0.5um_mixing_ratio", - "sea_salt_aerosol_0.5-5um_mixing_ratio", - "sea_salt_aerosol_5-20um_mixing_ratio", - "specific_humidity", - "sulphate_aerosol_mixing_ratio", - "sulphur_dioxide", -] - -multilevel_hours: list[str] = [str(x) for x in range(0, 121, 3)] - -singlelevel_hours: list[str] = [str(x) for x in range(0, 121)] - -# It is faster to download all variables in a group than to download them individually -# as then you are queuing fewer requests to the CDS API, for tape variables. -# Each group here have been checked in the ADS app to ensure they do not exceed -# the limit of 10000 items per request, when paired with downloading every step -# and init time for a single day. 
-singlelevel_slow_var_groups: dict[str, list[str]] = { - "asymmetry_factor_340-2130nm": [ - "asymmetry_factor_340nm", - "asymmetry_factor_355nm", - "asymmetry_factor_380nm", - "asymmetry_factor_400nm", - "asymmetry_factor_440nm", - "asymmetry_factor_469nm", - "asymmetry_factor_500nm", - "asymmetry_factor_532nm", - "asymmetry_factor_550nm", - "asymmetry_factor_645nm", - "asymmetry_factor_670nm", - "asymmetry_factor_800nm", - "asymmetry_factor_858nm", - "asymmetry_factor_865nm", - "asymmetry_factor_1020nm", - "asymmetry_factor_1064nm", - "asymmetry_factor_1240nm", - "asymmetry_factor_1640nm", - "asymmetry_factor_2130nm", - ], - "single_scattering_albedo_340-2130nm": [ - "single_scattering_albedo_340nm", - "single_scattering_albedo_355nm", - "single_scattering_albedo_380nm", - "single_scattering_albedo_400nm", - "single_scattering_albedo_440nm", - "single_scattering_albedo_469nm", - "single_scattering_albedo_500nm", - "single_scattering_albedo_532nm", - "single_scattering_albedo_550nm", - "single_scattering_albedo_645nm", - "single_scattering_albedo_670nm", - "single_scattering_albedo_800nm", - "single_scattering_albedo_858nm", - "single_scattering_albedo_865nm", - "single_scattering_albedo_1020nm", - "single_scattering_albedo_1064nm", - "single_scattering_albedo_1240nm", - "single_scattering_albedo_1640nm", - "single_scattering_albedo_2130nm", - ], - "total_aerosol_optical_depth_340-2130nm": [ - "total_aerosol_optical_depth_340nm", - "total_aerosol_optical_depth_355nm", - "total_aerosol_optical_depth_380nm", - "total_aerosol_optical_depth_400nm", - "total_aerosol_optical_depth_440nm", - "total_aerosol_optical_depth_500nm", - "total_aerosol_optical_depth_532nm", - "total_aerosol_optical_depth_645nm", - "total_aerosol_optical_depth_800nm", - "total_aerosol_optical_depth_858nm", - "total_aerosol_optical_depth_1020nm", - "total_aerosol_optical_depth_1064nm", - "total_aerosol_optical_depth_1640nm", - "total_aerosol_optical_depth_2130nm", - ], - 
"total_absorption_aerosol_optical_depth_340-2130nm": [ - "total_absorption_aerosol_optical_depth_340nm", - "total_absorption_aerosol_optical_depth_355nm", - "total_absorption_aerosol_optical_depth_380nm", - "total_absorption_aerosol_optical_depth_400nm", - "total_absorption_aerosol_optical_depth_440nm", - "total_absorption_aerosol_optical_depth_469nm", - "total_absorption_aerosol_optical_depth_500nm", - "total_absorption_aerosol_optical_depth_532nm", - "total_absorption_aerosol_optical_depth_550nm", - "total_absorption_aerosol_optical_depth_645nm", - "total_absorption_aerosol_optical_depth_670nm", - "total_absorption_aerosol_optical_depth_800nm", - "total_absorption_aerosol_optical_depth_858nm", - "total_absorption_aerosol_optical_depth_865nm", - "total_absorption_aerosol_optical_depth_1020nm", - "total_absorption_aerosol_optical_depth_1064nm", - "total_absorption_aerosol_optical_depth_1240nm", - "total_absorption_aerosol_optical_depth_1640nm", - "total_absorption_aerosol_optical_depth_2130nm", - ], - "total_fine_mode_aerosol_optical_depth_340-2130nm": [ - "total_fine_mode_aerosol_optical_depth_340nm", - "total_fine_mode_aerosol_optical_depth_355nm", - "total_fine_mode_aerosol_optical_depth_380nm", - "total_fine_mode_aerosol_optical_depth_400nm", - "total_fine_mode_aerosol_optical_depth_440nm", - "total_fine_mode_aerosol_optical_depth_469nm", - "total_fine_mode_aerosol_optical_depth_500nm", - "total_fine_mode_aerosol_optical_depth_532nm", - "total_fine_mode_aerosol_optical_depth_550nm", - "total_fine_mode_aerosol_optical_depth_645nm", - "total_fine_mode_aerosol_optical_depth_670nm", - "total_fine_mode_aerosol_optical_depth_800nm", - "total_fine_mode_aerosol_optical_depth_858nm", - "total_fine_mode_aerosol_optical_depth_865nm", - "total_fine_mode_aerosol_optical_depth_1020nm", - "total_fine_mode_aerosol_optical_depth_1064nm", - "total_fine_mode_aerosol_optical_depth_1240nm", - "total_fine_mode_aerosol_optical_depth_1640nm", - 
"total_fine_mode_aerosol_optical_depth_2130nm", - ], - "dust_aerosol_optical_depth_550nm_0.04-20um": [ - "dust_aerosol_0.03-0.55um_optical_depth_550nm", - "dust_aerosol_0.55-9um_optical_depth_550nm", - "dust_aerosol_9-20um_optical_depth_550nm", - ], - "sea_salt_aerosol_optical_depth_550nm_0.03-20um": [ - "sea_salt_aerosol_0.03-0.5um_optical_depth_550nm", - "sea_salt_aerosol_0.5-5um_optical_depth_550nm", - "sea_salt_aerosol_5-20um_optical_depth_550nm", - ], - "nitrate_aerosol_optical_depth_550nm_coarse-fine": [ - "nitrate_coarse_mode_aerosol_optical_depth_550nm", - "nitrate_fine_mode_aerosol_optical_depth_550nm", - ], - "hydrophilic_aerosol_optical_depth_550nm_bc-om": [ - "hydrophilic_black_carbon_aerosol_optical_depth_550nm", - "hydrophilic_organic_matter_aerosol_optical_depth_550nm", - ], - "hydrophobic_aerosol_optical_depth_550nm_bc-om": [ - "hydrophobic_black_carbon_aerosol_optical_depth_550nm", - "hydrophobic_organic_matter_aerosol_optical_depth_550nm", - ], -} - -singlelevel_slow_var_groups_subset: dict[str, list[str]] = { - "total_aerosol_optical_depth_400-645nm": [ - "total_aerosol_optical_depth_400nm", - "total_aerosol_optical_depth_440nm", - "total_aerosol_optical_depth_500nm", - "total_aerosol_optical_depth_532nm", - "total_aerosol_optical_depth_645nm", - ], -} - -# Due to pulling every pressure level, these need to be pulled one at a time -# to avoid exceeding the 10000 item limit per request. 
multilevel_slow_vars: list[str] = [
    # Extinction coefficients
    "aerosol_extinction_coefficient_1064nm",
    "aerosol_extinction_coefficient_355nm",
    "aerosol_extinction_coefficient_532nm",
    # Attenuated backscatter, seen from the ground and from the top of atmosphere
    "attenuated_backscatter_due_to_aerosol_1064nm_from_ground",
    "attenuated_backscatter_due_to_aerosol_1064nm_from_top_of_atmosphere",
    "attenuated_backscatter_due_to_aerosol_355nm_from_ground",
    "attenuated_backscatter_due_to_aerosol_355nm_from_top_of_atmosphere",
    "attenuated_backscatter_due_to_aerosol_532nm_from_ground",
    "attenuated_backscatter_due_to_aerosol_532nm_from_top_of_atmosphere",
]

# Only this subset of the slow multilevel variables is handed to `opts` below.
multilevel_slow_vars_subset: list[str] = ["aerosol_extinction_coefficient_532nm"]

# Level values requested for every multilevel variable (passed as `multilevel_levels`).
multilevel_levels: list[str] = [
    "1", "2", "3", "5", "7",
    "10", "20", "30", "50", "70",
    "100", "150", "200", "250", "300",
    "400", "500", "600", "700", "800",
    "850", "900", "925", "950", "1000",
]


def _one_group_per_variable(names: list[str]) -> dict[str, list[str]]:
    """Wrap every variable name in its own single-element group, keyed by that name."""
    return {name: [name] for name in names}


# Variable selections are built first so the options object below reads as a flat summary.
_multilevel_selection = VariableSelection(
    slow=_one_group_per_variable(multilevel_slow_vars_subset),
    fast=_one_group_per_variable(multilevel_fast_vars),
    hours=multilevel_hours,
)
_singlelevel_selection = VariableSelection(
    # The single-level slow variables are already grouped, so they are passed through as-is.
    slow=singlelevel_slow_var_groups_subset,
    fast=_one_group_per_variable(singlelevel_fast_vars),
    hours=singlelevel_hours,
)

opts: MakeDefinitionsOptions = MakeDefinitionsOptions(
    area="global",
    file_format="grib",
    multilevel_vars=_multilevel_selection,
    multilevel_levels=multilevel_levels,
    singlelevel_vars=_singlelevel_selection,
    partitions=cams_global_partitions,
    client=Client(),
)

# Build the definitions from the options and expose the raw asset at module level.
defs: MakeDefinitionsOutputs = make_definitions(opts=opts)

cams_global_raw_archive = defs.raw_asset
# FTP credentials come from the environment; "not-set" is used when a variable is absent.
fetcher: FetcherInterface = ceda.Client(
    ftpUsername=os.environ.get("CEDA_FTP_USER", "not-set"),
    ftpPassword=os.environ.get("CEDA_FTP_PASS", "not-set"),
)

# CEDA only available 8 days back (8 partitions per day)
_CEDA_DAYS_BACK = 8
_CEDA_PARTITIONS_PER_DAY = 8

partitions: dg.TimeWindowPartitionsDefinition = dg.TimeWindowPartitionsDefinition(
    start="2017-01-01T00:00",
    cron_schedule="0 0/3 * * *",  # Every 3 hours
    fmt="%Y-%m-%dT%H:%M",
    end_offset=-(_CEDA_DAYS_BACK * _CEDA_PARTITIONS_PER_DAY),
)

_ceda_uk_opts = MakeDefinitionsOptions(
    area="uk",
    source="ceda",
    fetcher=fetcher,
    partitions=partitions,
)

defs: MakeDefinitionsOutputs = make_definitions(opts=_ceda_uk_opts)

# Module-level asset names exposed by this module.
ceda_uk_raw_archive = defs.raw_asset
ceda_uk_zarr_archive = defs.zarr_asset
# MARS client for the "india" area, constructed with hours=55.
fetcher: FetcherInterface = mars.MARSClient(area="india", hours=55)

# ECMWF only available 4 days back (2 partitions per day)
_INDIA_DAYS_BACK = 4
_INDIA_PARTITIONS_PER_DAY = 2

partitions: dg.TimeWindowPartitionsDefinition = dg.TimeWindowPartitionsDefinition(
    start="2020-01-01T00:00",
    cron_schedule="0 0,12 * * *",  # 00:00 and 12:00
    fmt="%Y-%m-%dT%H:%M",
    end_offset=-(_INDIA_DAYS_BACK * _INDIA_PARTITIONS_PER_DAY),
)

_ecmwf_india_opts = MakeDefinitionsOptions(
    area="india",
    source="ecmwf",
    partitions=partitions,
    fetcher=fetcher,
)

defs: MakeDefinitionsOutputs = make_definitions(opts=_ecmwf_india_opts)

# Module-level asset names exposed by this module.
ecmwf_india_raw_archive = defs.raw_asset
ecmwf_india_zarr_archive = defs.zarr_asset
# MARS client for the "malta" area, constructed with hours=84.
fetcher: FetcherInterface = mars.MARSClient(area="malta", hours=84)

# ECMWF only available 3 days back (2 partitions per day)
_MALTA_DAYS_BACK = 3
_MALTA_PARTITIONS_PER_DAY = 2

partitions: dg.TimeWindowPartitionsDefinition = dg.TimeWindowPartitionsDefinition(
    start="2017-01-01T00:00",
    cron_schedule="0 0,12 * * *",  # 00:00 and 12:00
    fmt="%Y-%m-%dT%H:%M",
    end_offset=-(_MALTA_DAYS_BACK * _MALTA_PARTITIONS_PER_DAY),
)

_ecmwf_malta_opts = MakeDefinitionsOptions(
    area="malta",
    source="ecmwf",
    partitions=partitions,
    fetcher=fetcher,
)

defs: MakeDefinitionsOutputs = make_definitions(opts=_ecmwf_malta_opts)

# Module-level asset names exposed by this module.
ecmwf_malta_raw_archive = defs.raw_asset
ecmwf_malta_zarr_archive = defs.zarr_asset
# MARS client for the "uk" area, constructed with hours=84.
fetcher: FetcherInterface = mars.MARSClient(area="uk", hours=84)

# ECMWF only available 3 days back (2 partitions per day)
_UK_DAYS_BACK = 3
_UK_PARTITIONS_PER_DAY = 2

partitions: dg.TimeWindowPartitionsDefinition = dg.TimeWindowPartitionsDefinition(
    start="2017-01-01T00:00",
    cron_schedule="0 0,12 * * *",  # 00:00 and 12:00
    fmt="%Y-%m-%dT%H:%M",
    end_offset=-(_UK_DAYS_BACK * _UK_PARTITIONS_PER_DAY),
)

_ecmwf_uk_opts = MakeDefinitionsOptions(
    area="uk",
    source="ecmwf",
    partitions=partitions,
    fetcher=fetcher,
)

defs: MakeDefinitionsOutputs = make_definitions(opts=_ecmwf_uk_opts)

# Module-level asset names exposed by this module.
ecmwf_uk_raw_archive = defs.raw_asset
ecmwf_uk_zarr_archive = defs.zarr_asset
@dg.asset(
    name="zarr_daily_archive",
    description="Daily archive of GFS global NWP data",
    key_prefix=["nwp", "gfs", "global"],
    automation_condition=dg.AutomationCondition.eager(),
    partitions_def=dg.DailyPartitionsDefinition(
        start_date="2015-01-15",
        end_offset=-2,
    ),
    metadata={
        "archive_folder": dg.MetadataValue.text(f"{ZARR_FOLDER}/nwp/gfs/global"),
        "area": dg.MetadataValue.text("global"),
        "source": dg.MetadataValue.text("gfs"),
    },
)
def zarr_archive(
    context: dg.AssetExecutionContext,
) -> dg.Output:
    """Download and combine one daily partition of GFS data into the zarr archive.

    Args:
        context: Dagster execution context; the start of its partition time
            window is passed to the downloader as the date to fetch.

    Returns:
        A ``dg.Output`` whose value is whatever ``download_combine_gfs.run``
        returned, with metadata describing the archive folder, area, source
        and the elapsed wall-clock time in whole minutes.
    """
    # These must be assignments: a bare `start: <expr>` only annotates the name
    # and leaves it unbound, which made the elapsed-time computation below fail.
    start: dt.datetime = dt.datetime.now(tz=dt.UTC)
    outfile: str = download_combine_gfs.run(
        path=ZARR_FOLDER + "/nwp/gfs/global",
        date=context.partition_time_window.start,
        config=download_combine_gfs.DEFAULT_CONFIG,
    )
    end: dt.datetime = dt.datetime.now(tz=dt.UTC)

    # total_seconds() is a float, so floor division still yields a float;
    # convert explicitly before handing it to the integer metadata value.
    elapsed_minutes: int = int((end - start).total_seconds() // 60)

    return dg.Output(
        value=outfile,
        metadata={
            "archive_folder": dg.MetadataValue.text(f"{ZARR_FOLDER}/nwp/gfs/global"),
            "area": dg.MetadataValue.text("global"),
            "source": dg.MetadataValue.text("gfs"),
            "partition_elapsed_time_minutes": dg.MetadataValue.int(elapsed_minutes),
        },
    )
class ValidateExistingFilesConfig(dg.Config):
    """Config schema for the validate_existing_files job."""

    # Root folder under which the archive lives.
    base_path: str
    # Data provider; must be one of the sources accepted by `check`.
    source: str
    # Geographical area; must be one of the areas accepted by `check`.
    area: str
    # Final component of the asset key that observations are logged against.
    asset_name: str

    def check(self) -> None:
        """Validate the configured area and source, and that the archive folder exists.

        Raises:
            ValueError: If the area or the source is not in the accepted set.
            FileNotFoundError: If the archive folder does not exist.
        """
        known_areas = ("global", "eu", "uk", "nw_india", "malta")
        if self.area not in known_areas:
            raise ValueError(f"Area {self.area} not recognised.")

        known_sources = ("ecmwf", "icon", "ceda", "cams")
        if self.source not in known_sources:
            raise ValueError(f"Source {self.source} not recognised.")

        folder = self.archive_path()
        if not folder.exists():
            raise FileNotFoundError(
                f"Could not find archive folder {folder.as_posix()}",
            )

    def archive_path(self) -> pathlib.Path:
        """Return the archive folder: {base_path}/nwp/{source}/{area}."""
        return pathlib.Path(self.base_path, "nwp", self.source, self.area)
@dg.op
def validate_existing_zarr_files(
    context: dg.OpExecutionContext,
    config: ValidateExistingFilesConfig,
) -> None:
    """Scan the zarr archive folder and log an observation for every init-time file.

    Files are matched with the glob ``*.zarr.zip`` and their init time is parsed
    from the filename using ``%Y%m%dT%H%M.zarr.zip``; files whose names do not
    parse are skipped. One partitioned ``AssetObservation`` is logged per file,
    followed by one unpartitioned observation summarising the whole archive.

    Args:
        context: Dagster op context used to log the observations.
        config: Describes which archive to scan; ``config.check()`` is called
            first and may raise ``ValueError`` or ``FileNotFoundError``.
    """
    config.check()

    asset_key = ["nwp", config.source, config.area, config.asset_name]
    total_archive_size_bytes: int = 0
    for file in config.archive_path().glob("*.zarr.zip"):
        # Try to parse the init time from the filename
        try:
            it = dt.datetime.strptime(
                file.name,
                "%Y%m%dT%H%M.zarr.zip",
            ).replace(tzinfo=dt.UTC)
        except ValueError:
            continue

        total_archive_size_bytes += file.stat().st_size

        # Only the textual summary is needed, so close the dataset straight
        # away rather than keeping one open handle per archive file.
        with xr.open_zarr("zip::" + file.as_posix()) as ds:
            dataset_summary: str = str(ds)

        # Create an AssetObservation for the relevant partition
        partition_key: str = it.strftime("%Y-%m-%d|%H:%M")
        context.log_event(
            dg.AssetObservation(
                asset_key=asset_key,
                partition=partition_key,
                metadata={
                    "inittime": dg.MetadataValue.text(partition_key),
                    "dataset": dg.MetadataValue.md(dataset_summary),
                },
            ),
        )

    # Archive-level summary, logged once after all files have been visited.
    context.log_event(
        dg.AssetObservation(
            asset_key=asset_key,
            metadata={
                "archive_folder": dg.MetadataValue.text(config.archive_path().as_posix()),
                "area": dg.MetadataValue.text(config.area),
                "total_archive_size_gb": dg.MetadataValue.float(total_archive_size_bytes / 1e9),
                "last_scan": dg.MetadataValue.text(dt.datetime.now(tz=dt.UTC).isoformat()),
            },
        ),
    )
def gen_run_config(asset_key: dg.AssetKey) -> dg.RunConfig | None:
    """Generate a Run config for the validate_existing_files job.

    The last element of the asset key selects the op to configure and the
    base folder to scan: ``raw_archive`` keys use ``RAW_FOLDER`` and
    ``zarr_archive`` keys use ``ZARR_FOLDER``, matching the defaults of the
    two scan jobs defined in this module.

    Args:
        asset_key: Asset key whose path elements at index 1, 2 and 3 are used
            as the source, area and asset name respectively.

    Returns:
        A run config for the matching validation op, or ``None`` when the
        asset key does not end in ``raw_archive`` or ``zarr_archive``.
    """
    asset_type: str = asset_key.path[-1]
    if asset_type == "raw_archive":
        op_name, base_path = validate_existing_raw_files.__name__, RAW_FOLDER
    elif asset_type == "zarr_archive":
        # Zarr archives live under the zarr folder; pointing the scan at the
        # raw folder made it look in the wrong place.
        op_name, base_path = validate_existing_zarr_files.__name__, ZARR_FOLDER
    else:
        # NOTE(review): other asset types previously fell through and returned
        # None implicitly; kept as-is so existing callers are unaffected.
        return None

    vc: ValidateExistingFilesConfig = ValidateExistingFilesConfig(
        base_path=base_path,
        source=asset_key.path[1],
        area=asset_key.path[2],
        asset_name=asset_key.path[3],
    )
    return dg.RunConfig(ops={op_name: vc})
# Coordinate pairs for the wind sites queried by query_meteomatics_wind_api.
# Do not reorder: the comment above these coordinate lists states that their
# order is used to determine the station_id.
# NOTE(review): pairs appear to be (latitude, longitude) — confirm.
wind_coords = [
    (27.035, 70.515),
    (27.188, 70.661),
    (27.085, 70.638),
    (27.055, 70.72),
    (27.186, 70.81),
    (27.138, 71.024),
    (26.97, 70.917),
    (26.898, 70.996),
    (26.806, 70.732),
    (26.706, 70.81),
    (26.698, 70.875),
    (26.708, 70.982),
    (26.679, 71.027),
    (26.8, 71.128),
    (26.704, 71.127),
    (26.5, 71.285),
    (26.566, 71.369),
    (26.679, 71.452),
    (26.201, 71.295),
    (26.501, 72.512),
    (26.463, 72.836),
    (26.718, 73.049),
    (26.63, 73.581),
    (24.142, 74.731),
    (23.956, 74.625),
    (23.657, 74.772),
    # Adani
    # NOTE(review): the second value 1.220 is far outside the 69-76 range of
    # every other entry in this list; possibly a typo for 71.220 — confirm
    # against the site list before relying on this station.
    (26.479, 1.220),
    (23.098, 75.255),
    (23.254, 69.252),
]
@dg.graph_asset(
    key=["nwp", "meteomatics", "nw_india", "wind_archive"],
    partitions_def=dg.TimeWindowPartitionsDefinition(
        fmt="%Y-%m",
        start="2019-03",
        cron_schedule="0 0 1 * *",  # Once a month
    ),
    metadata={
        "path": dg.MetadataValue.path(f"{BASE_PATH}/nwp/meteomatics/nw_india/wind_archive"),
    },
)
def meteomatics_wind_archive() -> dg.Output[pathlib.Path]:
    """Meteomatics wind archive asset.

    Chains three ops for each monthly partition: query the Meteomatics API
    for wind data, map the resulting DataFrame to an xarray Dataset, and
    store that Dataset as a zipped zarr.

    Returns:
        The output of ``store_ds``, which wraps the path of the written
        archive; the annotation matches ``store_ds`` and the solar archive
        asset.
    """
    df = query_meteomatics_wind_api()
    ds = map_df_ds(df)
    return store_ds(ds)
@dg.asset(
    name="zarr_archive",
    # Fragments are joined with no separator, so each one carries its own
    # trailing space (the first fragment was previously missing it).
    description="".join((
        "Zarr archive of satellite data from EUMETSAT's IODC satellite. ",
        "Sourced via EUMDAC from EUMETSAT ",
        "(https://navigator.eumetsat.int/product/EO:EUM:DAT:MSG:OCA-IODC). ",
        "This asset is updated monthly, and surfaced as a Zarr Directory Store ",
        "for each month. It is downloaded using the sat container ",
        "(https://github.com/openclimatefix/dagster-dags/pkgs/container/sat-etl).",
    )),
    key_prefix=["sat", "eumetsat", "iodc"],
    metadata={
        "archive_folder": dg.MetadataValue.text(f"{ZARR_FOLDER}/sat/eumetsat/india"),
        "area": dg.MetadataValue.text("india"),
        "source": dg.MetadataValue.text("eumetsat"),
        "expected_runtime": dg.MetadataValue.text("TBD"),
    },
    compute_kind="subprocess",
    automation_condition=dg.AutomationCondition.eager(),
    tags={
        # "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours
        "dagster/priority": "1",
        "dagster/concurrency_key": "eumetsat",
    },
    partitions_def=dg.MonthlyPartitionsDefinition(
        start_date="2019-01-01",
        end_offset=-3,
    ),
)
def iodc_monthly(
    context: dg.AssetExecutionContext,
    pipes_subprocess_client: dg.PipesSubprocessClient,
) -> Any:
    """Run the satellite download script for one monthly partition.

    Args:
        context: Dagster execution context; the start of its partition time
            window selects the month passed to the script as ``--month``.
        pipes_subprocess_client: Pipes client used to launch the script.

    Returns:
        The materialize result reported back by the subprocess.
    """
    it: dt.datetime = context.partition_time_window.start
    return pipes_subprocess_client.run(
        command=[
            "/home/dagster/mambaforge/envs/sat-etl/bin/python",
            "/home/dagster/dags/containers/sat/download_process_sat.py",
            "--month",
            it.strftime("%Y-%m"),
            "--path",
            # NOTE(review): this output path is hard-coded, while the
            # archive_folder metadata above is derived from ZARR_FOLDER —
            # confirm the two are meant to agree.
            "/mnt/storage_a/sat/eumetsat/india",
            "--rm",
        ],
        context=context,
    ).get_materialize_result()
datasets to the local filesystem. - - Datasets are stored in zipped zarr format. It is expected to be used with an asset - continaing a MultiPartitionDefinition with two keys: "date" and "inittime" from which - the full initialisation time of the dataset can be inferred. - - The dataset is stored in a folder structure using the assets key prefixes and the - base path. The full path to the dataset is: - - {base_path}/{slash_joined_asset_key_prefixes}/{date}{inittime}.zarr.zip - """ - - base_path: str = "" - filename_formatstr: str = "%Y%m%dT%H%M.zarr.zip" - - def _get_path(self, context: dg.InputContext | dg.OutputContext) -> pathlib.Path: - """Get the path to the zarr file.""" - if context.has_partition_key: - if isinstance(context.asset_key.path, str) or len(context.asset_key.path) <= 1: - raise ValueError( - "AssetKey is not a list of strings with at least two elements." - "Ensure the you have setkey_prefix on the asset.", - ) - - asset_prefixes: str = "/".join(context.asset_key.path[:-1]) - it = context.asset_partitions_time_window.start - return ( - pathlib.Path(self.base_path) / asset_prefixes / it.strftime(self.filename_formatstr) - ) - else: - # Not yet implemented - raise NotImplementedError("No partition key found") - - def handle_output(self, context: dg.OutputContext, obj: xr.Dataset) -> None: - """Save an xarray dataset to a zarr file.""" - dst = self._get_path(context) - if dst.exists(): - dst.unlink() - dst.parent.mkdir(parents=True, exist_ok=True) - dataVar: str = next(iter(obj.data_vars.keys())) - with zarr.ZipStore(path=dst.as_posix(), mode="w") as store: - obj.to_zarr( - store=store, - encoding={ - "init_time": {"units": "nanoseconds since 1970-01-01"}, - dataVar: { - "compressor": Blosc2(cname="zstd", clevel=5), - }, - }, - ) - context.add_output_metadata( - { - "path": dg.MetadataValue.path(dst.as_posix()), - "size": dg.MetadataValue.int(dst.stat().st_size), - "modified": dg.MetadataValue.text( - dt.datetime.fromtimestamp(dst.stat().st_mtime, 
tz=dt.UTC).strftime( - "%Y-%m-%d %H:%M:%S", - ), - ), - }, - ) - - def load_input(self, context: dg.InputContext) -> xr.Dataset: - """Load an xarray dataset from a zarr file.""" - src = self._get_path(context) - return xr.open_zarr(f"zip::{src.as_posix()}") diff --git a/pyproject.toml b/pyproject.toml index 47e386d..ef8b34c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # --- PROJECT CONFIGURATION --- # [build-system] -requires = ["setuptools>=69", "wheel"] +requires = ["setuptools>=69", "wheel", "setuptools-git-versioning>=2.0,<3"] build-backend = "setuptools.build_meta" # Metadata (see https://peps.python.org/pep-0621/) @@ -13,50 +13,47 @@ readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.11.0" license = {text = "MIT License"} authors = [ - { name = "Jacob Bieker", email = "jacob@openclimatefix.org"}, { name = "Sol Cotton", email = "sol@openclimatefix.org"}, + { name = "Jacob Bieker", email = "jacob@openclimatefix.org"}, ] classifiers = ["Programming Language :: Python :: 3"] dependencies = [ "cdsapi >= 0.6.1", - "ecmwf-api-client >= 1.6.3", - "dagit >= 1.8.5", "dagster >= 1.8.5", - "dagster-cloud >= 1.8.5", - "dagster-webserver >= 1.8.5", - "dagster-graphql >= 1.8.5", "dagster-postgres >= 0.24.5", "dagster-docker >= 0.24.5", "dagster-pipes >= 1.8.5", "huggingface-hub >= 0.19.4", - "kbatch >= 0.4.2", - "meteomatics == 2.11.1", "numpy >= 1.26.0", - "nwp-consumer >= 0.5.8", - "ocf-blosc2 >= 0.0.3", - "pathlib >= 1.0.1", + "pandas >= 2.2.3", "pyarrow >= 10.0.1", "requests >= 2.31.0", "requests-toolbelt >= 1.0.0", - "xarray >= 2022.3.0", - "zarr >= 2.13.3", ] -[project.optional-dependencies] +[dependency-groups] dev = [ - "mypy == 1.7.1", - "types-PyYAML", + # Testing + "pytest", + "unittest-xml-reporting", + "dagit", + # Linting + "ruff", + "types-pyyaml", "types-pytz", "types-requests", - "ruff == 0.1.7", - "unittest-xml-reporting == 3.2.0", - "pytest >= 7.4.1", - "python-lsp-server == 1.7.4" + # LSP support + 
"python-lsp-server", + "pylsp-mypy", + "python-lsp-ruff", ] [tool.setuptools.packages.find] exclude = ["*_tests"] +[tool.setuptools-git-versioning] +enabled = true + # Ruff configuration # * See https://beta.ruff.rs/docs/ [tool.ruff] @@ -110,4 +107,5 @@ ignore_missing_imports = true plugins = [ 'numpy.typing.mypy_plugin' ] +explicit_package_bases = true diff --git a/resources/__init__.py b/resources/__init__.py deleted file mode 100644 index 7f04d49..0000000 --- a/resources/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Configurable resources for use across dagster application.""" - -from .meteomatics import MeteomaticsAPIResource - -__all__ = ["MeteomaticsAPIResource"] diff --git a/resources/meteomatics.py b/resources/meteomatics.py deleted file mode 100644 index fe2ded9..0000000 --- a/resources/meteomatics.py +++ /dev/null @@ -1,58 +0,0 @@ -import datetime as dt - -import dagster as dg -import meteomatics.api -import pandas as pd -from pydantic import PrivateAttr - - - -class MeteomaticsAPIResource(dg.ConfigurableResource): - """A resource for interacting with the Meteomatics API.""" - - # Authentication for the API, set via environment - username: str - password: str - - # Subscription limits - _subscription_min_date: dt.datetime = PrivateAttr() - _subscription_max_requestable_parameters = PrivateAttr() - - def setup_for_execution(self, context) -> None: - """Set up the resource according to subscription limits.""" - self._subscription_min_date = dt.datetime(2019, 3, 19, tzinfo=dt.UTC) - self._subscription_max_requestable_parameters = 10 - - def query_api(self, start: dt.datetime, end: dt.datetime, coords: list[tuple[float, float]], params: list[str]) -> pd.DataFrame: - """Query the Meteomatics API for NWP data.""" - - # Ensure subscription limits are respected - # * Split the parameters into groups of max size - groups = [ - params[i : i + self._subscription_max_requestable_parameters] - for i in range(0, len(params), 
self._subscription_max_requestable_parameters) - ] - - dfs: list[pd.DataFrame] = [] - try: - for param_group in groups: - df: pd.DataFrame = meteomatics.api.query_time_series( - coordinate_list=coords, - startdate=max(start, self._subscription_min_date), - enddate=max(end, self._subscription_min_date), - interval=dt.timedelta(minutes=15), - parameters=param_group, - username=self.username, - password=self.password, - model="ecmwf-ifs", - ) - dfs.append(df) - except Exception as e: - raise dg.Failure( - description=f"Failed to query the Meteomatics API: {e}", - ) from e - - if len(dfs) > 1: - return dfs[0].join(dfs[1:]) - else: - return dfs[0] diff --git a/src/dagster_dags/__init__.py b/src/dagster_dags/__init__.py new file mode 100644 index 0000000..3b9c2b5 --- /dev/null +++ b/src/dagster_dags/__init__.py @@ -0,0 +1,4 @@ +from .definitions import defs + +__all__ = ['defs'] + diff --git a/cloud_archives/nwp/__init__.py b/src/dagster_dags/assets/__init__.py similarity index 100% rename from cloud_archives/nwp/__init__.py rename to src/dagster_dags/assets/__init__.py diff --git a/src/dagster_dags/assets/air/__init__.py b/src/dagster_dags/assets/air/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/dagster_dags/assets/air/__init__.py @@ -0,0 +1 @@ + diff --git a/src/dagster_dags/assets/air/cams_eu.py b/src/dagster_dags/assets/air/cams_eu.py new file mode 100644 index 0000000..a01b2f0 --- /dev/null +++ b/src/dagster_dags/assets/air/cams_eu.py @@ -0,0 +1,142 @@ +"""NetCDF archive of Atmospheric Quality data from CAMS, covering Europe. + +CAMS is Copernicus' Atmospheric Monitoring Service, which provides +forecasts of atmospheric quality. + +Sourced via CDS API from Copernicus ADS (https://ads.atmosphere.copernicus.eu). +This asset is updated weekly, and surfaced as a zipped NetCDF file for each week +per variable. It is downloaded using the cdsapi Python package +(https://github.com/ecmwf/cdsapi). 
+""" + +import datetime as dt +import os +import pathlib +from typing import Any + +import cdsapi +import dagster as dg + +ARCHIVE_FOLDER = "/var/dagster-storage/air/cams-europe" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/air/cams-europe" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.WeeklyPartitionsDefinition( + start_date="2020-02-08", + end_offset=-2, +) + +@dg.asset( + name="cams-europe", + description=__doc__, + key_prefix=["air"], + metadata={ + "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), + "area": dg.MetadataValue.text("europe"), + "source": dg.MetadataValue.text("copernicus-ads"), + "model": dg.MetadataValue.text("cams"), + "format": dg.MetadataValue.text("netcdf"), + "expected_runtime": dg.MetadataValue.text("6 hours"), + }, + compute_kind="python", + automation_condition=dg.AutomationCondition.on_cron( + cron_schedule=partitions_def.get_cron_schedule( + hour_of_day=7, + ), + ), + tags={ + "dagster/max_runtime": str(60 * 60 * 24 * 4), # Should take about 2 days + "dagster/priority": "1", + "dagster/concurrency_key": "copernicus-ads", + }, + partitions_def=partitions_def, +) +def cams_eu_raw_asset(context: dg.AssetExecutionContext) -> dg.Output[list[pathlib.Path]]: + """Downloads CAMS Europe air quality forecast data from Copernicus ADS.""" + it_start: dt.datetime = context.partition_time_window.start + it_end: dt.datetime = context.partition_time_window.end + execution_start = dt.datetime.now(tz=dt.UTC) + stored_files: list[pathlib.Path] = [] + + variables: list[str] = [ + "alder_pollen", + "ammonia", + "birch_pollen", + "carbon_monoxide", + "dust", + "grass_pollen", + "nitrogen_dioxide", + "nitrogen_monoxide", + "non_methane_vocs", + "olive_pollen", + "ozone", + "particulate_matter_10um", + "particulate_matter_2.5um", + "peroxyacyl_nitrates", + "pm10_wildfires", + "ragweed_pollen", + "secondary_inorganic_aerosol", + "sulphur_dioxide", + ] + + for var in variables: + dst: pathlib.Path = 
pathlib.Path(ARCHIVE_FOLDER) \ + / "raw" / f"{it_start:%Y%m%d}-{it_end:%Y%m%d}_{var}.nc.zip" + dst.parent.mkdir(parents=True, exist_ok=True) + + if dst.exists(): + context.log.info("File already exists, skipping download", extra={ + "file": dst.as_posix(), + }) + stored_files.append(dst) + continue + + request: dict[str, Any] = { + "date": [f"{it_start:%Y-%m-%d}/{it_end:%Y-%m-%d}"], + "type": ["forecast"], + "time": ["00:00"], + "model": ["ensemble"], + "leadtime_hour": [str(x) for x in range(0, 97)], + "data_format": ["netcdf_zip"], + "level": ["0", "50", "250", "500", "1000", "3000", "5000"], + "variable": [var], + } + + context.log.info( + "Reqesting file from Copernicus ADS via CDS API", + extra={ + "request": request, + "target": dst.as_posix(), + }, + ) + client = cdsapi.Client() + client.retrieve( + name="cams-europe-air-quality-forecast", + request=request, + target=dst.as_posix(), + ) + context.log.info( + f"Downloaded file {dst.as_posix()} from Copernicus ADS via CDS API", + extra={ + "file": dst.as_posix(), + "size": dst.stat().st_size, + }, + ) + stored_files.append(dst) + + if len(stored_files) == 0: + raise Exception( + "No remote files found for this partition key. 
See logs for more details.", + ) + + elapsed_time: dt.timedelta = dt.datetime.now(tz=dt.UTC) - execution_start + + return dg.Output( + value=stored_files, + metadata={ + "files": dg.MetadataValue.text(", ".join([f.as_posix() for f in stored_files])), + "partition_size": dg.MetadataValue.int(sum([f.stat().st_size for f in stored_files])), + "elapsed_time_hours": dg.MetadataValue.float(elapsed_time / dt.timedelta(hours=1)), + }, + ) + diff --git a/tests/cloud_archives/__init__.py b/src/dagster_dags/assets/nwp/__init__.py similarity index 100% rename from tests/cloud_archives/__init__.py rename to src/dagster_dags/assets/nwp/__init__.py diff --git a/local_archives/nwp/ceda/ceda_global.py b/src/dagster_dags/assets/nwp/ceda_mo_um_global.py similarity index 53% rename from local_archives/nwp/ceda/ceda_global.py rename to src/dagster_dags/assets/nwp/ceda_mo_um_global.py index c541418..752f4b2 100644 --- a/local_archives/nwp/ceda/ceda_global.py +++ b/src/dagster_dags/assets/nwp/ceda_mo_um_global.py @@ -1,4 +1,4 @@ -"""Zarr archive of NWP data from the Met Office's Global model. +"""Zarr archive of NWP data from the Met Office's Unified Model in the global configuration. The MetOffice runs it's Unified Model (UM) in two configurations: Global, and UK. This asset contains data from the global configuration covering the whole globe. @@ -6,66 +6,69 @@ Sourced via FTP from CEDA (https://catalogue.ceda.ac.uk/uuid/86df725b793b4b4cb0ca0646686bd783). This asset is updated monthly, and surfaced as a Zarr Directory Store for each month. It is downloaded using the nwp-consumer docker image -(https://github.com/openclimatefix/nwp-consumer) +(https://github.com/openclimatefix/nwp-consumer). 
""" -import datetime as dt import os -from typing import Any +from typing import TYPE_CHECKING, Any import dagster as dg from dagster_docker import PipesDockerClient -from constants import LOCATIONS_BY_ENVIRONMENT +if TYPE_CHECKING: + import datetime as dt -env = os.getenv("ENVIRONMENT", "local") -ZARR_FOLDER = LOCATIONS_BY_ENVIRONMENT[env].NWP_ZARR_FOLDER + +ARCHIVE_FOLDER = "/var/dagster-storage/nwp/ceda-mo-um-global" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/nwp/ceda-mo-um-global" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.MonthlyPartitionsDefinition( + start_date="2019-01-01", + end_offset=-3, +) @dg.asset( - name="zarr_archive", + name="ceda-mo-um-global", description=__doc__, - key_prefix=["nwp", "ceda", "global"], metadata={ - "archive_folder": dg.MetadataValue.text(f"{ZARR_FOLDER}/nwp/ceda/global"), + "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), "area": dg.MetadataValue.text("global"), "source": dg.MetadataValue.text("ceda"), + "model": dg.MetadataValue.text("mo-um"), "expected_runtime": dg.MetadataValue.text("6 hours"), }, compute_kind="docker", - automation_condition=dg.AutomationCondition.eager(), + automation_condition=dg.AutomationCondition.on_cron( + cron_schedule=partitions_def.get_cron_schedule( + hour_of_day=5, + ), + ), tags={ "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours "dagster/priority": "1", - "dagster/concurrency_key": "ceda-ftp-consumer", + "dagster/concurrency_key": "nwp-consumer", }, - partitions_def=dg.MonthlyPartitionsDefinition( - start_date="2019-01-01", - end_offset=-3, - ), ) -def ceda_global( +def ceda_mo_um_global_asset( context: dg.AssetExecutionContext, pipes_docker_client: PipesDockerClient, -) -> Any: - image: str = "ghcr.io/openclimatefix/nwp-consumer:devsjc-major-refactor" +) -> Any: # noqa: ANN401 + """Dagster asset for MO Unified Model global NWP data from CEDA.""" it: dt.datetime = context.partition_time_window.start return 
pipes_docker_client.run( - image=image, - command=[ - "archive", - "-y", - str(it.year), - "-m", - str(it.month), - ], + image="ghcr.io/openclimatefix/nwp-consumer:1.0.12", + command=["archive", "-y", str(it.year), "-m", str(it.month)], env={ "NWP_CONSUMER_MODEL_REPOSITORY": "ceda-metoffice-global", "NWP_CONSUMER_NOTIFICATION_REPOSITORY": "dagster-pipes", "CEDA_FTP_USER": os.environ["CEDA_FTP_USER"], "CEDA_FTP_PASS": os.environ["CEDA_FTP_PASS"], + "CONCURRENCY": "false", }, container_kwargs={ - "volumes": [f"{ZARR_FOLDER}/nwp/ceda/global:/work"], + "volumes": [f"{ARCHIVE_FOLDER}:/work"], }, context=context, ).get_results() + diff --git a/local_archives/nwp/ecmwf/ecmwf_ens_stat_india.py b/src/dagster_dags/assets/nwp/ecmwf_ens_stat_india.py similarity index 54% rename from local_archives/nwp/ecmwf/ecmwf_ens_stat_india.py rename to src/dagster_dags/assets/nwp/ecmwf_ens_stat_india.py index b600ac4..d5701bc 100644 --- a/local_archives/nwp/ecmwf/ecmwf_ens_stat_india.py +++ b/src/dagster_dags/assets/nwp/ecmwf_ens_stat_india.py @@ -1,6 +1,6 @@ -"""Zarr archive of Summary NWP data from ECMWF's EPS. +"""Zarr archive of Summary NWP data from ECMWF's ENS, covering India. -EPS is the ECMWF Ensemble Prediction System, +ENS (sometimes EPS) is the ECMWF Ensemble Prediction System, which provides 50 perturbed forecasts of upcoming atmospheric conditions. This asset contains summary statistics of this data (mean, standard deviation) for India. @@ -10,66 +10,69 @@ (https://github.com/openclimatefix/nwp-consumer). 
""" -import datetime as dt import os -from typing import Any +from typing import TYPE_CHECKING, Any import dagster as dg from dagster_docker import PipesDockerClient -from constants import LOCATIONS_BY_ENVIRONMENT +if TYPE_CHECKING: + import datetime as dt -env = os.getenv("ENVIRONMENT", "local") -ZARR_FOLDER = LOCATIONS_BY_ENVIRONMENT[env].NWP_ZARR_FOLDER -ARCHIVE_FOLDER = f"{ZARR_FOLDER}/nwp/ecmwf-eps/india-stat" +ARCHIVE_FOLDER = "/var/dagster-storage/nwp/ecmwf-ens-stat-india" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/nwp/ecmwf-ens-stat-india" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.MonthlyPartitionsDefinition( + start_date="2020-01-01", + end_offset=-3, +) @dg.asset( - name="zarr_archive", + name="ecmwf-ens-stat-india", description=__doc__, - key_prefix=["nwp", "ecmwf-eps", "india-stat"], metadata={ "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), - "area": dg.MetadataValue.text("global"), + "area": dg.MetadataValue.text("india"), "source": dg.MetadataValue.text("ecmwf-mars"), + "model": dg.MetadataValue.text("ens-stat"), "expected_runtime": dg.MetadataValue.text("6 hours"), }, compute_kind="docker", - automation_condition=dg.AutomationCondition.eager(), + automation_condition=dg.AutomationCondition.on_cron( + cron_schedule=partitions_def.get_cron_schedule( + hour_of_day=6, + day_of_week=1, + ), + ), tags={ "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours "dagster/priority": "1", - "dagster/concurrency_key": "ecmwf-mars-consumer", + "dagster/concurrency_key": "nwp-consumer", }, - partitions_def=dg.MonthlyPartitionsDefinition( - start_date="2020-01-01", - end_offset=-3, - ), + partitions_def=partitions_def, ) -def ecmwf_eps_india_stat( +def ecmwf_ens_stat_india_asset( context: dg.AssetExecutionContext, pipes_docker_client: PipesDockerClient, -) -> Any: - image: str = "ghcr.io/openclimatefix/nwp-consumer:1.0.5" +) -> Any: # noqa: ANN401 + """Dagster asset downloading ECMWF ENS 
data for India.""" it: dt.datetime = context.partition_time_window.start return pipes_docker_client.run( - image=image, - command=[ - "archive", - "-y", - str(it.year), - "-m", - str(it.month), - ], + image="ghcr.io/openclimatefix/nwp-consumer:1.0.12", + command=["archive", "-y", str(it.year), "-m", str(it.month)], env={ "MODEL_REPOSITORY": "ecmwf-mars", + "MODEL": "ens-stat-india", "NOTIFICATION_REPOSITORY": "dagster-pipes", "ECMWF_API_KEY": os.environ["ECMWF_API_KEY"], "ECMWF_API_EMAIL": os.environ["ECMWF_API_EMAIL"], "ECMWF_API_URL": os.environ["ECMWF_API_URL"], - "ECMWF_MARS_AREA": "35/67/6/97", + "CONCURRENCY": "false", }, container_kwargs={ "volumes": [f"{ARCHIVE_FOLDER}:/work"], }, context=context, ).get_results() + diff --git a/src/dagster_dags/assets/nwp/ecmwf_hres_ifs_india.py b/src/dagster_dags/assets/nwp/ecmwf_hres_ifs_india.py new file mode 100644 index 0000000..0938f9a --- /dev/null +++ b/src/dagster_dags/assets/nwp/ecmwf_hres_ifs_india.py @@ -0,0 +1,77 @@ +"""Zarr archive of NWP data from ECMWF's IFS model, covering India. + +IFS is the Integrated Forecasting System, which uses a global numerical model +of earth to produce deterministic forecasts of upcoming atmospheric conditions. + +Sourced via MARS API from ECMWF (https://apps.ecmwf.int/mars-catalogue). +This asset is updated monthly, and surfaced as a Zarr Directory Store for each month. +It is downloaded using the nwp-consumer docker image +(https://github.com/openclimatefix/nwp-consumer). 
+""" + +import os +from typing import TYPE_CHECKING, Any + +import dagster as dg +from dagster_docker import PipesDockerClient + +if TYPE_CHECKING: + import datetime as dt + +ARCHIVE_FOLDER = "/var/dagster-storage/nwp/ecmwf-hres-ifs-india" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/nwp/ecmwf-hres-ifs-india" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.MonthlyPartitionsDefinition( + start_date="2017-01-01", + end_offset=-1, +) + +@dg.asset( + name="ecmwf-hres-ifs-india", + description=__doc__, + metadata={ + "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), + "area": dg.MetadataValue.text("india"), + "source": dg.MetadataValue.text("ecmwf-mars"), + "model": dg.MetadataValue.text("hres-ifs"), + "expected_runtime": dg.MetadataValue.text("6 hours"), + }, + compute_kind="docker", + automation_condition=dg.AutomationCondition.on_cron( + cron_schedule=partitions_def.get_cron_schedule( + hour_of_day=3, + day_of_week=0, + ), + ), + tags={ + "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours + "dagster/priority": "1", + "dagster/concurrency_key": "nwp-consumer", + }, + partitions_def=partitions_def, +) +def ecmwf_hres_ifs_india_asset( + context: dg.AssetExecutionContext, + pipes_docker_client: PipesDockerClient, +) -> Any: # noqa: ANN401 + """Dagster asset for HRES IFS model data covering India from ECMWF.""" + it: dt.datetime = context.partition_time_window.start + return pipes_docker_client.run( + image="ghcr.io/openclimatefix/nwp-consumer:1.0.12", + command=["archive", "-y", str(it.year), "-m", str(it.month)], + env={ + "MODEL_REPOSITORY": "ecmwf-mars", + "MODEL": "hres-ifs-india", + "NOTIFICATION_REPOSITORY": "dagster-pipes", + "ECMWF_API_KEY": os.environ["ECMWF_API_KEY"], + "ECMWF_API_EMAIL": os.environ["ECMWF_API_EMAIL"], + "ECMWF_API_URL": os.environ["ECMWF_API_URL"], + "CONCURRENCY": "false", + }, + container_kwargs={ + "volumes": [f"{ARCHIVE_FOLDER}:/work"], + }, + context=context, + 
).get_results() + diff --git a/src/dagster_dags/assets/nwp/ecmwf_hres_ifs_west_europe.py b/src/dagster_dags/assets/nwp/ecmwf_hres_ifs_west_europe.py new file mode 100644 index 0000000..50e77b9 --- /dev/null +++ b/src/dagster_dags/assets/nwp/ecmwf_hres_ifs_west_europe.py @@ -0,0 +1,77 @@ +"""Zarr archive of NWP data from ECMWF's IFS model, covering Western Europe. + +IFS is the Integrated Forecasting System, which uses a global numerical model +of earth to produce deterministic forecasts of upcoming atmospheric conditions. + +Sourced via MARS API from ECMWF (https://apps.ecmwf.int/mars-catalogue). +This asset is updated monthly, and surfaced as a Zarr Directory Store for each month. +It is downloaded using the nwp-consumer docker image +(https://github.com/openclimatefix/nwp-consumer). +""" + +import os +from typing import TYPE_CHECKING, Any + +import dagster as dg +from dagster_docker import PipesDockerClient + +if TYPE_CHECKING: + import datetime as dt + +ARCHIVE_FOLDER = "/var/dagster-storage/nwp/ecmwf-hres-ifs-west-europe" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/nwp/ecmwf-hres-ifs-west-europe" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.MonthlyPartitionsDefinition( + start_date="2017-01-01", + end_offset=-1, +) + +@dg.asset( + name="ecmwf-hres-ifs-west-europe", + description=__doc__, + metadata={ + "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), + "area": dg.MetadataValue.text("west-europe"), + "source": dg.MetadataValue.text("ecmwf-mars"), + "model": dg.MetadataValue.text("hres-ifs"), + "expected_runtime": dg.MetadataValue.text("6 hours"), + }, + compute_kind="docker", + automation_condition=dg.AutomationCondition.on_cron( + cron_schedule=partitions_def.get_cron_schedule( + hour_of_day=18, + day_of_week=0, + ), + ), + tags={ + "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours + "dagster/priority": "1", + "dagster/concurrency_key": "nwp-consumer", + }, + 
partitions_def=partitions_def, +) +def ecmwf_hres_ifs_west_europe_asset( + context: dg.AssetExecutionContext, + pipes_docker_client: PipesDockerClient, +) -> Any: # noqa: ANN401 + """Dagster asset for HRES IFS NWP data covering Western Europe from ECMWF.""" + it: dt.datetime = context.partition_time_window.start + return pipes_docker_client.run( + image="ghcr.io/openclimatefix/nwp-consumer:1.0.12", + command=["archive", "-y", str(it.year), "-m", str(it.month)], + env={ + "MODEL_REPOSITORY": "ecmwf-mars", + "MODEL": "hres-ifs-west-europe", + "NOTIFICATION_REPOSITORY": "dagster-pipes", + "ECMWF_API_KEY": os.environ["ECMWF_API_KEY"], + "ECMWF_API_EMAIL": os.environ["ECMWF_API_EMAIL"], + "ECMWF_API_URL": os.environ["ECMWF_API_URL"], + "CONCURRENCY": "false", + }, + container_kwargs={ + "volumes": [f"{ARCHIVE_FOLDER}:/work"], + }, + context=context, + ).get_results() + diff --git a/src/dagster_dags/assets/nwp/noaa-gfs-global.py b/src/dagster_dags/assets/nwp/noaa-gfs-global.py new file mode 100644 index 0000000..11d434c --- /dev/null +++ b/src/dagster_dags/assets/nwp/noaa-gfs-global.py @@ -0,0 +1,73 @@ +"""Zarr archive of NWP data from NCEP's GFS model. + +The National Centers for Environmental Prediction (NCEP) runs the +deterministic Global Forecast System (GFS) model +(https://www.ncei.noaa.gov/products/weather-climate-models/global-forecast). + +Sourced via S3 from NOAA (https://noaa-gfs-bdp-pds.s3.amazonaws.com/index.html). +This asset is updated monthly, and surfaced as a Zarr Directory Store for each month. +It is downloaded using the nwp-consumer docker image +(https://github.com/openclimatefix/nwp-consumer). 
+""" + +import os +from typing import TYPE_CHECKING, Any + +import dagster as dg +from dagster_docker import PipesDockerClient + +if TYPE_CHECKING: + import datetime as dt + +ARCHIVE_FOLDER = "/var/dagster-storage/nwp/ncep-gfs-global" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/nwp/ncep-gfs-global" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.MonthlyPartitionsDefinition( + start_date="2021-01-01", + end_offset=-1, +) + +@dg.asset( + name="ncep-gfs-global", + description=__doc__, + metadata={ + "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), + "area": dg.MetadataValue.text("global"), + "source": dg.MetadataValue.text("noaa-s3"), + "model": dg.MetadataValue.text("ncep-gfs"), + "expected_runtime": dg.MetadataValue.text("6 hours"), + }, + compute_kind="docker", + automation_condition=dg.AutomationCondition.on_cron( + cron_schedule=partitions_def.get_cron_schedule( + hour_of_day=21, + day_of_week=1, + ), + ), + tags={ + "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours + "dagster/priority": "1", + "dagster/concurrency_key": "nwp-consumer", + }, +) +def ncep_gfs_global_asset( + context: dg.AssetExecutionContext, + pipes_docker_client: PipesDockerClient, +) -> Any: # noqa: ANN401 + """Dagster asset for NCEP GFS global forecast model data.""" + it: dt.datetime = context.partition_time_window.start + return pipes_docker_client.run( + image="ghcr.io/openclimatefix/nwp-consumer:1.0.12", + command=["archive", "-y", str(it.year), "-m", str(it.month)], + env={ + "MODEL_REPOSITORY": "gfs", + "NOTIFICATION_REPOSITORY": "dagster-pipes", + "CONCURRENCY": "false", + }, + container_kwargs={ + "volumes": [f"{ARCHIVE_FOLDER}:/work"], + }, + context=context, + ).get_results() + diff --git a/src/dagster_dags/assets/pv/__init__.py b/src/dagster_dags/assets/pv/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/dagster_dags/assets/pv/__init__.py @@ -0,0 +1 @@ + diff --git 
a/src/dagster_dags/assets/pv/passiv/__init__.py b/src/dagster_dags/assets/pv/passiv/__init__.py new file mode 100644 index 0000000..259b57a --- /dev/null +++ b/src/dagster_dags/assets/pv/passiv/__init__.py @@ -0,0 +1,12 @@ +import dagster as dg + +from .passiv_year import pv_passiv_yearly_30min, pv_passiv_yearly_5min +from .passiv_monthly import pv_passiv_monthly_30min, pv_passiv_monthly_5min + +__all__ = [ + "pv_passiv_yearly_30min", + "pv_passiv_yearly_5min", + "pv_passiv_monthly_30min", + "pv_passiv_monthly_5min", +] + diff --git a/src/dagster_dags/assets/pv/passiv/filenames.py b/src/dagster_dags/assets/pv/passiv/filenames.py new file mode 100644 index 0000000..1831813 --- /dev/null +++ b/src/dagster_dags/assets/pv/passiv/filenames.py @@ -0,0 +1,13 @@ +"""Functions for naming files for passiv datasets.""" + +from datetime import datetime + + +def get_monthly_hf_file_name(date: datetime, period: int = 5) -> str: + """Format a datetime as a huggingface-appropriate monthly filename.""" + return f"data/{date.strftime('%Y/%m')}/{date.strftime('%Y%m')}_{period}min.parquet" + + +def get_yearly_hf_file_name(date: datetime, period: int = 5) -> str: + """Format a datetime as a huggingface-appropriate yearly filename.""" + return f"data/{date.strftime('%Y')}/{date.strftime('%Y')}_{period}min.parquet" diff --git a/cloud_archives/pv/passiv/passiv_monthly.py b/src/dagster_dags/assets/pv/passiv/passiv_monthly.py similarity index 58% rename from cloud_archives/pv/passiv/passiv_monthly.py rename to src/dagster_dags/assets/pv/passiv/passiv_monthly.py index 6645b72..75f6eba 100644 --- a/cloud_archives/pv/passiv/passiv_monthly.py +++ b/src/dagster_dags/assets/pv/passiv/passiv_monthly.py @@ -1,22 +1,28 @@ -""" Get passiv daily data and save to Hugging Face""" +"""Get passiv daily data and save to Hugging Face.""" -import datetime -import os, pytz +import datetime as dt import logging +import os +from typing import Literal + +import dagster as dg import pandas as pd +from 
huggingface_hub import HfFileSystem from huggingface_hub.hf_api import HfApi -import dagster as dg -from .ss_rawdata_api import SSRawDataAPI from .filenames import get_monthly_hf_file_name -from huggingface_hub import HfFileSystem +from .ss_rawdata_api import SSRawDataAPI logger = logging.getLogger(__name__) -def get_monthly_passiv_data(start_date: datetime, upload_to_hf: bool = True, overwrite: bool = False, period:int=5): - """ Get monthly passiv data and save to Hugging Face""" - +def get_monthly_passiv_data( + start_date: dt.datetime, + upload_to_hf: bool = True, + overwrite: bool = False, + period: Literal[5, 30] = 5, +) -> None: + """Get monthly passiv data and save to Hugging Face.""" logger.info(f"Getting data for {start_date}") # check if we have data for that day already @@ -24,17 +30,17 @@ def get_monthly_passiv_data(start_date: datetime, upload_to_hf: bool = True, ove if not overwrite: token = os.getenv("HUGGINGFACE_TOKEN") fs = HfFileSystem(token=token) - if fs.exists(f'datasets/openclimatefix/uk_pv/{huggingface_file}'): - print(f"Data already exists for {start_date.date()}") + if fs.exists(f"datasets/openclimatefix/uk_pv/{huggingface_file}"): + logger.info(f"Data already exists for {start_date.date()}") return # set end date - end_date = (start_date + datetime.timedelta(days=31)).replace(day=1) + end_date = (start_date + dt.timedelta(days=31)).replace(day=1) # setup class ss_rawdata_api = SSRawDataAPI( - user_id=os.getenv("SS_USER_ID"), - api_key=os.getenv("SS_API_KEY") + user_id=os.getenv("SS_USER_ID", "unset"), + api_key=os.getenv("SS_API_KEY", "unset"), ) # only get passiv systems @@ -58,7 +64,9 @@ def get_monthly_passiv_data(start_date: datetime, upload_to_hf: bool = True, ove generation_data["datetime_GMT"] = generation_data["datetime_GMT"].dt.tz_localize("UTC") # dont include the last end date - generation_data = generation_data[generation_data.datetime_GMT < end_date.replace(tzinfo=pytz.utc)] + generation_data = generation_data[ + 
generation_data.datetime_GMT < end_date.replace(tzinfo=dt.UTC) + ] # save to parquet file file = f"passiv_5min_{start_date.date()}.parquet" @@ -82,42 +90,30 @@ def get_monthly_passiv_data(start_date: datetime, upload_to_hf: bool = True, ove @dg.asset( - key=["pv", "passiv", "monthly_30min"], + name="passiv_monthly_30min", automation_condition=dg.AutomationCondition.eager(), - partitions_def=dg.TimeWindowPartitionsDefinition( - fmt="%Y-%m", - start="2010-01", - cron_schedule="0 12 1 * *", # 1st day of the month, at 12 oclock + partitions_def=dg.MonthlyPartitionsDefinition( + start_date="2010-01-01", + hour_offset=12, ), ) -def pv_passiv_monthly_30min(context: dg.AssetExecutionContext): - """PV Passiv archive monthlyasset.""" - - partition_date_str = context.partition_key - start_date = datetime.datetime.strptime(partition_date_str, "%Y-%m") - start_date = pytz.utc.localize(start_date) - +def pv_passiv_monthly_30min(context: dg.AssetExecutionContext) -> None: + """PV Passiv archive monthly asset.""" + start_date: dt.datetime = context.partition_time_window.start get_monthly_passiv_data(start_date, period=30) - - @dg.asset( - key=["pv", "passiv", "monthly_5min"], + name="passiv_monthly_5min", automation_condition=dg.AutomationCondition.eager(), - partitions_def=dg.TimeWindowPartitionsDefinition( - fmt="%Y-%m", - start="2018-01", - cron_schedule="0 12 1 * *", # 1st day of the month, at 12 oclock + partitions_def=dg.MonthlyPartitionsDefinition( + start_date="2018-01-01", + hour_offset=12, ), ) -def pv_passiv_monthly_5min(context: dg.AssetExecutionContext): - """PV Passiv archive monthlyasset.""" - - partition_date_str = context.partition_key - start_date = datetime.datetime.strptime(partition_date_str, "%Y-%m") - start_date = pytz.utc.localize(start_date) - +def pv_passiv_monthly_5min(context: dg.AssetExecutionContext) -> None: + """PV Passiv archive monthly asset.""" + start_date: dt.datetime = context.partition_time_window.start get_monthly_passiv_data(start_date, 
period=5) diff --git a/cloud_archives/pv/passiv/passiv_year.py b/src/dagster_dags/assets/pv/passiv/passiv_year.py similarity index 63% rename from cloud_archives/pv/passiv/passiv_year.py rename to src/dagster_dags/assets/pv/passiv/passiv_year.py index 3b65252..d34931a 100644 --- a/cloud_archives/pv/passiv/passiv_year.py +++ b/src/dagster_dags/assets/pv/passiv/passiv_year.py @@ -1,30 +1,38 @@ -""" Get passiv daily data and save to Hugging Face""" +"""Get passiv daily data and save to Hugging Face.""" + +import datetime as dt +import io +import logging +import os +from typing import Literal -import datetime -import io, os, pytz -import pandas as pd -from huggingface_hub.hf_api import HfApi import dagster as dg +import pandas as pd from huggingface_hub import HfFileSystem +from huggingface_hub.hf_api import HfApi from .filenames import get_monthly_hf_file_name, get_yearly_hf_file_name +logger = logging.getLogger(__name__) -def get_yearly_passiv_data(start_date: datetime, upload_to_hf: bool = True, overwrite: bool = False, period:int=5): - """ Get yearly passiv data and save to Hugging Face""" - +def get_yearly_passiv_data( + start_date: dt.datetime, + upload_to_hf: bool = True, + overwrite: bool = False, + period: Literal[5, 30] = 5, + ) -> None: + """Get yearly passiv data and save to Hugging Face.""" # set up HF and check if we have data for that day already huggingface_file = get_yearly_hf_file_name(date=start_date, period=period) token = os.getenv("HUGGINGFACE_TOKEN") fs = HfFileSystem(token=token) - if not overwrite: - if fs.exists(f'datasets/openclimatefix/uk_pv/{huggingface_file}'): - print(f"Data already exists for {start_date.date()}") - return + if not overwrite and fs.exists(f"datasets/openclimatefix/uk_pv/{huggingface_file}"): + logger.info(f"Data already exists for {start_date.date()}") + return # start of the month from datetime start_date = start_date.replace(day=1) - end_date = start_date + datetime.timedelta(days=365) + end_date = start_date + 
dt.timedelta(days=365) data_df = [] date = start_date @@ -34,15 +42,15 @@ def get_yearly_passiv_data(start_date: datetime, upload_to_hf: bool = True, over huggingface_load_file = get_monthly_hf_file_name(date=date, period=period) # load data - print(f"Loading data from {huggingface_load_file}") - with fs.open(f'datasets/openclimatefix/uk_pv/{huggingface_load_file}') as f: + logger.info(f"Loading data from {huggingface_load_file}") + with fs.open(f"datasets/openclimatefix/uk_pv/{huggingface_load_file}") as f: data = f.read() pq_file = io.BytesIO(data) generation_data = pd.read_parquet(pq_file) data_df.append(generation_data) - date = date + datetime.timedelta(days=31) + date = date + dt.timedelta(days=31) date = date.replace(day=1) # join together data @@ -66,7 +74,7 @@ def get_yearly_passiv_data(start_date: datetime, upload_to_hf: bool = True, over @dg.asset( - key=["pv", "passiv", "yearly_5min"], + name="passiv_yearly_5min", automation_condition=dg.AutomationCondition.eager(), partitions_def=dg.TimeWindowPartitionsDefinition( fmt="%Y", @@ -74,18 +82,14 @@ def get_yearly_passiv_data(start_date: datetime, upload_to_hf: bool = True, over cron_schedule="0 12 2 1 *", # 2nd day of January, at 12 oclock, ), ) -def pv_passiv_yearly_5min(context: dg.AssetExecutionContext): +def pv_passiv_yearly_5min(context: dg.AssetExecutionContext) -> None: """PV Passiv archive yearly data.""" - - partition_date_str = context.partition_key - start_date = datetime.datetime.strptime(partition_date_str, "%Y") - start_date = pytz.utc.localize(start_date) - + start_date: dt.datetime = context.partition_time_window.start get_yearly_passiv_data(start_date, period=5) @dg.asset( - key=["pv", "passiv", "yearly_30min"], + name="passiv_yearly_30min", automation_condition=dg.AutomationCondition.eager(), partitions_def=dg.TimeWindowPartitionsDefinition( fmt="%Y", @@ -93,15 +97,8 @@ def pv_passiv_yearly_5min(context: dg.AssetExecutionContext): cron_schedule="0 12 2 1 *", # 2nd day of January, at 12 
oclock, ), ) -def pv_passiv_yearly_30min(context: dg.AssetExecutionContext): +def pv_passiv_yearly_30min(context: dg.AssetExecutionContext) -> None: """PV Passiv archive yearly data.""" - - partition_date_str = context.partition_key - start_date = datetime.datetime.strptime(partition_date_str, "%Y") - start_date = pytz.utc.localize(start_date) - + start_date: dt.datetime = context.partition_time_window.start get_yearly_passiv_data(start_date, period=30) - - - diff --git a/cloud_archives/pv/passiv/ss_rawdata_api.py b/src/dagster_dags/assets/pv/passiv/ss_rawdata_api.py similarity index 59% rename from cloud_archives/pv/passiv/ss_rawdata_api.py rename to src/dagster_dags/assets/pv/passiv/ss_rawdata_api.py index 66f0b8e..c31e547 100644 --- a/cloud_archives/pv/passiv/ss_rawdata_api.py +++ b/src/dagster_dags/assets/pv/passiv/ss_rawdata_api.py @@ -1,11 +1,10 @@ -""" -Download PV generation data via Sheffield Solar's 'rawdata' API. +"""Download PV generation data via Sheffield Solar's 'rawdata' API. 
Copied from https://github.com/SheffieldSolar/SS-RawData-API/blob/main/ss_rawdata_api/ss_rawdata_api.py """ -import datetime +import datetime as dt import logging from copy import copy from functools import cached_property @@ -13,61 +12,74 @@ from itertools import starmap from multiprocessing import Pool from time import sleep -from typing import TypedDict, Literal, Union, Optional +from typing import TYPE_CHECKING, Any, Literal, TypedDict import pandas as pd import requests +if TYPE_CHECKING: + from collections.abc import Collection + class ProxyDict(TypedDict): + """ProxyDict type hint.""" + http: str https: str class SSRawDataAPI: - def __init__(self, user_id: Union[int, str], api_key: str, proxies: Optional[ProxyDict] = None): + """Class to download PV generation data from the Sheffield Solar rawdata API.""" + + base_url: str + max_range: dt.timedelta + proxies: ProxyDict | None + params: dict[str, str] + + def __init__(self, user_id: int | str, api_key: str, proxies: ProxyDict | None = None) -> None: + """Initialise the API object.""" self.base_url = "https://api.pvlive.uk/rawdata/api/v4" # self.base_url = "https://staging.solar.shef.ac.uk/rawdata/api/v4" - self.max_range = datetime.timedelta(days=1) + self.max_range = dt.timedelta(days=1) self.proxies = proxies self.params = {"user_id": str(user_id), "key": api_key} @cached_property - def metadata(self): + def metadata(self) -> pd.DataFrame: """Get system metadata.""" endpoint = "owner_system_params_rounded" metadata = _query_api( - base_url=self.base_url, endpoint=endpoint, params=self.params, proxies=self.proxies + base_url=self.base_url, endpoint=endpoint, params=self.params, proxies=self.proxies, ) return metadata def __download_loop( self, endpoint: str, - start: datetime.datetime, - end: datetime.datetime, + start: dt.datetime, + end: dt.datetime, period: Literal[5, 30], - n_processes: Optional[int] = 10, + n_processes: int | None = 10, ) -> pd.DataFrame: """Loop through a list of parameters and query 
the API.""" request_start = start - inputs = [] + inputs: list[Collection[str] | dict[str, str] | ProxyDict | None] = [] while request_start <= end: request_end = min( - end, request_start + self.max_range - datetime.timedelta(minutes=period) + end, request_start + self.max_range - dt.timedelta(minutes=period), ) params = _compile_params(request_start, request_end, self.params) - inputs.append([self.base_url, endpoint, params, self.proxies]) + inputs.append([self.base_url, endpoint, params, self.proxies]) # type: ignore request_start += self.max_range - if n_processes > 1: + if n_processes is not None and n_processes > 1: pool = Pool(n_processes) - chunks = pool.starmap(_query_api, inputs) + chunks = pool.starmap(_query_api, inputs) # type: ignore else: - chunks = starmap(_query_api, inputs) + chunks = starmap(_query_api, inputs) # type: ignore return pd.concat(chunks) def __download_5min( - self, start: datetime.datetime, end: datetime.datetime, n_processes: Optional[int] = 10 + self, start: dt.datetime, end: dt.datetime, n_processes: int | None = 10, ) -> pd.DataFrame: """Download 5 minutely data.""" endpoint = "reading_integrated_5mins" @@ -76,13 +88,13 @@ def __download_5min( return data def __download_30min( - self, start: datetime.datetime, end: datetime.datetime, n_processes: Optional[int] = 10 + self, start: dt.datetime, end: dt.datetime, n_processes: int | None = 10, ) -> pd.DataFrame: """Download 30 minutely data.""" endpoint = "reading_integrated" - start_date = datetime.datetime.combine(start.date(), datetime.time(0)) - end_date = datetime.datetime.combine( - (end - datetime.timedelta(minutes=30)).date(), datetime.time(0) + start_date = dt.datetime.combine(start.date(), dt.time(0)) + end_date = dt.datetime.combine( + (end - dt.timedelta(minutes=30)).date(), dt.time(0), ) data = self.__download_loop(endpoint, start_date, end_date, 30, n_processes) data["date"] = pd.to_datetime(data.date, utc=True) @@ -97,31 +109,23 @@ def __download_30min( def download( 
self, - start: datetime.datetime, - end: datetime.datetime, + start: dt.datetime, + end: dt.datetime, period: Literal[5, 30], - n_processes: Optional[int] = 10, + n_processes: int | None = 10, ) -> pd.DataFrame: - """ - Download PV data from the SS rawdata API. - - Parameters - ---------- - `start` : datetime - A timezone-aware datetime object. Will be corrected to the END of the half hour in which - *start* falls, since Sheffield Solar use end of interval as convention. - `end` : datetime - A timezone-aware datetime object. Will be corrected to the END of the half hour in which - *end* falls, since Sheffield Solar use end of interval as convention. - `period` : int - Time-resolution to retrieve, either 30 or 5 (minutely). Default is 30. - `n_processes` : int - Number of API queries to make in parallel. Default is 10. - - Returns - ------- - Pandas DataFrame - Contains the columns ss_id, datetime_GMT, generation_Wh. + """Download PV data from the SS rawdata API. + + Args: + start: A timezone-aware datetime object. Will be corrected to the END of the half hour + in which *start* falls, since Sheffield Solar use end of interval as convention. + end: A timezone-aware datetime object. Will be corrected to the END of the half hour + in which *end* falls, since Sheffield Solar use end of interval as convention. + period: Time-resolution to retrieve, either 30 or 5 (minutely). Default is 30. + n_processes: Number of API queries to make in parallel. Default is 10. + + Returns: + Pandas DataFrame containing the columns ss_id, datetime_GMT, generation_Wh. 
""" logging.info( "Downloading %s minutely PV data between %s and %s using %s threads", @@ -139,9 +143,9 @@ def download( return self.__download_5min(start, end, n_processes) -def _validate_start_end(start, end): +def _validate_start_end(start: dt.datetime, end: dt.datetime) -> None: """Check start and end are tz-aware datetime.datetime.""" - type_check = not (isinstance(start, datetime.datetime) and isinstance(end, datetime.datetime)) + type_check = not (isinstance(start, dt.datetime) and isinstance(end, dt.datetime)) tz_check = start.tzinfo is None or end.tzinfo is None if type_check or tz_check: raise TypeError("start and end must be timezone-aware Python datetime objects.") @@ -149,7 +153,7 @@ def _validate_start_end(start, end): raise ValueError("end must be later than start.") -def _validate_inputs(start, end, period): +def _validate_inputs(start: dt.datetime, end: dt.datetime, period: Literal[5, 30]) -> None: """Validate common input parameters.""" _validate_start_end(start, end) periods = ["5", "30"] @@ -157,54 +161,64 @@ def _validate_inputs(start, end, period): raise ValueError("The period parameter must be one of: " f"{', '.join(map(str, periods))}.") -def _nearest_interval(dt, period=30): +def _nearest_interval(t: dt.datetime, period: Literal[5,30]=30) -> dt.datetime: """Round to either the nearest 30 or 5 minute interval.""" - dt_ = copy(dt) - if not (dt.minute % period == 0 and dt.second == 0 and dt.microsecond == 0): - offset = datetime.timedelta( - minutes=dt.minute % period, seconds=dt.second, microseconds=dt.microsecond + t_ = copy(t) + if not (t.minute % period == 0 and t.second == 0 and t.microsecond == 0): + offset = dt.timedelta( + minutes=t.minute % period, seconds=t.second, microseconds=t.microsecond, ) - dt_ = dt - offset + datetime.timedelta(minutes=period) - logging.debug("Timestamp %s corrected to nearest %s mins: %s", dt, period, dt_) - return dt_ + t_ = t - offset + dt.timedelta(minutes=period) + logging.debug("Timestamp %s corrected 
to nearest %s mins: %s", t, period, t_) + return t_ -def _compile_params(start=None, end=None, additional_params={}): +def _compile_params( + start: dt.datetime | None = None, + end: dt.datetime | None = None, + additional_params: dict[str, str] | None = None, + ) -> dict[str, str]: """Compile parameters into a Python dict, formatting where necessary.""" - params = {} + params: dict[str, str] = {} if start is not None: params["start_at"] = _iso8601_ss(start) end = start if (start is not None and end is None) else end if end is not None: params["end_at"] = _iso8601_ss(end) - params.update(additional_params) + if additional_params is not None: + params.update(additional_params) return params -def _iso8601_ss(dt): +def _iso8601_ss(t: dt.datetime) -> str: """Convert TZ-aware datetime to string representation expected by the API.""" - return dt.isoformat().replace("+00:00", "") + return t.isoformat().replace("+00:00", "") -def _iso8601_fn(dt): +def _iso8601_fn(t: dt.datetime) -> str: """Convert TZ-aware datetime to string representation for use in filenames.""" - return dt.strftime("%Y%m%dT%H%M%S") + return t.strftime("%Y%m%dT%H%M%S") -def _query_api(base_url, endpoint, params, proxies): +def _query_api( + base_url: str, + endpoint: str, + params: dict[str, str], + proxies: ProxyDict | None = None, + ) -> pd.DataFrame: """Query the API with some REST parameters.""" url = _build_url(base_url, endpoint, params) return _fetch_url(url, proxies) -def _build_url(base_url, endpoint, params): +def _build_url(base_url: str, endpoint: str, params: dict[str, Any]) -> str: """Construct the appropriate URL for a given set of parameters.""" url = f"{base_url}/{endpoint}" - url += "?" + "&".join(["{}={}".format(k, params[k]) for k in params]) + url += "?" 
+ "&".join([f"{k}={params[k]}" for k in params]) return url -def _fetch_url(url, proxies): +def _fetch_url(url: str, proxies: ProxyDict | None = None) -> pd.DataFrame: """Fetch the URL with GET request.""" logging.debug("Fetching %s", url) logging.debug("Proxies: %s", proxies) @@ -215,7 +229,7 @@ def _fetch_url(url, proxies): while not success and try_counter < retries + 1: try_counter += 1 try: - page = requests.get(url, proxies) + page = requests.get(url=url, proxies=proxies, timeout=60*10) # type: ignore page.raise_for_status() if page.status_code == 200 and "Your api key is not valid" in page.text: logging.debug(page.text) @@ -225,14 +239,14 @@ def _fetch_url(url, proxies): raise Exception( "The user_id and api_key does not give access to the data " "you've requested, contact Sheffield Solar " - "." + ".", ) if page.status_code == 200 and "Missing user_id" in page.text: logging.debug(page.text) raise Exception( "The user_id and api_key does not give access to the data " "you've requested, contact Sheffield Solar " - "." 
+ ".", ) success = True except requests.exceptions.HTTPError: @@ -243,5 +257,5 @@ def _fetch_url(url, proxies): raise Exception("Error communicating with the Sheffield Solar API.") try: return pd.read_csv(StringIO(page.text), parse_dates=True) - except: - raise Exception("Error communicating with the Sheffield Solar API.") + except Exception as e: + raise Exception(f"Error communicating with the Sheffield Solar API: {e}") from e diff --git a/src/dagster_dags/assets/sat/__init__.py b/src/dagster_dags/assets/sat/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/dagster_dags/assets/sat/__init__.py @@ -0,0 +1 @@ + diff --git a/src/dagster_dags/assets/sat/eumetsat_iodc_lrv.py b/src/dagster_dags/assets/sat/eumetsat_iodc_lrv.py new file mode 100644 index 0000000..ff6711e --- /dev/null +++ b/src/dagster_dags/assets/sat/eumetsat_iodc_lrv.py @@ -0,0 +1,62 @@ +"""Zarr archive of satellite image data from EUMETSAT's RSS service, low resolution. + +EUMETSAT have a seviri satellite that provides images of the earth's surface. +The Rapid Scan Service (RSS) provides images at 15 minute intervals. +The images are in the MSG format, which is a compressed format that contains +multiple channels of data. The come in high resolution (HRV) and low resolution (LRV). + +Sourced via eumdac from DataStore (https://navigator.eumetsat.int/product/EO:EUM:DAT:MSG:RSS). +This asset is updated monthly, and surfaced as a Zarr Directory Store for each month. +It is downloaded using the sat container. 
+""" + +import os +from typing import TYPE_CHECKING, Any + +import dagster as dg +from dagster_docker import PipesDockerClient + +if TYPE_CHECKING: + import datetime as dt + +ARCHIVE_FOLDER = "/var/dagster-storage/sat/eumetsat-iodc-lrv" +if os.getenv("ENVIRONMENT", "local") == "leo": + ARCHIVE_FOLDER = "/mnt/storage_b/sat/eumetsat-iodc-lrv" + +partitions_def: dg.TimeWindowPartitionsDefinition = dg.MonthlyPartitionsDefinition( + start_date="2019-01-01", + end_offset=-1, +) + +@dg.asset( + name="eumetsat-iodc-lrv", + description=__doc__, + metadata={ + "archive_folder": dg.MetadataValue.text(ARCHIVE_FOLDER), + "area": dg.MetadataValue.text("india"), + "source": dg.MetadataValue.text("eumetsat"), + "expected_runtime": dg.MetadataValue.text("6 hours"), + }, + compute_kind="docker", + tags={ + "dagster/max_runtime": str(60 * 60 * 10), # Should take 6 ish hours + "dagster/priority": "1", + "dagster/concurrency_key": "eumetsat", + }, + partitions_def=partitions_def, +) +def eumetsat_seviri_lrv_asset( + context: dg.AssetExecutionContext, + pipes_docker_client: PipesDockerClient, +) -> Any: # noqa: ANN401 + """Dagster asset for EUMETSAT's RSS service, low resolution.""" + it: dt.datetime = context.partition_time_window.start + return pipes_docker_client.run( + image="ghcr.io/openclimatefix/sat-etl:main", + command=["iodc", "--month", f"{it:%Y-%m}", "--path", "/work", "--rm"], + container_kwargs={ + "volumes": [f"{ARCHIVE_FOLDER}:/work"], + }, + context=context, + ).get_results() + diff --git a/src/dagster_dags/definitions.py b/src/dagster_dags/definitions.py new file mode 100644 index 0000000..ca715b8 --- /dev/null +++ b/src/dagster_dags/definitions.py @@ -0,0 +1,40 @@ +"""All dagster definitions to be surfaced in this code location.""" + +import dagster as dg +from dagster_docker import PipesDockerClient + +from .assets import nwp, pv, sat +from .resources import SheffieldSolarAPIResource + +nwp_assets = dg.load_assets_from_package_module( + package_module=nwp, + 
group_name="nwp", + key_prefix="nwp", +) + +sat_assets = dg.load_assets_from_package_module( + package_module=sat, + group_name="sat", + key_prefix="sat", +) + +pv_assets = dg.load_assets_from_package_module( + package_module=pv, + group_name="pv", + key_prefix="pv", +) + +defs = dg.Definitions( + assets=[*nwp_assets, *sat_assets, *pv_assets], + resources={ + "pipes_subprocess_client": dg.PipesSubprocessClient(), + "pipes_docker_client": PipesDockerClient(), + "ss_api": SheffieldSolarAPIResource( + user_id=dg.EnvVar("SS_USER_ID"), + api_key=dg.EnvVar("SS_API_KEY"), + ), + }, + jobs=[], + schedules=[], +) + diff --git a/src/dagster_dags/resources/__init__.py b/src/dagster_dags/resources/__init__.py new file mode 100644 index 0000000..9df4ed6 --- /dev/null +++ b/src/dagster_dags/resources/__init__.py @@ -0,0 +1,2 @@ +from .sheffield_solar_api import SheffieldSolarAPIResource + diff --git a/src/dagster_dags/resources/sheffield_solar_api.py b/src/dagster_dags/resources/sheffield_solar_api.py new file mode 100644 index 0000000..cab80b2 --- /dev/null +++ b/src/dagster_dags/resources/sheffield_solar_api.py @@ -0,0 +1,142 @@ +"""Dagster resource for accessing the Sheffield Solar API.""" + +import dataclasses +import datetime as dt +import functools +import io +import multiprocessing +import time + +import dagster as dg +import pandas as pd +import requests + + +@dataclasses.dataclass +class SheffieldSolarRawdataRequest: + """Parameters for the Sheffield Solar API.""" + + start: dt.datetime + end: dt.datetime + period_mins: int = 30 + + def __post_init__(self) -> None: + """Validate the initialisation parameters.""" + # Check start and end are in the past, and end > start + now: dt.datetime = dt.datetime.now(dt.UTC) + base_err: str = "Cannot initialize SheffieldSolarParams object" + if self.start > now: + raise ValueError(f"{base_err}: start time must be in the past") + if self.end > now: + raise ValueError("{base_err}: end time must be in the past") + if self.end <= 
self.start: + raise ValueError("{base_err}: end time must be after start time") + if self.period_mins not in [5, 30]: + raise ValueError("{base_err}: period_mins must be 5 or 30") + self.start = self.start.astimezone(dt.UTC).replace(tzinfo=None) + self.end = self.end.astimezone(dt.UTC).replace(tzinfo=None) + + def endpoint(self) -> str: + """Return the API endpoint for the request.""" + if self.period_mins == 5: + return "rawdata/api/v4/reading_integrated_5mins" + else: + return "rawdata/api/v4/reading_integrated" + + + def as_params(self, user_id: str, api_key: str) -> list[dict[str, str]]: + """Return the request as a list of parameter dictionaries. + + Each dictionary represents one period of the request. + """ + ticks: pd.DatetimeIndex = pd.date_range( + start=pd.to_datetime(self.start).ceil(f"{self.period_mins}T"), + end=pd.to_datetime(self.end).ceil(f"{self.period_mins}T"), + freq=f"{self.period_mins}T", + inclusive="left", # Don't include the end time + ) + + params_list: list[dict[str, str]] = [ + { + "start_at": tick.isoformat(), + "end_at": (tick + pd.Timedelta(minutes=self.period_mins)).isoformat(), + "user_id": user_id, + "api_key": api_key, + } + for tick in ticks + ] + + return params_list + +class SheffieldSolarAPIResource(dg.ConfigurableResource): + """Dagster resource for accessing the Sheffield Solar API.""" + + user_id: str + api_key: str + base_url: str = "https://api.pvlive.uk" + delay_multiplier: int = 2 + retries: int = 5 + n_processes: int = 10 + + def setup_for_execution(self, context: dg.InitResourceContext) -> None: + """Set up the Sheffield Solar API resource for execution.""" + self._log = context.log + + def _query( + self, + endpoint: str, + params: dict[str, str], + ) -> pd.DataFrame: + """Query the Sheffield Solar API. + + Args: + endpoint: The API endpoint to query. + params: The query parameters. + """ + url: str = f"{self.base_url}/{endpoint}" + url += "?" 
+ "&".join([f"{k}={v}" for k, v in params.items()]) + num_attempts: int = 1 + + while num_attempts <= self.retries: + try: + response = requests.get(url, timeout=60*10) + except requests.exceptions.HTTPError as e: + time.sleep(0.5 * num_attempts * self.delay_multiplier) + if num_attempts == self.retries: + raise e + continue + + if response.status_code != 200: + raise ValueError(f"HTTP error: {response.status_code}") + else: + if "Your api key is not valid" in response.text: + raise ValueError("Invalid API key/User ID combination") + elif "Your account does not give access" in response.text: + raise ValueError("API key/User ID does not give access to requested data") + elif "Missing user_id" in response.text: + raise ValueError("Missing user_id") + else: + try: + df: pd.DataFrame = pd.read_csv( + io.StringIO(response.text), + parse_dates=True, + ) + return df + except Exception as e: + raise ValueError(f"Error parsing API query result: {e}") from e + + num_attempts += 1 + + + def request( + self, + request: SheffieldSolarRawdataRequest, + ) -> pd.DataFrame: + """Request data from the Sheffield Solar API.""" + pool = multiprocessing.Pool(processes=self.n_processes) + df_chunks: pd.DataFrame = pool.map( + functools.partial(self._query, endpoint=request.endpoint()), + request.as_params(self.user_id, self.api_key), + ) + return pd.concat(df_chunks) + diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..8b13789 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/cloud_archives/ops/__init__.py b/tests/cloud_archives/ops/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/cloud_archives/ops/kbatch_test.py b/tests/cloud_archives/ops/kbatch_test.py deleted file mode 100644 index 68c1424..0000000 --- a/tests/cloud_archives/ops/kbatch_test.py +++ /dev/null @@ -1,80 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -from cloud_archives.ops.kbatch import ( - KbatchJobException, - 
wait_for_status_change, -) - - -def generate_mock_list_pods_response(status: str) -> dict: - return { - "items": [ - { - "status": {"phase": status, "container_statuses": [{"state": status}]}, - "metadata": {"name": "test_pod"}, - }, - ], - } - - -class TestWaitForStatusChange(unittest.TestCase): - - @patch("time.sleep", return_value=None) - @patch( - "cloud_archives.ops.kbatch.kbc.list_pods", - side_effect=[ - generate_mock_list_pods_response("Pending"), - generate_mock_list_pods_response("Pending"), - generate_mock_list_pods_response("Running"), - ], - ) - def test_wait_for_status_change_successful( - self, - mock_list_pods: MagicMock, - mock_sleep: MagicMock, - ) -> None: - new_status = wait_for_status_change(old_status="Pending", job_name="test-job") - self.assertEqual(new_status, "Running") - # Make sure list_pods is called for each status check - self.assertEqual(mock_list_pods.call_count, 3) - - @patch("time.sleep", return_value=None) - @patch( - "cloud_archives.ops.kbatch.kbc.list_pods", - side_effect=[ - generate_mock_list_pods_response("Pending"), - generate_mock_list_pods_response("Failed"), - ], - ) - def test_wait_for_status_change_failed( - self, - mock_list_pods: MagicMock, - mock_sleep: MagicMock, - ) -> None: - new_status = wait_for_status_change(old_status="Pending", job_name="test-job") - self.assertEqual(new_status, "Failed") - # Make sure list_pods is called - self.assertEqual(mock_list_pods.call_count, 2) - - @patch("time.sleep", return_value=None) - @patch( - "cloud_archives.ops.kbatch.kbc.list_pods", - return_value=generate_mock_list_pods_response("Pending"), - ) - def test_wait_for_status_change_timeout( - self, - mock_list_pods: MagicMock, - mock_sleep: MagicMock, - ) -> None: - with self.assertRaises(KbatchJobException): - new_status = wait_for_status_change(old_status="Pending", job_name="test-job") - self.assertEqual(new_status, "Pending") - # Make sure list_pods is called - self.assertGreater(mock_list_pods.call_count, 1) - # Make sure 
time.sleep is called multiple times - self.assertGreater(mock_sleep.call_count, 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/cloud_archives/pv/test_passiv.py b/tests/cloud_archives/pv/test_passiv.py deleted file mode 100644 index 6cdbe6e..0000000 --- a/tests/cloud_archives/pv/test_passiv.py +++ /dev/null @@ -1,8 +0,0 @@ -from cloud_archives.pv.passiv.passiv_monthly import get_monthly_passiv_data - -from datetime import datetime, timezone - - -def test_get_daily_passiv_data(): - start_date = datetime(2024, 12, 5, tzinfo=timezone.utc) - get_monthly_passiv_data(start_date, upload_to_hf=False, overwrite=True) diff --git a/tests/compile_test.py b/tests/compile_test.py index 454d46b..2223517 100644 --- a/tests/compile_test.py +++ b/tests/compile_test.py @@ -1,17 +1,16 @@ -import sys +import unittest -from local_archives.nwp import all_assets +import dagster as dg +from src.dagster_dags import defs -def test_nwp_asset_key_prefixes() -> None: - """Test asset keys for all nwp assets have the correct key structure.""" - for asset in all_assets: - assert len(asset.key.path) == 4 - # Ensure that the prefix is as expected - # The first element should be the flavor: - assert asset.key.path[0] in ["nwp", "sat"] - # The second element should be the provider - assert asset.key.path[1] in ["ecmwf", "metoffice", "eumetsat", "cams", "ceda", "meteomatics", "gfs", "ecmwf-eps"] - # The third element should be the region - assert asset.key.path[2] in ["uk", "eu", "global", "nw_india", "malta", "india", "india-stat"] +class TestAssetKeyPrefixes(unittest.TestCase): + def test_nwp_asset_key_prefixes(self) -> None: + """Test asset keys for all nwp assets have the correct key structure.""" + if defs.assets is not None: + for asset in defs.assets: + if isinstance(asset, dg.AssetsDefinition): + # Ensure that the prefix is one of the expected flavours + self.assertIn( asset.key.path[0], ["nwp", "sat", "air", "pv"]) + diff --git a/tests/test_passiv.py 
b/tests/test_passiv.py new file mode 100644 index 0000000..aa9ec42 --- /dev/null +++ b/tests/test_passiv.py @@ -0,0 +1,14 @@ +import unittest +from datetime import UTC, datetime + +from src.dagster_dags.assets.pv.passiv.passiv_monthly import get_monthly_passiv_data + + +@unittest.skip("rawdata endpoint not on new URL") +class TestPassiv(unittest.TestCase): + def test_get_daily_passiv_data(self) -> None: + """Test the get_daily_passiv_data function.""" + # TODO: Make this an actual test! + start_date = datetime(2024, 12, 5, tzinfo=UTC) + get_monthly_passiv_data(start_date, upload_to_hf=False, overwrite=True) +