diff --git a/docs/source/index.md b/docs/source/index.md index 145086b..8c9a1c0 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -14,10 +14,17 @@ maxdepth: 1 caption: Using ecgtools --- installation.md -notebooks/cmip6-glade.ipynb notebooks/cesm-history-files.ipynb ``` +```{toctree} +--- +maxdepth: 1 +caption: Examples +--- +notebooks/cmip6-glade.ipynb +``` + ```{toctree} --- maxdepth: 2 diff --git a/docs/source/notebooks/cmip6-glade.ipynb b/docs/source/notebooks/cmip6-glade.ipynb index 0dd2ef6..95027c6 100644 --- a/docs/source/notebooks/cmip6-glade.ipynb +++ b/docs/source/notebooks/cmip6-glade.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "olive-feeling", + "id": "experimental-reality", "metadata": {}, "source": [ "# How to build a catalog for CMIP6 CMorized output" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "arctic-ethernet", + "id": "imperial-radio", "metadata": {}, "source": [ "## Import packages" @@ -19,7 +19,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "forty-million", + "id": "solid-connection", "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "markdown", - "id": "turkish-immune", + "id": "resistant-indicator", "metadata": {}, "source": [ "## Instatiate a `Builder` object" @@ -38,16 +38,18 @@ { "cell_type": "code", "execution_count": 2, - "id": "charged-geometry", + "id": "isolated-glossary", "metadata": {}, "outputs": [], "source": [ - "b = Builder(root_path=\"/glade/collections/cmip/CMIP6/CFMIP/\", depth=3, parsing_func=parse_cmip6)" + "b = Builder(\n", + " root_path=\"/glade/collections/cmip/CMIP6/CFMIP/\", depth=3, parsing_func=parse_cmip6, njobs=20\n", + ")" ] }, { "cell_type": "markdown", - "id": "arctic-wheat", + "id": "decreased-object", "metadata": {}, "source": [ "## Build catalog and inspect built catalog" @@ -56,7 +58,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "going-secondary", + "id": "fundamental-supervisor", "metadata": { "tags": [] }, @@ -65,41 +67,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 10 out of 24 | elapsed: 1.7s remaining: 2.3s\n", - "[Parallel(n_jobs=-1)]: Done 15 out of 24 | elapsed: 1.9s remaining: 1.2s\n", - "[Parallel(n_jobs=-1)]: Done 20 out of 24 | elapsed: 2.3s remaining: 0.5s\n", - "[Parallel(n_jobs=-1)]: Done 24 out of 24 | elapsed: 2.4s finished\n", - "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.\n", - "[Parallel(n_jobs=-1)]: Done 52 tasks | elapsed: 2.0s\n", - "[Parallel(n_jobs=-1)]: Done 142 tasks | elapsed: 3.6s\n", - "[Parallel(n_jobs=-1)]: Done 372 tasks | elapsed: 5.7s\n", - "[Parallel(n_jobs=-1)]: Done 840 tasks | elapsed: 17.7s\n", - "[Parallel(n_jobs=-1)]: Done 1038 tasks | elapsed: 26.8s\n", - "[Parallel(n_jobs=-1)]: Done 1404 tasks | elapsed: 34.0s\n", - "[Parallel(n_jobs=-1)]: Done 1932 tasks | elapsed: 46.0s\n", - "[Parallel(n_jobs=-1)]: Done 2238 tasks | elapsed: 59.2s\n", - "[Parallel(n_jobs=-1)]: Done 2580 tasks | elapsed: 1.2min\n", - "[Parallel(n_jobs=-1)]: Done 3308 tasks | elapsed: 1.5min\n", - "[Parallel(n_jobs=-1)]: Done 3764 tasks | elapsed: 1.7min\n", - "[Parallel(n_jobs=-1)]: Done 4664 tasks | elapsed: 2.1min\n", - "[Parallel(n_jobs=-1)]: Done 5636 tasks | elapsed: 2.4min\n", - "[Parallel(n_jobs=-1)]: Done 6680 tasks | elapsed: 2.6min\n", - "[Parallel(n_jobs=-1)]: Done 7796 tasks | elapsed: 2.9min\n", - "[Parallel(n_jobs=-1)]: Done 8984 tasks | elapsed: 3.2min\n", - "[Parallel(n_jobs=-1)]: Done 10244 tasks | elapsed: 3.4min\n", - "[Parallel(n_jobs=-1)]: Done 11576 tasks | elapsed: 3.7min\n", - "[Parallel(n_jobs=-1)]: Done 12980 tasks | elapsed: 4.0min\n", - "[Parallel(n_jobs=-1)]: Done 14456 tasks | elapsed: 4.5min\n", - "[Parallel(n_jobs=-1)]: Done 16004 tasks | elapsed: 5.1min\n", - "[Parallel(n_jobs=-1)]: Done 17624 tasks | elapsed: 5.6min\n", - "[Parallel(n_jobs=-1)]: Done 17912 out of 17912 | elapsed: 5.6min finished\n" + "[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.\n", + "[Parallel(n_jobs=20)]: Done 5 out of 24 | elapsed: 0.8s remaining: 3.2s\n", + "[Parallel(n_jobs=20)]: Done 10 out of 24 | elapsed: 0.9s remaining: 1.3s\n", + "[Parallel(n_jobs=20)]: Done 15 out of 24 | elapsed: 1.0s remaining: 0.6s\n", + "[Parallel(n_jobs=20)]: Done 20 out of 24 | elapsed: 1.0s remaining: 0.2s\n", + "[Parallel(n_jobs=20)]: Done 24 out of 24 | elapsed: 1.1s finished\n", + "[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.\n", + "[Parallel(n_jobs=20)]: Done 32 tasks | elapsed: 2.1s\n", + "[Parallel(n_jobs=20)]: Done 122 tasks | elapsed: 2.8s\n", + "[Parallel(n_jobs=20)]: Done 256 tasks | elapsed: 3.4s\n", + "[Parallel(n_jobs=20)]: Done 840 tasks | elapsed: 8.9s\n", + "[Parallel(n_jobs=20)]: Done 1128 tasks | elapsed: 12.6s\n", + "[Parallel(n_jobs=20)]: Done 1888 tasks | elapsed: 18.0s\n", + "[Parallel(n_jobs=20)]: Done 2272 tasks | elapsed: 24.4s\n", + "[Parallel(n_jobs=20)]: Done 2578 tasks | elapsed: 28.5s\n", + "[Parallel(n_jobs=20)]: Done 3560 tasks | elapsed: 38.8s\n", + "[Parallel(n_jobs=20)]: Done 4176 tasks | elapsed: 44.7s\n", + "[Parallel(n_jobs=20)]: Done 5004 tasks | elapsed: 49.6s\n", + "[Parallel(n_jobs=20)]: Done 6148 tasks | elapsed: 54.9s\n", + "[Parallel(n_jobs=20)]: Done 8092 tasks | elapsed: 1.1min\n", + "[Parallel(n_jobs=20)]: Done 10180 tasks | elapsed: 1.3min\n", + "[Parallel(n_jobs=20)]: Done 12412 tasks | elapsed: 1.5min\n", + "[Parallel(n_jobs=20)]: Done 14062 tasks | elapsed: 1.7min\n", + "[Parallel(n_jobs=20)]: Done 15284 tasks | elapsed: 1.8min\n", + "[Parallel(n_jobs=20)]: Done 16616 tasks | elapsed: 2.1min\n", + "[Parallel(n_jobs=20)]: Done 17912 out of 17912 | elapsed: 2.2min finished\n" ] }, { "data": { "text/plain": [ - "Builder(root_path=PosixPath('/glade/collections/cmip/CMIP6/CFMIP'), extension='.nc', depth=3, exclude_patterns=None, parsing_func=, njobs=-1)" + "Builder(root_path=PosixPath('/glade/collections/cmip/CMIP6/CFMIP'), extension='.nc', depth=3, exclude_patterns=None, parsing_func=, njobs=20)" ] }, "execution_count": 3, @@ -114,7 +113,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "descending-basis", + "id": "eleven-touch", "metadata": {}, "outputs": [ { @@ -149,8 +148,6 @@ " grid_label\n", " institution_id\n", " ...\n", - " variant_label\n", - " member_id\n", " standard_name\n", " long_name\n", " units\n", @@ -158,7 +155,9 @@ " init_year\n", " start_time\n", " end_time\n", + " time_range\n", " path\n", + " version\n", " \n", " \n", " \n", @@ -175,8 +174,6 @@ " gn\n", " NCAR\n", " ...\n", - " r1i1p1f1\n", - " r1i1p1f1\n", " relative_humidity\n", " Relative Humidity\n", " %\n", @@ -184,7 +181,9 @@ " NaN\n", " 0001-01-15 12:00:00\n", " 0030-12-15 12:00:00\n", + " 0001-01-15 12:00:00-0030-12-15 12:00:00\n", " /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2...\n", + " v0\n", " \n", " \n", " 1\n", @@ -199,8 +198,6 @@ " gn\n", " NCAR\n", " ...\n", - " r1i1p1f1\n", - " r1i1p1f1\n", " relative_humidity\n", " Relative Humidity\n", " %\n", @@ -208,7 +205,9 @@ " NaN\n", " 0001-01-15 12:00:00\n", " 0030-12-15 12:00:00\n", + " 0001-01-15 12:00:00-0030-12-15 12:00:00\n", " /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2...\n", + " v20200209\n", " \n", " \n", " 2\n", @@ -223,8 +222,6 @@ " gn\n", " NCAR\n", " ...\n", - " r1i1p1f1\n", - " r1i1p1f1\n", " tendency_of_air_temperature_due_to_advection\n", " Tendency of Air Temperature Due to Advection\n", " K s-1\n", @@ -232,7 +229,9 @@ " NaN\n", " 0001-01-15 12:00:00\n", " 0030-12-15 12:00:00\n", + " 0001-01-15 12:00:00-0030-12-15 12:00:00\n", " /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2...\n", + " v0\n", " \n", " \n", " 3\n", @@ -247,8 +246,6 @@ " gn\n", " NCAR\n", " ...\n", - " r1i1p1f1\n", - " r1i1p1f1\n", " tendency_of_air_temperature_due_to_advection\n", " Tendency of Air Temperature Due to Advection\n", " K s-1\n", @@ -256,7 +253,9 @@ " NaN\n", " 0001-01-15 12:00:00\n", " 0030-12-15 12:00:00\n", + " 0001-01-15 12:00:00-0030-12-15 12:00:00\n", " /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2...\n", + " v20200209\n", " \n", " \n", " 4\n", @@ -271,8 +270,6 @@ " gn\n", " NCAR\n", " ...\n", - " r1i1p1f1\n", - " r1i1p1f1\n", " cloud_area_fraction\n", " CALIPSO Total Cloud Cover Percentage\n", " %\n", @@ -280,11 +277,13 @@ " NaN\n", " 0001-01-15 12:00:00\n", " 0030-12-15 12:00:00\n", + " 0001-01-15 12:00:00-0030-12-15 12:00:00\n", " /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2...\n", + " v0\n", " \n", " \n", "\n", - "

5 rows × 34 columns

\n", + "

5 rows × 36 columns

\n", "" ], "text/plain": [ @@ -309,19 +308,12 @@ "3 native 0.9x1.25 finite volume grid (192x288 la... gn \n", "4 native 0.9x1.25 finite volume grid (192x288 la... gn \n", "\n", - " institution_id ... variant_label member_id \\\n", - "0 NCAR ... r1i1p1f1 r1i1p1f1 \n", - "1 NCAR ... r1i1p1f1 r1i1p1f1 \n", - "2 NCAR ... r1i1p1f1 r1i1p1f1 \n", - "3 NCAR ... r1i1p1f1 r1i1p1f1 \n", - "4 NCAR ... r1i1p1f1 r1i1p1f1 \n", - "\n", - " standard_name \\\n", - "0 relative_humidity \n", - "1 relative_humidity \n", - "2 tendency_of_air_temperature_due_to_advection \n", - "3 tendency_of_air_temperature_due_to_advection \n", - "4 cloud_area_fraction \n", + " institution_id ... standard_name \\\n", + "0 NCAR ... relative_humidity \n", + "1 NCAR ... relative_humidity \n", + "2 NCAR ... tendency_of_air_temperature_due_to_advection \n", + "3 NCAR ... tendency_of_air_temperature_due_to_advection \n", + "4 NCAR ... cloud_area_fraction \n", "\n", " long_name units vertical_levels \\\n", "0 Relative Humidity % 32.0 \n", @@ -337,14 +329,21 @@ "3 NaN 0001-01-15 12:00:00 0030-12-15 12:00:00 \n", "4 NaN 0001-01-15 12:00:00 0030-12-15 12:00:00 \n", "\n", - " path \n", - "0 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... \n", - "1 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... \n", - "2 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... \n", - "3 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... \n", - "4 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... \n", + " time_range \\\n", + "0 0001-01-15 12:00:00-0030-12-15 12:00:00 \n", + "1 0001-01-15 12:00:00-0030-12-15 12:00:00 \n", + "2 0001-01-15 12:00:00-0030-12-15 12:00:00 \n", + "3 0001-01-15 12:00:00-0030-12-15 12:00:00 \n", + "4 0001-01-15 12:00:00-0030-12-15 12:00:00 \n", + "\n", + " path version \n", + "0 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... v0 \n", + "1 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... v20200209 \n", + "2 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... v0 \n", + "3 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... v20200209 \n", + "4 /glade/collections/cmip/CMIP6/CFMIP/NCAR/CESM2... v0 \n", "\n", - "[5 rows x 34 columns]" + "[5 rows x 36 columns]" ] }, "execution_count": 4, @@ -359,7 +358,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "ranking-reynolds", + "id": "mighty-recommendation", "metadata": {}, "outputs": [ { @@ -465,7 +464,7 @@ }, { "cell_type": "markdown", - "id": "automated-heritage", + "id": "average-conditions", "metadata": {}, "source": [ "## Save built catalog to disk" @@ -473,8 +472,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "alternative-seven", + "execution_count": 7, + "id": "optical-burton", "metadata": {}, "outputs": [ { @@ -488,8 +487,8 @@ "source": [ "b.save(\n", " '/glade/scratch/abanihi/test-cmip6-catalog.csv',\n", - " path_column='path',\n", - " variable_column='variable_id',\n", + " path_column_name='path',\n", + " variable_column_name='variable_id',\n", " data_format='netcdf',\n", " groupby_attrs=[\n", " 'activity_id',\n", @@ -517,8 +516,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "monthly-harris", + "execution_count": 8, + "id": "lesbian-quick", "metadata": {}, "outputs": [ { @@ -526,7 +525,7 @@ "output_type": "stream", "text": [ "{\n", - " \"catalog_file\": \"/glade/scratch/abanihi/test-cmip6-catalog.csv\",\n", + " \"catalog_file\": \"test-cmip6-catalog.csv\",\n", " \"attributes\": [\n", " {\n", " \"column_name\": \"activity_id\",\n", @@ -661,8 +660,16 @@ " \"vocabulary\": \"\"\n", " },\n", " {\n", + " \"column_name\": \"time_range\",\n", + " \"vocabulary\": \"\"\n", + " },\n", + " {\n", " \"column_name\": \"path\",\n", " \"vocabulary\": \"\"\n", + " },\n", + " {\n", + " \"column_name\": \"version\",\n", + " \"vocabulary\": \"\"\n", " }\n", " ],\n", " \"assets\": {\n", @@ -670,7 +677,7 @@ " \"format\": \"netcdf\"\n", " },\n", " \"aggregation_control\": {\n", - " \"variable_column\": \"variable_id\",\n", + " \"variable_column_name\": \"variable_id\",\n", " \"groupby_attrs\": [\n", " \"activity_id\",\n", " \"institution_id\",\n", @@ -707,7 +714,7 @@ " \"esmcat_version\": \"0.0.1\",\n", " \"id\": null,\n", " \"description\": null,\n", - " \"last_updated\": \"2021-06-02T15:36:21+00:00\"\n", + " \"last_updated\": \"2021-06-07T15:03:05+00:00\"\n", "}" ] } diff --git a/ecgtools/parsers/cmip.py b/ecgtools/parsers/cmip.py index 18d432a..d39e611 100644 --- a/ecgtools/parsers/cmip.py +++ b/ecgtools/parsers/cmip.py @@ -42,16 +42,18 @@ def parse_cmip6(file): ) ) ) + try: + with xr.open_dataset(file, chunks={}, use_cftime=True) as ds: - attributes = {key: ds.attrs.get(key) for key in keys} - attributes['member_id'] = attributes['variant_label'] + info = {key: ds.attrs.get(key) for key in keys} + info['member_id'] = info['variant_label'] - variable_id = attributes['variable_id'] + variable_id = info['variable_id'] if variable_id: attrs = ds[variable_id].attrs for attr in ['standard_name', 'long_name', 'units']: - attributes[attr] = attrs.get(attr) + info[attr] = attrs.get(attr) # Set the default of # of vertical levels to 1 vertical_levels = 1 @@ -66,20 +68,23 @@ def parse_cmip6(file): start_time, end_time = str(ds.cf['T'][0].data), str(ds.cf['T'][-1].data) except (KeyError, AttributeError, ValueError): ... - if attributes.get('sub_experiment_id'): - init_year = extract_attr_with_regex(attributes['sub_experiment_id'], r'\d{4}') + if info.get('sub_experiment_id'): + init_year = extract_attr_with_regex(info['sub_experiment_id'], r'\d{4}') if init_year: init_year = int(init_year) - attributes['vertical_levels'] = vertical_levels - attributes['init_year'] = init_year - attributes['start_time'] = start_time - attributes['end_time'] = end_time + info['vertical_levels'] = vertical_levels + info['init_year'] = init_year + info['start_time'] = start_time + info['end_time'] = end_time if not (start_time and end_time): - attributes['time_range'] = None + info['time_range'] = None else: - attributes['time_range'] = f'{start_time}-{end_time}' - attributes['path'] = file - return attributes + info['time_range'] = f'{start_time}-{end_time}' + info['path'] = str(file) + info['version'] = ( + extract_attr_with_regex(str(file), regex=r'v\d{4}\d{2}\d{2}|v\d{1}') or 'v0' + ) + return info except Exception: return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}