Add CESM POP 1-degree #56

Open

wants to merge 27 commits into base: master
Commits (27)
c167a09
add intial files
mgrover1 Jun 24, 2021
e377b76
input target: CacheFSSpecTarget -> FSSpecTarget
cisaacstern Jun 24, 2021
853fde2
first commit
paigem Jun 27, 2021
42be391
Merge remote-tracking branch 'paige/cesm-pop-lowres-1deg' into cesm-p…
cisaacstern Jun 28, 2021
8d028f7
recipe outline
cisaacstern Jun 28, 2021
e25b91f
remove cesm2_le directory
cisaacstern Jun 28, 2021
b59e3ef
add download link, variables, possible chunk size
paigem Jun 30, 2021
c029fe6
updated chunk size
paigem Jun 30, 2021
ddc88da
formatting
cisaacstern Jul 9, 2021
415e110
formatting 2
cisaacstern Jul 9, 2021
0ad906e
formatting 3
cisaacstern Jul 9, 2021
edb6452
add ConcatDim
cisaacstern Jul 9, 2021
86982d6
formatting 4
cisaacstern Jul 9, 2021
1ebaa69
formatting 5
cisaacstern Jul 9, 2021
0548f8b
add instantiate_recipe func
cisaacstern Jul 16, 2021
b3592f0
lint
cisaacstern Jul 16, 2021
118372b
add nitems_per_file based on NCAR netCDF headers
cisaacstern Aug 6, 2021
8c58c85
correct for latest query string implementation
cisaacstern Aug 6, 2021
35f35b6
increase target_chunks to 300 for ~100 MB/chunk
cisaacstern Aug 7, 2021
82ca358
reorder assignments for clarity
cisaacstern Aug 7, 2021
942a8a4
add subset_inputs, our new logger.warning did a great job
cisaacstern Aug 7, 2021
9f5b099
subset module and recipe
cisaacstern Aug 15, 2021
0f3db64
lint
cisaacstern Aug 15, 2021
cb54d9e
lint 2
cisaacstern Aug 15, 2021
f6062f3
fix SST edge case in _fn_from_var method
cisaacstern Aug 17, 2021
389adce
Merge remote-tracking branch 'origin' into cesm-pop-lowres-1deg
andersy005 Sep 29, 2022
e3c52bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 29, 2022
88 changes: 88 additions & 0 deletions recipes/cesm-pop-lowres-1deg/netcdf_subsets.py
@@ -0,0 +1,88 @@
import os
import time

import numpy as np
import xarray as xr


class NetCDFSubsets:
    def __init__(
        self,
        cache_fs,
        cache_dir,
        var_name,
        target_bins,
        concat_dim_name,
        concat_dim_length,
    ):
        self.cache_fs = cache_fs
        self.cache_dir = cache_dir
        self.var_name = var_name
        self.target_bins = target_bins
        self.concat_dim_name = concat_dim_name
        self.concat_dim_length = concat_dim_length

    def _fn_from_var(self):
        """Assumes one netCDF per variable in cache"""
        for filename in self.cache_fs.ls(self.cache_dir):
            if f'{self.var_name.lower()}.' in filename:
                print(f'Filename for {self.var_name} is {filename}')
                return filename

    def _open_dataset(self):
        fn = self._fn_from_var()
        open_file = self.cache_fs.open(fn)
        print(f'Calling `xr.open_dataset` on {open_file}')
        start = time.time()
        ds = xr.open_dataset(open_file)
        print(f'Opened dataset in {time.time()-start:.02f}s')
        assert len(ds[self.concat_dim_name]) == self.concat_dim_length
        print(f"`len(ds['{self.concat_dim_name}'])` matches expected length")
        return ds

    def _assign_time_counter(self):
        ds = self._open_dataset()
        array = np.arange(1, self.concat_dim_length + 1, 1)
        return ds.assign_coords(time_counter=(self.concat_dim_name, array))

    def _groupby_bins(self):
        ds = self._assign_time_counter()
        groupby = ds.groupby_bins('time_counter', self.target_bins)
        bins, datasets = zip(*groupby)
        return bins, datasets

    def _make_target_paths(self, bins):
        def format_filename(interval_object, counter, variable):
            out = str(interval_object).replace('(', '')
            if '-' in out:  # only relevant for the first bin
                out = out.replace('-', '')
            out = out.replace(']', '')
            out = out.replace(', ', '-')
            return f'{variable}-{counter}-{out}.nc'

        return [format_filename(b, i, self.var_name) for i, b in enumerate(bins)]

    def subset_netcdf(self):

        bins, datasets = self._groupby_bins()
        paths = self._make_target_paths(bins=bins)

        start = time.time()
        for i, p in enumerate(paths):

            loop_start = time.time()

            print(f'Writing {p} to local')
            datasets[i].to_netcdf(p)

            print(f'Uploading {p} to {self.cache_dir}/subsets/{p}')
            self.cache_fs.put(p, f'{self.cache_dir}/subsets/{p}')

            print(f'Removing {p} from local')
            os.remove(p)

            print(
                f'Total elapsed: {(time.time()-start):.2f}s \n'
                f'This iteration: {(time.time()-loop_start):.2f}s'
            )
        print('`subset_netcdf` complete')
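
A minimal usage sketch of the class above, assuming a GCS-backed recipe cache; the filesystem, bucket path, and chosen variable are hypothetical, while the 60 bins and the 60590-step 'time' dimension mirror the values used in the two recipe modules below.

import fsspec

from netcdf_subsets import NetCDFSubsets

cache_fs = fsspec.filesystem('gs')                # assumed GCS-backed cache filesystem
cache_dir = 'gs://example-bucket/cesm-pop-cache'  # hypothetical cache location

subsetter = NetCDFSubsets(
    cache_fs=cache_fs,
    cache_dir=cache_dir,
    var_name='SST',          # one cached netCDF per variable is assumed
    target_bins=60,          # forwarded to ds.groupby_bins; 60 bins, as in subset_recipe.py
    concat_dim_name='time',
    concat_dim_length=60590,
)
subsetter.subset_netcdf()    # writes each subset file to {cache_dir}/subsets/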
37 changes: 37 additions & 0 deletions recipes/cesm-pop-lowres-1deg/recipe.py
@@ -0,0 +1,37 @@
from pangeo_forge_recipes.patterns import ConcatDim, FilePattern, MergeDim
from pangeo_forge_recipes.recipes import XarrayZarrRecipe


def make_full_path(variable, time):
    """Returns a valid path to the source files"""
    return (
        f'https://tds.ucar.edu/thredds/fileServer/datazone/campaign/cesm/collections/ASD/'
        f'v5_rel04_BC5_ne30_g16/ocn/proc/tseries/daily/v5_rel04_BC5_ne30_g16.pop.h.nday1.'
        f'{variable}.{time}.nc'
    )


vars = [
    'HMXL_2',
    'SFWF_2',
    'SHF_2',
    'SSH_2',
    'SSS',
    'SST',
    'SST2',
    'TAUX_2',
    'TAUY_2',
    'U1_1',
    'U2_2',
    'V1_1',
    'V2_2',
    'XMXL_2',
]

concat_dim = ConcatDim('time', keys=['00010101-01661231'], nitems_per_file=60590)
merge_dim = MergeDim('variable', keys=vars)
pattern = FilePattern(make_full_path, concat_dim, merge_dim)

chunks = {'time': 300}
subset_inputs = {'time': 60}
recipe = XarrayZarrRecipe(pattern, target_chunks=chunks, subset_inputs=subset_inputs)
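
For orientation, the pattern combines the single time key with fourteen variable keys, so each variable resolves to one source file; for example:

print(make_full_path('SST', '00010101-01661231'))
# https://tds.ucar.edu/thredds/fileServer/datazone/campaign/cesm/collections/ASD/
#   v5_rel04_BC5_ne30_g16/ocn/proc/tseries/daily/v5_rel04_BC5_ne30_g16.pop.h.nday1.SST.00010101-01661231.nc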
50 changes: 50 additions & 0 deletions recipes/cesm-pop-lowres-1deg/subset_recipe.py
@@ -0,0 +1,50 @@
import numpy as np
import pandas as pd

from pangeo_forge_recipes.patterns import ConcatDim, FilePattern, MergeDim
from pangeo_forge_recipes.recipes import XarrayZarrRecipe


def format_bins(interval_object):
    out = str(interval_object).replace('(', '')
    if '-' in out:  # only relevant for the first bin
        out = out.replace('-', '')
    out = out.replace(']', '')
    out = out.replace(', ', '-')
    return out


days = np.arange(1, 60590 + 1, 1)
bins = pd.cut(days, 60)
bins_dict = {i: format_bins(bins.categories[i]) for i in range(len(bins.categories))}


def make_full_path(variable, time):
"""Returns a valid path to the source files"""
return f'{variable}-{time}-{bins_dict[time]}.nc'


variables = [
    'HMXL_2',
    'SFWF_2',
    'SHF_2',
    'SSH_2',
    'SSS',
    'SST',
    'SST2',
    'TAUX_2',
    'TAUY_2',
    'U1_1',
    'U2_2',
    'V1_1',
    'V2_2',
    'XMXL_2',
]

concat_dim = ConcatDim('time', keys=list(range(60)))
merge_dim = MergeDim('variable', keys=variables)
pattern = FilePattern(make_full_path, concat_dim, merge_dim)

chunks = {'time': 200} # ~98 MB per chunk, per variable
subset_inputs = {}
recipe = XarrayZarrRecipe(pattern, target_chunks=chunks, subset_inputs=subset_inputs)
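
A rough check of the "~98 MB per chunk" figure, assuming the nominal 1-degree POP horizontal grid of 384 x 320 points and 4-byte floats (the grid size is an assumption, not stated in this recipe):

nlat, nlon = 384, 320        # assumed POP 1-degree (gx1v6) horizontal grid
bytes_per_value = 4          # float32
chunk_bytes = 200 * nlat * nlon * bytes_per_value  # time chunk of 200, as above
print(f'{chunk_bytes / 1e6:.1f} MB per chunk, per variable')  # 98.3 MB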