From 42f5d652f1f7e443e1cfd7ccfcafc3e64f6e465f Mon Sep 17 00:00:00 2001 From: James Mineau Date: Tue, 17 Sep 2024 14:01:17 -0600 Subject: [PATCH] trajectory output - hdf5 --- docs/configuration.md | 1 + docs/output-files.md | 56 +++++++++++++++++++++--- docs/project-structure.md | 6 +-- r/run_stilt.r | 2 + r/src/read_traj.r | 50 ++++++++++++++++++++++ r/src/simulation_step.r | 13 +++--- r/src/write_traj.r | 90 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 204 insertions(+), 14 deletions(-) create mode 100644 r/src/read_traj.r create mode 100644 r/src/write_traj.r diff --git a/docs/configuration.md b/docs/configuration.md index 3675fbc..14ca5ce 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -79,6 +79,7 @@ str(receptors) | `rm_dat` | Logical indicating whether to delete `PARTICLE.DAT` after each simulation. Default to TRUE to reduce disk space since all of the trajectory information is also stored in `STILT_OUTPUT.rds` alongside the calculated upstream influence footprint | | `run_foot` | Logical indicating whether to produce footprints. If FALSE, `run_trajec` must be TRUE. This can be useful when calculating trajectories separate from footprints | | `run_trajec` | Logical indicating whether to produce new trajectories with `hycs_std`. If FALSE, will try to load the previous trajectory outputs. This is often useful for regridding purposes | +| `trajec_fmt` | Trajectory output format - file extension for trajectory output files. Defaults to `rds` (serialized R data). Other options include `h5` (HDF5). Other formats are less efficient than `rds` and may require additional dependencies to be installed, but can be useful for interoperability with other software packages | | `simulation_id` | Unique identifier for each simulation; defaults to NA which determines a unique identifier for each simulation by hashing the time and receptor location | | `timeout` | number of seconds to allow `hycs_std` to complete before sending SIGTERM and moving to the next simulation; defaults to 3600 (1 hour) | | `varsiwant` | character vector of 4-letter `hycs_std` variables. Defaults to the minimum required variables including `'time', 'indx', 'long', 'lati', 'zagl', 'foot', 'mlht', 'dens', 'samt', 'sigw', 'tlgr'`. Can optionally include options listed below. | diff --git a/docs/output-files.md b/docs/output-files.md index 650e561..9302235 100644 --- a/docs/output-files.md +++ b/docs/output-files.md @@ -2,22 +2,30 @@ The model outputs can be found in the directory configured with `output_wd` (defaults to `/out/`, see [project structure](http://localhost:3000/#/project-structure)). STILT outputs two files for analysis - -- a `_traj.rds` file containing the trajectories of the particle ensemble +- a `_traj.` file containing the trajectories of the particle ensemble - a `_foot.nc` file containing gridded footprint values and metadata Simulation identifiers follow a `yyyymmddHHMM_lati_long_zagl` convention, see [project structure](project-structure.md?id=outby-id). ## Particle trajectories -Particle trajectories and simulation configuration information are packaged and saved in a compressed `.rds` file ([serialized single R object](https://stat.ethz.ch/R-manual/R-devel/library/base/html/readRDS.html)) with the naming convention with the naming convention `_traj.rds`. Preserving the particle trajectories enables regridding the footprints at a later time without the computational cost of recalculating particle trajectories. +Particle trajectories and simulation configuration information are packaged and saved in the format specified by `trajec_fmt` (defaults to `rds`). Options include -This object can be loaded with `readRDS()` and is structured as + - `rds` - [serialized R data](https://stat.ethz.ch/R-manual/R-devel/library/base/html/readRDS.html) + - `h5` - [HDF5](https://www.hdfgroup.org/solutions/hdf5/) + +> Formats other than `rds` will be less efficient, but `h5` offers the advantage of being a widely supported format by other software packages. + +Trajectories are saved with the naming convention `_traj.`. Preserving the particle trajectories enables regridding the footprints at a later time without the computational cost of recalculating particle trajectories. + +This object can be loaded with `read_traj()` and is structured as ```r -traj <- readRDS('_traj.rds') +source('r/dependencies.r') +traj <- read_traj('_traj.') str(traj) # List of 4 -# $ file : chr "/out/by-id//_traj.rds +# $ file : chr "/out/by-id//_traj." # $ receptor:List of 4 # ..$ run_time: POSIXct[1:1], format: "1983-09-18 21:00:00" # ..$ lati : num 39.6 @@ -40,6 +48,44 @@ str(traj) The `traj$receptor` object is a named list with the time and location of the release point for the simulation. The `traj$particle` object is a data frame containing each particle's position and characteristics over time. +The `h5` output can be read using python using a combination of pandas and pytables: + +```python +import numpy as np +import pandas as pd +import tables as tb + +# Read the particle data as a pandas dataframe +particle = pd.read_hdf('_traj.h5', 'particle') + +# Function to get group attributes +def get_group_attrs(h5file: tb.File, group: str) -> dict: + 'Get the attributes of a group in an h5 file' + root = h5file.root + roots = ['', '/', 'root'] + group = root if group in roots else getattr(root, group) + + attrs = {} + keys = group._v_attrs._v_attrnamesuser + for k in keys: + val = group._v_attrs[k] + if val is not None: + val = val[0] # values are stored as arrays + if np.issubdtype(val.dtype, np.bytes_): + val = val.decode() # convert bytes to string + val = None if val == 'NA' else val + attrs[k] = val + return attrs # dictionary output + +# Open the h5 file with pytables +h5file = tb.open_file('_traj.h5') + +# Read group attributes +file = get_group_attrs(h5file, '/')['file'] +receptor = get_group_attrs(h5file, 'receptor') +params = get_group_attrs(h5file, 'params') +``` + ## Gridded footprints Footprints are packaged and saved in a compressed NetCDF file using [Climate and Forecast (CF)](http://cfconventions.org) compliant metadata with the naming convention `_foot.nc`. This object contains information about the model domain, the grid resolution, and footprint values. This object is typically a three dimensional array with dimensions ordered (_x_, _y_, _t_). However, the object will only have dimensions (_x_, _y_) for time integrated footprints. diff --git a/docs/project-structure.md b/docs/project-structure.md index 7e08be2..216abf2 100644 --- a/docs/project-structure.md +++ b/docs/project-structure.md @@ -12,7 +12,7 @@ exe/ out/ by-id/ yyyymmddHH_lati_long_zagl/ - yyyymmddHH_lati_long_zagl_traj.rds + yyyymmddHH_lati_long_zagl_traj.traject_fmt yyyymmddHH_lati_long_zagl_foot.nc hycs_std SETUP.CFG @@ -23,7 +23,7 @@ out/ yyyymmddHH_lati_long_zagl_foot.nc ... particles/ - yyyymmddHH_lati_long_zagl_traj.rds + yyyymmddHH_lati_long_zagl_traj.traject_fmt ... r/ src/ @@ -59,7 +59,7 @@ Contains simulation files by simulation id, with the naming convention `yyyymmdd This becomes the working directory for each unique simulation, containing symbolic links to all of the shared files in `exe/` as well as simulation specific `CONTROL`, `SETUP.CFG`, and output files. -STILT outputs two files for analysis. The trajectories of the particle ensemble are saved to a `_traj.rds` file. Gridded footprints are saved to a `_foot.nc` file. For guidance on working with these output files, see [output files](output-files.md). +STILT outputs two files for analysis. The trajectories of the particle ensemble are saved to a `_traj.` file. Gridded footprints are saved to a `_foot.nc` file. For guidance on working with these output files, see [output files](output-files.md). #### out/footprints/ diff --git a/r/run_stilt.r b/r/run_stilt.r index 5b77282..f29d86e 100755 --- a/r/run_stilt.r +++ b/r/run_stilt.r @@ -62,6 +62,7 @@ numpar <- 1000 rm_dat <- T run_foot <- T run_trajec <- T +trajec_fmt <- 'rds' simulation_id <- NA timeout <- 3600 varsiwant <- c('time', 'indx', 'long', 'lati', 'zagl', 'foot', 'mlht', 'dens', @@ -277,6 +278,7 @@ stilt_apply(FUN = simulation_step, tluverr = tluverr, tlzierr = tlzierr, tout = tout, + trajec_fmt = trajec_fmt, tratio = tratio, tvmix = tvmix, varsiwant = list(varsiwant), diff --git a/r/src/read_traj.r b/r/src/read_traj.r new file mode 100644 index 0000000..e4c5324 --- /dev/null +++ b/r/src/read_traj.r @@ -0,0 +1,50 @@ +#' read_traj reads in the output assosciated with each simulation step +#' @author James Mineau +#' +#' @param file filename argument. Must end with .rds (for serialized R data +#' output, preferred), .h5 (for Hierarchical Data Format output), +#' or NULL to return the footprint object for programatic use. +#' rds files do not require any additional libraries and have +#' better compression. The \code{rhdf5} package is required for .h5 output. +#' HDF5 output is slightly less efficient than .rds output, but is more +#' portable and can be read by other languages. +#' +#' @import rhdf5 +#' @export + + +read_traj <- function(file) { + + # .rds output (preferred) + if (!is.null(file) && grepl('\\.rds$', file, ignore.case = T)) { + return(readRDS(file)) + } + + output <- list() # Define output list to rebuild + + # .h5 output + if (!is.null(file) && grepl('\\.h5$', file, ignore.case = T)) { + + # Assign current file path + output$file <- file + + # Get receptor attributes + output$receptor <- rhdf5::h5readAttributes(file, 'receptor') + + # Get particle data + output$particle <- rhdf5::h5read(file, 'particle') + + # Get params attributes + output$params <- rhdf5::h5readAttributes(file, 'params') + output$params[output$params == "NA"] <- NA + + # Get particle error info if it exists + if ('particle_error' %in% rhdf5::h5ls(file)$name) { + output$particle_error <- rhdf5::h5read(file, 'particle_error') + output$particle_error_params <- rhdf5::h5readAttributes(file, 'particle_error_params') + output$particle_error_params[output$particle_error_params == "NA"] <- NA + } + + return(output) + } +} diff --git a/r/src/simulation_step.r b/r/src/simulation_step.r index ab980a1..2eb6282 100644 --- a/r/src/simulation_step.r +++ b/r/src/simulation_step.r @@ -98,7 +98,8 @@ simulation_step <- function(before_footprint = list(function() {output}), tlfrac = 0.1, tluverr = NA, tlzierr = NA, - tout = 0, + tout = 0, + trajec_fmt = 'rds', tratio = 0.75, tvmix = 1, varsiwant = c('time', 'indx', 'long', 'lati', @@ -246,7 +247,7 @@ simulation_step <- function(before_footprint = list(function() {output}), # run_trajec determines whether to try using existing trajectory files or to # recycle existing files output <- list() - output$file <- file.path(rundir, paste0(simulation_id, '_traj.rds')) + output$file <- file.path(rundir, paste0(simulation_id, '_traj.', trajec_fmt)) if (run_trajec) { # Ensure necessary files and directory structure are established in the @@ -325,8 +326,8 @@ simulation_step <- function(before_footprint = list(function() {output}), winderrtf = winderrtf) } - # Save output object to compressed rds file and symlink to out/particles - saveRDS(output, output$file) + # Save output object and symlink to out/particles + write_traj(output, output$file) link <- file.path(output_wd, 'particles', basename(output$file)) suppressWarnings(file.symlink(output$file, link)) @@ -336,11 +337,11 @@ simulation_step <- function(before_footprint = list(function() {output}), # file to a data frame with an adjusted timestamp and index for the # simulation step. If none exists, report an error and proceed if (!file.exists(output$file)) { - warning('simulation_step(): No _traj.rds file found in ', rundir, + warning('simulation_step(): No trajectory file found in ', rundir, '\n skipping this receptor and trying the next...') return() } - output <- readRDS(output$file) + output <- read_traj(output$file) } # Exit if not performing footprint calculations diff --git a/r/src/write_traj.r b/r/src/write_traj.r new file mode 100644 index 0000000..5803310 --- /dev/null +++ b/r/src/write_traj.r @@ -0,0 +1,90 @@ +#' write_traj writes out the output assosciated with each simulation step +#' @author James Mineau +#' +#' @param traj output created during simulation_step +#' @param file filename argument. Must end with .rds (for serialized R data +#' output, preferred), .h5 (for Hierarchical Data Format output), +#' or NULL to return the footprint object for programatic use. +#' rds files do not require any additional libraries and have +#' better compression. The \code{rhdf5} package is required for .h5 output. +#' HDF5 output is slightly less efficient than .rds output, but is more +#' portable and can be read by other languages. +#' +#' @import rhdf5 +#' @export + +write_traj <- function(traj, file) { + + # Remove existing trajectory file + if (!is.null(file) && file.exists(file)) + system(paste('rm', file)) + + # .rds output (preferred) + if (!is.null(file) && grepl('\\.rds$', file, ignore.case = T)) { + saveRDS(traj, file) + return(file) + } + + # .h5 output + if (!is.null(file) && grepl('\\.h5$', file, ignore.case = T)) { + # Check if rhdf5 package is installed + if (!requireNamespace("rhdf5", quietly = TRUE)) { + stop("The 'rhdf5' package is required for hdf5 output. Please install it.") + } + + # Create HDF5 file + fid <- rhdf5::H5Fcreate(file) + + # Write file info + rhdf5::h5writeAttribute(file, fid, 'file') + + # Create groups + rhdf5::h5createGroup(fid, 'receptor') + rhdf5::h5createGroup(fid, 'params') + + # Write receptor data + receptor <- rhdf5::H5Gopen(fid, 'receptor') + rhdf5::h5writeAttribute(as.numeric(traj$receptor$run_time), + receptor, 'run_time') + rhdf5::h5writeAttribute(traj$receptor$lati, receptor, 'lati') + rhdf5::h5writeAttribute(traj$receptor$long, receptor, 'long') + rhdf5::h5writeAttribute(traj$receptor$zagl, receptor, 'zagl') + rhdf5::H5Gclose(receptor) + + # Write particle dataset + rhdf5::h5write(traj$particle, fid, 'particle') + + # Create params group + params <- rhdf5::H5Gopen(fid, 'params') + for (p in names(traj$params)) { + val <- traj$params[[p]] + if (is.na(val)) val <- 'NA' + rhdf5::h5writeAttribute(val, params, p) + } + rhdf5::H5Gclose(params) + + # Create particle error group if it exists + if ('particle_error' %in% names(traj)) { + + # Write particle_error dataset + rhdf5::h5write(traj$particle_error, fid, 'particle_error') + + # Create particle_error_param group + rhdf5::h5createGroup(fid, 'particle_error_params') + particle_error_params <- rhdf5::H5Gopen(fid, 'particle_error_params') + for (p in names(traj$particle_error_params)) { + val <- traj$particle_error_params[[p]] + if (is.na(val)) val <- 'NA' + rhdf5::h5writeAttribute(val, particle_error_params, p) + } + rhdf5::H5Gclose(particle_error_params) + } + + # Close file + rhdf5::H5Fclose(fid) + + return(file) + } + + return(traj) +}