From fd10847b13e6127f93e6a69e5931d1eb74c7031e Mon Sep 17 00:00:00 2001
From: bnb32
Date: Thu, 4 Jan 2024 09:57:48 -0700
Subject: [PATCH 1/2] path updates for kestrel runs

---
 .../pipeline_legacy/config_east_nsrdb.json    |  4 ++--
 .../pipeline_legacy/config_east_pipeline.json | 10 ++++----
 .../config_west_conus_nsrdb.json              |  6 ++---
 .../config_west_conus_pipeline.json           | 10 ++++----
 .../templates/config_nsrdb_pre2018.json       | 24 +++++++++----------
 nsrdb/utilities/extract_surfrad.py            | 13 +++++-----
 nsrdb/utilities/file_utils.py                 | 24 +++++++++----------
 nsrdb/utilities/movers.py                     | 10 ++++----
 nsrdb/utilities/update_nsrdb_versions.py      |  5 ++--
 9 files changed, 52 insertions(+), 54 deletions(-)

diff --git a/nsrdb/config/pipeline_legacy/config_east_nsrdb.json b/nsrdb/config/pipeline_legacy/config_east_nsrdb.json
index c1597325..796f6808 100755
--- a/nsrdb/config/pipeline_legacy/config_east_nsrdb.json
+++ b/nsrdb/config/pipeline_legacy/config_east_nsrdb.json
@@ -44,12 +44,12 @@
         "name": "nsrdb_east",
         "nsrdb_freq": "10min",
         "nsrdb_grid": "/projects/pxs/reference_grids/nsrdb_meta_2km_east_-105.csv",
-        "out_dir": "/lustre/eaglefs/projects/pxs/processing/2019/east/",
+        "out_dir": "/projects/pxs/processing/2019/east/",
         "year": 2019
     },
     "eagle": {
         "alloc": "pxs",
-        "feature": "--qos=high",
+        "feature": "--qos=normal",
         "memory": 178,
         "walltime": 48
     }
diff --git a/nsrdb/config/pipeline_legacy/config_east_pipeline.json b/nsrdb/config/pipeline_legacy/config_east_pipeline.json
index 6113113c..016e69a7 100755
--- a/nsrdb/config/pipeline_legacy/config_east_pipeline.json
+++ b/nsrdb/config/pipeline_legacy/config_east_pipeline.json
@@ -6,19 +6,19 @@
     "name": "east",
     "pipeline": [
         {
-            "data-model": "/lustre/eaglefs/projects/pxs/processing/2019/east/config_nsrdb_east.json"
+            "data-model": "/projects/pxs/processing/2019/east/config_nsrdb_east.json"
         },
         {
-            "collect-data-model": "/lustre/eaglefs/projects/pxs/processing/2019/east/config_nsrdb_east.json"
+            "collect-data-model": "/projects/pxs/processing/2019/east/config_nsrdb_east.json"
         },
         {
-            "cloud-fill": "/lustre/eaglefs/projects/pxs/processing/2019/east/config_nsrdb_east.json"
+            "cloud-fill": "/projects/pxs/processing/2019/east/config_nsrdb_east.json"
         },
         {
-            "all-sky": "/lustre/eaglefs/projects/pxs/processing/2019/east/config_nsrdb_east.json"
+            "all-sky": "/projects/pxs/processing/2019/east/config_nsrdb_east.json"
         },
         {
-            "collect-final": "/lustre/eaglefs/projects/pxs/processing/2019/east/config_nsrdb_east.json"
+            "collect-final": "/projects/pxs/processing/2019/east/config_nsrdb_east.json"
         }
     ]
 }
diff --git a/nsrdb/config/pipeline_legacy/config_west_conus_nsrdb.json b/nsrdb/config/pipeline_legacy/config_west_conus_nsrdb.json
index 71543a61..c1c413e5 100755
--- a/nsrdb/config/pipeline_legacy/config_west_conus_nsrdb.json
+++ b/nsrdb/config/pipeline_legacy/config_west_conus_nsrdb.json
@@ -41,13 +41,13 @@
         "log_level": "DEBUG",
         "name": "nsrdb_west_conus",
         "nsrdb_freq": "5min",
-        "nsrdb_grid": "/lustre/eaglefs/projects/pxs/reference_grids/nsrdb_meta_2km_conus_west.csv",
-        "out_dir": "/lustre/eaglefs/projects/pxs/processing/2019/west_conus/",
+        "nsrdb_grid": "/projects/pxs/reference_grids/nsrdb_meta_2km_conus_west.csv",
+        "out_dir": "/projects/pxs/processing/2019/west_conus/",
         "year": 2019
     },
     "eagle": {
         "alloc": "pxs",
-        "feature": "--qos=high",
+        "feature": "--qos=normal",
         "memory": 83,
         "walltime": 4
     }
diff --git a/nsrdb/config/pipeline_legacy/config_west_conus_pipeline.json b/nsrdb/config/pipeline_legacy/config_west_conus_pipeline.json
index 92c62668..1b632b3d 100755
--- a/nsrdb/config/pipeline_legacy/config_west_conus_pipeline.json
+++ b/nsrdb/config/pipeline_legacy/config_west_conus_pipeline.json
@@ -6,19 +6,19 @@
     "name": "west_conus",
     "pipeline": [
         {
-            "data-model": "/lustre/eaglefs/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
+            "data-model": "/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
         },
         {
-            "collect-data-model": "/lustre/eaglefs/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
+            "collect-data-model": "/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
         },
         {
-            "cloud-fill": "/lustre/eaglefs/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
+            "cloud-fill": "/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
         },
         {
-            "all-sky": "/lustre/eaglefs/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
+            "all-sky": "/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
         },
         {
-            "collect-final": "/lustre/eaglefs/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
+            "collect-final": "/projects/pxs/processing/2019/west_conus/config_nsrdb_west_conus.json"
         }
     ]
 }
diff --git a/nsrdb/config/templates/config_nsrdb_pre2018.json b/nsrdb/config/templates/config_nsrdb_pre2018.json
index 919ba90e..0f686e79 100755
--- a/nsrdb/config/templates/config_nsrdb_pre2018.json
+++ b/nsrdb/config/templates/config_nsrdb_pre2018.json
@@ -13,43 +13,43 @@
     ],
     "factory_kwargs": {
         "cld_opd_dcomp": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "cld_press_acha": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "cld_reff_dcomp": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "cloud_fraction": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "cloud_probability": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "cloud_type": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "refl_0_65um_nom": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "refl_0_65um_nom_stddev_3x3": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "refl_3_75um_nom": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
         },
         "surface_albedo": {
             "source_dir": "/projects/pxs/ancillary/albedo/nsrdb_%year%/"
         },
         "temp_11_0um_nom": {
-            "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
+            "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5"
        },
"temp_11_0um_nom_stddev_3x3": { - "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5" + "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5" }, "temp_3_75um_nom": { - "pattern": "/lustre/eaglefs/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5" + "pattern": "/projects/pxs/HDF/%satellite%/%year%/{doy}/level2/*_%year%_*.h5" } }, "max_workers": null, diff --git a/nsrdb/utilities/extract_surfrad.py b/nsrdb/utilities/extract_surfrad.py index 8e2bd886..3cf0c82f 100755 --- a/nsrdb/utilities/extract_surfrad.py +++ b/nsrdb/utilities/extract_surfrad.py @@ -4,14 +4,13 @@ @author: gbuster """ +import os + import h5py import numpy as np -import os import pandas as pd - from farms import SZA_LIM - DAT_COLS = ('year', 'jday', 'month', @@ -104,7 +103,7 @@ def get_dat_table(d, flist): table = [] # get readlines iterator - with open(os.path.join(d, fname), 'r') as f: + with open(os.path.join(d, fname)) as f: lines = f.readlines() # iterate through lines @@ -161,7 +160,7 @@ def get_lw1_table(d, flist): for i, fname in enumerate(flist): # get readlines iterator - with open(os.path.join(d, fname), 'r') as f: + with open(os.path.join(d, fname)) as f: lines = f.readlines() # iterate through lines @@ -302,7 +301,7 @@ def extract_all(root_dir, dir_out, years=range(1998, 2018), file_flag='.dat', if __name__ == '__main__': - root_dir = '/lustre/eaglefs/projects/pxs/surfrad/raw' - dir_out = '/lustre/eaglefs/projects/pxs/surfrad/h5' + root_dir = '/projects/pxs/surfrad/raw' + dir_out = '/projects/pxs/surfrad/h5' site_codes = ('bon', 'dra', 'fpk', 'gwn', 'psu', 'sxf', 'tbl') extract_all(root_dir, dir_out, site_codes=site_codes) diff --git a/nsrdb/utilities/file_utils.py b/nsrdb/utilities/file_utils.py index f095a95d..2df87b76 100755 --- a/nsrdb/utilities/file_utils.py +++ b/nsrdb/utilities/file_utils.py @@ -3,26 +3,24 @@ @author: gbuster """ -from concurrent.futures import as_completed import gzip import logging import os +import re import shlex import shutil -import re -from subprocess import Popen, PIPE, run import time -from urllib.request import urlopen +from concurrent.futures import as_completed +from subprocess import PIPE, Popen, run from urllib.error import URLError +from urllib.request import urlopen + +import numpy as np import pandas as pd from packaging import version - - from rex.utilities.execution import SpawnProcessPool from rex.utilities.loggers import init_logger -import numpy as np - logger = logging.getLogger(__name__) DIR = os.path.dirname(os.path.realpath(__file__)) @@ -210,7 +208,7 @@ def convert_h4(path4, f_h4, path5, f_h5): h5 = os.path.join(path5, f_h5) if not os.path.exists(h4): - raise IOError('Could not locate file for conversion to h5: {}' + raise OSError('Could not locate file for conversion to h5: {}' .format(h4)) if os.path.exists(h5): logger.info('Target h5 file already exists, may have already been ' @@ -281,7 +279,7 @@ def convert_list_serial(conversion_list): """Convert h4 to h5 files in serial based on the conversion list. Parameters - ------- + ---------- conversion_list : list List of paths and files to convert for input to convert4to5. Format is: conversion_list = [[path4, f_h4, path5, f_h5], ...] @@ -297,7 +295,7 @@ def convert_list_parallel(conversion_list, n_workers=2): """Convert h4 to h5 files in parallel based on the conversion list. Parameters - ------- + ---------- conversion_list : list List of paths and files to convert for input to convert4to5. 
         Format is: conversion_list = [[path4, f_h4, path5, f_h5], ...]
@@ -364,8 +362,8 @@
 
 
 if __name__ == '__main__':
-    path4 = '/lustre/eaglefs/projects/pxs/uwisc/2018_west/'
-    path5 = '/lustre/eaglefs/projects/pxs/uwisc/2018_west_h5/'
+    path4 = '/projects/pxs/uwisc/2018_west/'
+    path5 = '/projects/pxs/uwisc/2018_west_h5/'
     init_logger(__name__, log_level='INFO',
                 log_file=os.path.join(path5, 'convert.log'))
     convert_directory(path4, path5, n_workers=36)
diff --git a/nsrdb/utilities/movers.py b/nsrdb/utilities/movers.py
index be35eb9b..5a6bec8d 100755
--- a/nsrdb/utilities/movers.py
+++ b/nsrdb/utilities/movers.py
@@ -3,15 +3,15 @@
 
 @author: gbuster
 """
-import h5py
 import logging
-import numpy as np
 import os
-import pandas as pd
 import time
 from warnings import warn
 
-from rex.utilities.hpc import SLURM, PBS
+import h5py
+import numpy as np
+import pandas as pd
+from rex.utilities.hpc import PBS, SLURM
 from rex.utilities.loggers import init_logger
 
 from nsrdb.utilities.file_utils import repack_h5
@@ -421,7 +421,7 @@ def peregrine(fun_str, arg_str, alloc='pxs', queue='batch-h',
 
 def eagle(fun_str, arg_str, alloc='pxs', memory=96,
           walltime=10, node_name='mover',
-          stdout_path='/lustre/eaglefs/scratch/gbuster/data_movers/'):
+          stdout_path='/scratch/gbuster/data_movers/'):
     """Kick off an eagle job to execute a mover function.
 
     Parameters
diff --git a/nsrdb/utilities/update_nsrdb_versions.py b/nsrdb/utilities/update_nsrdb_versions.py
index fafb617a..5c5e07c6 100755
--- a/nsrdb/utilities/update_nsrdb_versions.py
+++ b/nsrdb/utilities/update_nsrdb_versions.py
@@ -3,10 +3,11 @@
 @author: gbuster
 """
-import h5py
 import os
 
-root_dir = '/lustre/eaglefs/projects/pxs/nsrdb/v3.0.1/'
+import h5py
+
+root_dir = '/projects/pxs/nsrdb/v3.0.1/'
 
 versions = {"nsrdb_1998.h5": "3.0.6",
             "nsrdb_1999.h5": "3.0.6",

From b3d684ab702980b5184cdcd3e89f0609ed72507e Mon Sep 17 00:00:00 2001
From: bnb32
Date: Thu, 4 Jan 2024 10:06:10 -0700
Subject: [PATCH 2/2] linting

---
 nsrdb/utilities/extract_surfrad.py | 196 ++++++++++++++++-------
 1 file changed, 108 insertions(+), 88 deletions(-)

diff --git a/nsrdb/utilities/extract_surfrad.py b/nsrdb/utilities/extract_surfrad.py
index 3cf0c82f..611c2d42 100755
--- a/nsrdb/utilities/extract_surfrad.py
+++ b/nsrdb/utilities/extract_surfrad.py
@@ -11,38 +11,38 @@
 import pandas as pd
 from farms import SZA_LIM
 
-DAT_COLS = ('year',
-            'jday',
-            'month',
-            'day',
-            'hour',
-            'min',
-            'dt',
-            'zen',
-            'dw_solar',
-            'qc_dwsolar',
-            'uw_solar',
-            'qc_uw_solar',
-            'direct_n',
-            'qc_direct_n',
-            'diffuse',
-            'qc_diffuse',
-            )
-
-DAT_MAPPING = {'dw_solar': 'ghi',
-               'direct_n': 'dni',
-               'diffuse': 'dhi',
-               'zen': 'sza'}
-
-LW1_MAPPING = {'swdn': 'ghi',
-               'dirsw': 'dni',
-               'difsw': 'dhi',
-               'sza': 'sza'}
+DAT_COLS = (
+    "year",
+    "jday",
+    "month",
+    "day",
+    "hour",
+    "min",
+    "dt",
+    "zen",
+    "dw_solar",
+    "qc_dwsolar",
+    "uw_solar",
+    "qc_uw_solar",
+    "direct_n",
+    "qc_direct_n",
+    "diffuse",
+    "qc_diffuse",
+)
+
+DAT_MAPPING = {
+    "dw_solar": "ghi",
+    "direct_n": "dni",
+    "diffuse": "dhi",
+    "zen": "sza",
+}
+
+LW1_MAPPING = {"swdn": "ghi", "dirsw": "dni", "difsw": "dhi", "sza": "sza"}
 
 MISSING = -999
 
 
-def filter_measurement_df(df, var_list=('dhi', 'dni', 'ghi', 'sza')):
+def filter_measurement_df(df, var_list=("dhi", "dni", "ghi", "sza")):
     """Filter the measurement dataframe.
 
     Parameters
@@ -68,12 +68,12 @@ def filter_measurement_df(df, var_list=('dhi', 'dni', 'ghi', 'sza')):
 
     for var in var_list:
         # No data can be negative
-        mask = (df[var] < 0)
+        mask = df[var] < 0
         df.loc[mask, var] = MISSING
 
-        if var in ('dhi', 'ghi'):
+        if var in ("dhi", "ghi"):
             # dhi and ghi cannot be negative or zero during the day
-            mask = (df[var] <= 0) & (df['sza'] < SZA_LIM)
+            mask = (df[var] <= 0) & (df["sza"] < SZA_LIM)
             df.loc[mask, var] = MISSING
 
     return df
@@ -108,15 +108,14 @@ def get_dat_table(d, flist):
 
         # iterate through lines
         for line in lines:
-
             # reduce multiple spaces to a single space, split columns
-            while '  ' in line:
-                line = line.replace('  ', ' ')
-            cols = line.strip(' ').split(' ')
+            while "  " in line:
+                line = line.replace("  ", " ")
+            cols = line.strip(" ").split(" ")
 
             # Set table header or append data to table
             if len(cols) > len(DAT_COLS):
-                table.append(cols[0:len(DAT_COLS)])
+                table.append(cols[0: len(DAT_COLS)])
 
         # upon finishing table concatenation, initialize annual table
         # or append to annual table
@@ -126,14 +125,16 @@ def get_dat_table(d, flist):
             annual_table += table
 
     df = pd.DataFrame(annual_table, columns=DAT_COLS)
-    df = df.rename(DAT_MAPPING, axis='columns')
-    df['time_string'] = (df['year']
-                         + df['month'].str.zfill(2)
-                         + df['day'].str.zfill(2)
-                         + df['hour'].str.zfill(2)
-                         + df['min'].str.zfill(2))
-
-    ti = pd.to_datetime(df['time_string'], format='%Y%m%d%H%M')
+    df = df.rename(DAT_MAPPING, axis="columns")
+    df["time_string"] = (
+        df["year"]
+        + df["month"].str.zfill(2)
+        + df["day"].str.zfill(2)
+        + df["hour"].str.zfill(2)
+        + df["min"].str.zfill(2)
+    )
+
+    ti = pd.to_datetime(df["time_string"], format="%Y%m%d%H%M")
     df.index = ti
     df = df.sort_index()
     return df
@@ -158,18 +159,16 @@ def get_lw1_table(d, flist):
 
     # iterate through data files
     for i, fname in enumerate(flist):
-
         # get readlines iterator
         with open(os.path.join(d, fname)) as f:
             lines = f.readlines()
 
         # iterate through lines
         for j, line in enumerate(lines):
-
             # reduce multiple spaces to a single space, split columns
-            while '  ' in line:
-                line = line.replace('  ', ' ')
-            cols = line.strip(' ').split(' ')
+            while "  " in line:
+                line = line.replace("  ", " ")
+            cols = line.strip(" ").split(" ")
 
             # Set table header or append data to table
             if j == 0:
@@ -186,18 +185,21 @@ def get_lw1_table(d, flist):
                     # make sure headers are the same
                     annual_table += table[1:]
                 else:
-                    msg = ('Headers for "{}" does not match annual table '
-                           'headers: {}'
-                           .format(os.path.join(d, fname), annual_table[0]))
+                    msg = (
+                        'Headers for "{}" do not match annual table '
+                        "headers: {}".format(
+                            os.path.join(d, fname), annual_table[0]
+                        )
+                    )
                     raise ValueError(msg)
 
     headers = [h.lower() for h in annual_table[0]]
     df = pd.DataFrame(annual_table[1:], columns=headers)
-    df = df[['zdate', 'ztim', 'cosz', 'swdn', 'dirsw', 'difsw']]
-    df['sza'] = np.arccos(df['cosz'])
-    df = df.rename(LW1_MAPPING, axis='columns')
-    df['time_string'] = df['zdate'] + ' ' + df['ztim'].str.zfill(4)
-    ti = pd.to_datetime(df['time_string'], format='%Y%m%d %H%M')
+    df = df[["zdate", "ztim", "cosz", "swdn", "dirsw", "difsw"]]
+    df["sza"] = np.arccos(df["cosz"])
+    df = df.rename(LW1_MAPPING, axis="columns")
+    df["time_string"] = df["zdate"] + " " + df["ztim"].str.zfill(4)
+    ti = pd.to_datetime(df["time_string"], format="%Y%m%d %H%M")
     df.index = ti
     df = df.sort_index()
     return df
@@ -217,31 +219,42 @@ def surfrad_to_h5(df, fout, dir_out):
     dir_out : str
         Location to save output file.
""" - with h5py.File(os.path.join(dir_out, fout), 'w') as f: - + with h5py.File(os.path.join(dir_out, fout), "w") as f: # write time index - time_index = np.array(df.index.astype(str), dtype='S20') - ds = f.create_dataset('time_index', shape=time_index.shape, - dtype=time_index.dtype, chunks=None) + time_index = np.array(df.index.astype(str), dtype="S20") + ds = f.create_dataset( + "time_index", + shape=time_index.shape, + dtype=time_index.dtype, + chunks=None, + ) ds[...] = time_index # write solar zenith angle - ds = f.create_dataset('solar_zenith_angle', shape=df['sza'].shape, - dtype=np.float16, chunks=None) - ds[...] = df['sza'].values + ds = f.create_dataset( + "solar_zenith_angle", + shape=df["sza"].shape, + dtype=np.float16, + chunks=None, + ) + ds[...] = df["sza"].values # write irraidance variables - for dset in ['dhi', 'dni', 'ghi']: - df[dset] = np.round(df[dset].astype(float))\ - .astype(np.int16) - ds = f.create_dataset(dset, shape=df[dset].shape, - dtype=df[dset].dtype, - chunks=None) + for dset in ["dhi", "dni", "ghi"]: + df[dset] = np.round(df[dset].astype(float)).astype(np.int16) + ds = f.create_dataset( + dset, shape=df[dset].shape, dtype=df[dset].dtype, chunks=None + ) ds[...] = df[dset].values -def extract_all(root_dir, dir_out, years=range(1998, 2018), file_flag='.dat', - site_codes=('bon', 'dra', 'fpk', 'gwn', 'psu', 'sxf', 'tbl')): +def extract_all( + root_dir, + dir_out, + years=range(1998, 2018), + file_flag=".dat", + site_codes=("bon", "dra", "fpk", "gwn", "psu", "sxf", "tbl"), +): """Extract all surfrad measurement data into h5 files. Parameters @@ -265,43 +278,50 @@ def extract_all(root_dir, dir_out, years=range(1998, 2018), file_flag='.dat', for site in site_codes: for year in years: - # look for target data directory d = os.path.join(root_dir, site, str(year)) # set target output filename - fout = '{}_{}.h5'.format(site, year) + fout = "{}_{}.h5".format(site, year) if not os.path.exists(d): - print('Skipping: "{}" for {}. Path does not exist: {}' - .format(site, year, d)) + print( + 'Skipping: "{}" for {}. Path does not exist: {}'.format( + site, year, d + ) + ) bad_dirs.append(d) elif os.path.exists(os.path.join(dir_out, fout)): - print('Skipping file, already exists: {}'.format(fout)) + print("Skipping file, already exists: {}".format(fout)) else: # get number of valid files in dir flist = [f for f in os.listdir(d) if file_flag in f] print('Processing "{}" for {}'.format(site, year)) - if 'dat' in file_flag: + if "dat" in file_flag: df = get_dat_table(d, flist) - elif 'lw1' in file_flag: + elif "lw1" in file_flag: df = get_lw1_table(d, flist) else: - raise('Did not recongize user-specified file flag: ' - '"{}"'.format(file_flag)) + raise ( + "Did not recongize user-specified file flag: " + '"{}"'.format(file_flag) + ) df = filter_measurement_df(df) surfrad_to_h5(df, fout, dir_out) - print('The following directories did not have valid datasets:\n{}' - .format(bad_dirs)) + print( + "The following directories did not have valid datasets:\n{}".format( + bad_dirs + ) + ) return df -if __name__ == '__main__': - root_dir = '/projects/pxs/surfrad/raw' - dir_out = '/projects/pxs/surfrad/h5' - site_codes = ('bon', 'dra', 'fpk', 'gwn', 'psu', 'sxf', 'tbl') +if __name__ == "__main__": + root_dir = "/projects/pxs/surfrad/raw" + dir_out = "/projects/pxs/surfrad/h5" + site_codes = ("bon", "dra", "fpk", "gwn", "psu", "sxf", "tbl") extract_all(root_dir, dir_out, site_codes=site_codes)