Added assign_coords to forcing sources with daily data
Jesus Perez Curbelo (ame805) committed Aug 23, 2024
1 parent f1b0f15 commit 3352214
Showing 6 changed files with 117 additions and 1,417 deletions.
4 changes: 0 additions & 4 deletions camels_spat2nh-1058854.out

This file was deleted.

4 changes: 4 additions & 0 deletions camels_spat2nh-1123653.out
@@ -0,0 +1,4 @@
+Unusable basins: 13
+{'05RE002', '08LF023', '08MG022', '07QD002', '06DA001', '08MG020', '08AA007', '08LD003', '09AE002', '08KH011', '09AA004', '07SB017', '07BJ006'}
+Basins to process: 1698
+Processing USA...
23 changes: 15 additions & 8 deletions camels_spat2nh.py
@@ -79,7 +79,12 @@ def camels_spat2nh(data_dir, data_gen, unusuable_basins):
     ## Process data for each basin and save to csv file
     for country in countries[:]:
         # Create a folder for each country
-        country_dir = os.path.join(data_dir_out, f'CAMELS_spat_{country}_{len(data_sources)}sources')
+        # Check if only testing
+        if ONLY_TESTING:
+            country_dir = os.path.join(data_dir_out, f'CAMELS_spat_{country}_testing')
+        else:
+            country_dir = os.path.join(data_dir_out, f'CAMELS_spat_{country}_{len(data_sources)}sources')
 
         if not os.path.exists(country_dir ):
             os.makedirs(country_dir)
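
For context, a minimal standalone sketch of the directory-naming switch above; the concrete values of ONLY_TESTING, data_dir_out, data_sources, and country are hypothetical stand-ins for what the script defines elsewhere:

import os

# Hypothetical stand-ins; the real script defines these elsewhere
ONLY_TESTING = True
data_dir_out = '/tmp/camels_spat_out'
data_sources = ['ERA5', 'EM_EARTH', 'daymet', 'RDRS']
country = 'USA'

# Test runs write to a separate folder so they never clobber full outputs
suffix = 'testing' if ONLY_TESTING else f'{len(data_sources)}sources'
country_dir = os.path.join(data_dir_out, f'CAMELS_spat_{country}_{suffix}')
os.makedirs(country_dir, exist_ok=True)  # exist_ok=True replaces the exists() check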

@@ -142,9 +147,9 @@ def processBasinSave2CSV(basin_f, basin_data_path, country_dir,

         print(f'{src}_files', len(eras_files), '->', folder2load)
 
-        # Check if only testing
-        if ONLY_TESTING:
-            continue
+        # # Check if only testing
+        # if ONLY_TESTING:
+        #     continue
 
         # Check whether there are files to load
         if len(eras_files) == 0:
@@ -179,9 +184,9 @@ def processBasinSave2CSV(basin_f, basin_data_path, country_dir,
             df_src_dict[src] = basin_data_df
 
 
-        # Check if only testing
-        if ONLY_TESTING:
-            return None
+        # # Check if only testing
+        # if ONLY_TESTING:
+        #     return None
 
     print('basin', basin_f, '->', df_src_dict.keys())
     # Check if there are len(data_sources) data sources in df_src_dict.keys() (expected ERA5, EM_EARTH, daymet, and RDRS)
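
The comment above expects all four forcing sources to be present before merging. A hedged sketch of such a completeness guard; the guard itself is not in the commit, and only the names data_sources and df_src_dict come from the diff:

# Hypothetical guard; only the names data_sources and df_src_dict come from the diff
data_sources = ['ERA5', 'EM_EARTH', 'daymet', 'RDRS']
df_src_dict = {'ERA5': ..., 'EM_EARTH': ..., 'daymet': ..., 'RDRS': ...}

missing = set(data_sources) - set(df_src_dict)
if missing:
    # Skip the basin rather than writing a CSV with partial forcings
    print(f'Skipping basin: missing sources {missing}')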
@@ -224,12 +229,14 @@ def processBasinSave2CSV(basin_f, basin_data_path, country_dir,
     df_target.rename(columns={'time': 'date'}, inplace=True)
     # Remove duplicates
     df_target = df_target.drop_duplicates(subset=['date'])
+
+    # print('df_target', df_target.head())
 
     # Merge input and target dataframes
     df_merged = df_merged_inp.merge(df_target, on='date')
 
 
-    # print('df_merged', df_merged_inp.head())
+    # print('df_merged', df_merged.head())
     # # Print data_vars
     # for var in df_merged.columns:
     #     print(var)
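
A runnable sketch of the rename / de-duplicate / merge sequence above, with tiny synthetic frames standing in for the real forcing and streamflow data:

import pandas as pd

# Synthetic stand-ins for the merged forcing inputs and the streamflow target
df_merged_inp = pd.DataFrame({'date': pd.date_range('1980-01-01', periods=3),
                              'precip_era5': [1.0, 2.0, 3.0]})
df_target = pd.DataFrame({'time': pd.date_range('1980-01-01', periods=4),
                          'q_obs': [0.5, 0.6, 0.7, 0.8]})

# Same steps as the diff: rename time to date, drop duplicate dates, merge
df_target = df_target.rename(columns={'time': 'date'})
df_target = df_target.drop_duplicates(subset=['date'])
df_merged = df_merged_inp.merge(df_target, on='date')
print(df_merged)  # only dates present in both frames survive the inner merge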
1,493 changes: 94 additions & 1,399 deletions camels_spat_exploring_forcings.ipynb

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions utils/data_dir.yml
@@ -4,10 +4,6 @@ data_dir_camels_spat: /project/gwf/gwf_cmt/wknoben/camels_spat/camels-spat-data
 relative_path_forcing: forcing/lumped
 relative_path_target: observations
 
-# countries:
-#   - USA
-#   - CAN
-
 camels_spat_metadata: camels_spat_metadata.csv
 camels_spat_unusable: camels_spat_unusable.csv

6 changes: 4 additions & 2 deletions utils/utils.py
@@ -22,6 +22,8 @@ def reduceDataByDay(dataset, set_vars, sum_vars, input_vars_repeated, forcing_sr
     day_dates = pd.to_datetime(dataset.coords["time"].values).normalize()
     day_dates = xr.DataArray(day_dates, name="time", dims="time")
 
+    print('day_dates', day_dates)
+
     # Group by day and apply appropriate reduction method for each variable
     daily_data = xr.Dataset()
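
For reference, a self-contained sketch of the daily reduction that reduceDataByDay applies to sub-daily sources, here written with resample rather than the groupby used in the function; variable names and values are illustrative:

import numpy as np
import pandas as pd
import xarray as xr

# Hourly ERA5-style forcing spanning two days (illustrative values)
times = pd.date_range('1980-01-01', periods=48, freq='h')
ds = xr.Dataset({'precip': ('time', np.random.rand(48)),       # flux: sum to daily
                 'temp': ('time', 270 + np.random.rand(48))},  # state: mean to daily
                coords={'time': times})

daily = xr.Dataset()
daily['precip'] = ds['precip'].resample(time='1D').sum()
daily['temp'] = ds['temp'].resample(time='1D').mean()
print(daily['time'].values)  # two midnight-stamped days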

@@ -41,8 +43,8 @@ def reduceDataByDay(dataset, set_vars, sum_vars, input_vars_repeated, forcing_sr

         # Check if the frequency is daily - daymet
         if inferred_frequency == pd.Timedelta(days=1) and variable in variable in set_vars:
-            # Do not aggregate
-            daily_data[var] = dataset[variable]
+            # Do not aggregate and bring to the day dimension: 1980-01-01 12:00:00 to be 1980-01-01
+            daily_data[var] = dataset[variable].assign_coords(time=day_dates)
         else:
             if variable in sum_vars:
                 # print('sum', variable)
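
The assign_coords change is the point of this commit: daymet is already daily but stamped at noon, so it is not aggregated and its timestamps only need to be moved onto the day grid. A minimal sketch of that behaviour, with illustrative values:

import numpy as np
import pandas as pd
import xarray as xr

# Daymet-style daily data stamped at 12:00 (illustrative values)
times = pd.date_range('1980-01-01 12:00', periods=3, freq='D')
da = xr.DataArray(np.array([1.0, 2.0, 3.0]), coords={'time': times}, dims='time')

# Normalize the timestamps to midnight, as reduceDataByDay builds day_dates
day_dates = xr.DataArray(pd.to_datetime(da['time'].values).normalize(),
                         name='time', dims='time')

# Replace the time coordinate without touching the data:
# 1980-01-01 12:00:00 becomes 1980-01-01 00:00:00
da_daily = da.assign_coords(time=day_dates)
print(da_daily['time'].values)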
