improved v3 ETL options, additional error capture, updated tests
Ian Pendleton committed May 21, 2020
1 parent 1a5c57e · commit 6babc3e
Showing 9 changed files with 92 additions and 787 deletions.
1 change: 0 additions & 1 deletion dataset_rename.json
@@ -4,7 +4,6 @@
"comment3" : "if no grouping exists for a specified dataset, code will target 'default'",
"comment4" : "if a column does not have a suitable prefix it will be ommitted from the dataset",
"comment5" : "all columns (including undefined) can be viewed by toggling 'raw' from the command line",
"comment6" : "THIS RENAME WILL BE APPLIED TWICE, once immediately after generatign report, and again after all other functions",
"default" : {
"columns" : {}
},
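For reference, a minimal sketch of the shape this file takes, based on the comments above and the matching logic in renamer() below; the group key, dataset name, and column names here are hypothetical:

{
  "default" : {
    "columns" : {}
  },
  "group_example" : {
    "datasets" : ["dev"],
    "columns" : {
      "_raw_example_old" : "_rxn_example_new"
    }
  }
}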
7 changes: 6 additions & 1 deletion expworkup/handlers/calcs.py
@@ -168,7 +168,12 @@ def evaluation_pipeline(all_targets, debug_bool):
if description == 'null':
modlog.info(f'For {entry_name}, "description" was set to a default of "null"')

value_column = all_targets.apply(lambda x: df_simple_eval(command, variables, x), axis=1)
try:
value_column = all_targets.apply(lambda x: df_simple_eval(command, variables, x), axis=1)
except SyntaxError:
modlog.warning(f'For "{entry_name}", simpleeval failed to parse the specified command; please check the specification or debug the code!')
warnlog.warning(f'For "{entry_name}", simpleeval failed to parse the specified command; please check the specification or debug the code!')
value_column = pd.Series([np.nan]*len(all_targets), index=all_targets.index)

value_column.rename(header_name, inplace=True)
value_column.fillna(value=fill_value, inplace=True)
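A minimal, runnable sketch of the fallback pattern above, using a toy dataframe and approximating df_simple_eval with simpleeval's simple_eval (an assumption about the underlying evaluator); the column names and command are hypothetical:

import numpy as np
import pandas as pd
from simpleeval import simple_eval

all_targets = pd.DataFrame({'acid_molarity': [1.0, 2.0], 'base_molarity': [4.0, 5.0]})
command = 'acid_molarity / base_molarity'  # hypothetical calc command

try:
    # evaluate the command once per row, exposing row values by column name
    value_column = all_targets.apply(
        lambda row: simple_eval(command, names=row.to_dict()), axis=1)
except SyntaxError:
    # same recovery as above: emit an all-NaN column aligned to the source index
    value_column = pd.Series([np.nan] * len(all_targets), index=all_targets.index)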
2 changes: 1 addition & 1 deletion expworkup/handlers/cleaner.py
@@ -45,7 +45,7 @@ def cleaner(clean_df, raw_bool_cli):
calc_df = clean_df.filter(like='_calc_')
proto_df = clean_df.filter(like='_prototype_')

if raw_bool_cli == 1:
if raw_bool_cli:
squeaky_clean_df = clean_df
else:
squeaky_clean_df = pd.concat([out_df, rxn_df,
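The column selection in cleaner() relies on pandas' substring filtering; a small sketch with hypothetical column names:

import pandas as pd

clean_df = pd.DataFrame([[45.0, 'jdoe', 0.5]],
                        columns=['_rxn_temperatureC', '_raw_operator', '_calc_ratio'])
rxn_df = clean_df.filter(like='_rxn_')    # keeps columns whose name contains '_rxn_'
calc_df = clean_df.filter(like='_calc_')  # keeps ['_calc_ratio']
print(list(rxn_df.columns), list(calc_df.columns))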
63 changes: 46 additions & 17 deletions expworkup/jsonparser.py
@@ -20,7 +20,7 @@
modlog = logging.getLogger(f'mainlog.{__name__}')
warnlog = logging.getLogger(f'warning.{__name__}')

def renamer(dirty_df, dataset_list):
def renamer(dirty_df, dataset_list, raw_bool_cli):
"""Eats dirty datasets and renames them according to dataset_rename.json
Imports the user level json file and uses the dictionary to
@@ -38,6 +38,10 @@ def renamer(dirty_df, dataset_list):
dataset_rename.json is used to determine if a report_df
qualifies for a rename set. If all datasets in dataset_list
are not in a dataset_rename, this process will return dirty_df.
raw_bool_cli : cli argument,
if True, the extended dataframe is returned, including the
superfluous columns used in data handling
Returns
--------
@@ -47,21 +51,45 @@
that doesn't exist in dataset_rename.json, just make that combination
and all the selected renames.
"""
try:
with open("dataset_rename.json", "r") as read_file:
rename_dict = json.load(read_file)
except FileNotFoundError:
modlog.error("dataset_rename.json was not found, please redownload from ESCALATE_report")
import sys
sys.exit()

for key, name_dict in rename_dict.items():
if 'group' in key:
if all(elem in name_dict['datasets'] for elem in dataset_list):
for old_name, new_name in name_dict['columns'].items():
dirty_df.rename(columns={old_name : new_name}, inplace=True)

if not raw_bool_cli:
try:
with open("dataset_rename.json", "r") as read_file:
rename_dict = json.load(read_file)
except FileNotFoundError:
modlog.error("dataset_rename.json was not found, please redownload from ESCALATE_report")
import sys
sys.exit()

for key, name_dict in rename_dict.items():
if 'group' in key:
if all(elem in name_dict['datasets'] for elem in dataset_list):
for old_name, new_name in name_dict['columns'].items():
dirty_df.rename(columns={old_name : new_name}, inplace=True)
else:
modlog.info('Renaming was turned off for this run; columns will not all follow the naming scheme')
warnlog.info('Renaming was turned off for this run; columns will not all follow the naming scheme')

# alert the user to columns which do not fit the orderly naming scheme
nonconformist_columns = []
# runid_vial is protected for renaming downstream
expected_prefixes = ['_rxn_', '_out_', '_calc_', '_feat_', '_raw_', '_prototype_', 'runid_vial']
for x in dirty_df.columns:
if not any(y in x for y in expected_prefixes):
nonconformist_columns.append(x)

unnamed_export_file = 'UNAMED_REPORT_COLUMNS.txt'
# Remove any previous copy so stale output isn't confusing
if os.path.exists(unnamed_export_file):
os.remove(unnamed_export_file)
if len(nonconformist_columns) > 0:
modlog.info(f'Columns not fitting the naming scheme were written to: {unnamed_export_file}')
print(f'Columns not fitting the naming scheme were written to: {unnamed_export_file}')
print(' The USER can define the column names in dataset_rename.json')
with open(unnamed_export_file, 'w') as my_file:
for x in nonconformist_columns:
print(x, file=my_file)
clean_df = dirty_df

return(clean_df)

def unpackJSON(target_naming_scheme, chemdf_dict):
@@ -150,12 +178,13 @@ def json_pipeline(target_naming_scheme, raw_bool_cli, chemdf_dict, dataset_list)
modlog.info('%s loaded with JSONs for parsing, starting' %target_naming_scheme)

raw_df = unpackJSON(target_naming_scheme, chemdf_dict)
renamed_raw_df = renamer(raw_df, dataset_list)
renamed_raw_df = renamer(raw_df, dataset_list, raw_bool_cli)
report_df = cleaner(renamed_raw_df, raw_bool_cli)
report_df['name'] = raw_df['runid_vial']

report_df['name'] = raw_df['runid_vial']
report_df.replace('null', np.nan, inplace=True)
report_df.replace('', np.nan, inplace=True)
report_df.replace(' ', np.nan, inplace=True)

return(report_df)
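
The trailing replace calls above normalize the three sentinel tokens to NaN; a toy illustration:

import numpy as np
import pandas as pd

report_df = pd.DataFrame({'value': ['null', '', ' ', '0.5']})
for token in ('null', '', ' '):
    report_df.replace(token, np.nan, inplace=True)
print(report_df['value'].isna().sum())  # 3 -- only the real reading survives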

24 changes: 14 additions & 10 deletions runme.py
@@ -24,7 +24,7 @@
from utils import globals
from utils.globals import (
set_target_folder_name, set_log_folder, set_offline_folder,
get_target_folder, get_log_folder, get_offline_folder
set_debug_simple, get_target_folder, get_log_folder, get_offline_folder
)

__version__ = 1.1 #should match latest HISTORY.md entry
@@ -158,6 +158,7 @@ def main_pipeline(args):
initialize(args)
dataset_list = args.d
offline_toggle = args.offline
set_debug_simple(args.debugsimple)
raw_bool = args.raw
#Load logging information
set_log_folder(f'{args.local_directory}/logging') # folder for logs
Expand Down Expand Up @@ -201,7 +202,7 @@ def main_pipeline(args):
if args.debug:
# Export dataframes of initial parsing and chemical inventories for ETL to ESCALATEV3
report_csv_filename = f'REPORT_{get_target_folder().upper()}.csv'
write_debug_file(report_df, report_csv_filename)
write_debug_file(report_df, report_csv_filename, write_index=True) # keep write_index=True to generate the files used for testing
for name, chemicaldf in chemdf_dict.items():
inventory_name = f'REPORT_{name.upper()}_INVENTORY.csv'
write_debug_file(chemicaldf, inventory_name)
@@ -290,9 +291,18 @@ def parse_args(args):
||default = 4-Data-Iodides||" %possible_targets)
parser.add_argument('--raw', type=bool, default=False, choices=[True, False],
help='final dataframe is printed with all raw values\
included ||default = 1||')
included ||default = False||')
parser.add_argument('--disablecalcs', type=bool, default=False, choices=[True, False],
help='if true, disables escalate calculations (calc_command.json)')
help='if True, disables escalate calculations (calc_command.json) ||default = False||')
parser.add_argument('--debug', type=bool, default=False, choices=[True, False],
help="exports all dataframe intermediates as 'REPORT_'-prefixed\
csv files with default names")
parser.add_argument('--debugsimple', type=bool, default=False, choices=[True, False],
help="removes the header and footer from the 'REPORT_' \
csvfiles exported with the --debug option. If debug is false, this won't do anything.")
parser.add_argument('--offline', type=int, default=0, choices=[0,1,2],
help="|| Default = 0 || First iteration, set to '1' to save files locally \
second iteration, set to '2' to load local files and continue")
parser.add_argument('--verdata', type=str,
help='Enter numerical value such as "0001". Generates <0001>.perovskitedata.csv output\
in a form ready for upload to the versioned data repo ||default = None||')
@@ -302,12 +312,6 @@
parser.add_argument('--state', type=str,
help='title of the state set file to be used as the state set for \
this iteration of the challenge problem; no entry results in no processing')
parser.add_argument('--debug', type=bool, default=False, choices=[True, False],
help="exports all dataframe intermediates as 'REPORT_'-prefixed\
csv files with default names")
parser.add_argument('--offline', type=int, default=0, choices=[0,1,2],
help="|| Default = 0 || On the first iteration, set to '1' to save files locally; \
on the second iteration, set to '2' to load the local files and continue")
return parser.parse_args(args)


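One caveat about the boolean flags above: argparse applies type=bool to the raw string token, and bool() of any non-empty string is True, so even '--raw False' parses as True; only omitting the flag yields the False default. A minimal demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--raw', type=bool, default=False)
print(parser.parse_args(['--raw', 'False']).raw)  # True -- bool('False') is truthy
print(parser.parse_args([]).raw)                  # False -- only the default is False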
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -17,7 +17,7 @@

global TEST_TARGET
global ARGS_LIST
TEST_TARGET = 'tests/devreport_20200511.csv'
TEST_TARGET = 'tests/devreport_20200521.csv'
ARGS_LIST = ['testing', '-d', 'dev', '--raw', '1']

@pytest.fixture(scope='module')
751 changes: 0 additions & 751 deletions tests/devreport_20200511.csv

This file was deleted.

13 changes: 8 additions & 5 deletions utils/file_handling.py
@@ -3,7 +3,7 @@
import pandas as pd

from expworkup.devconfig import valid_input_files, workup_targets, lab_vars
from utils.globals import get_debug_header
from utils.globals import get_debug_header, get_debug_simple

def get_interface_filename(interface_type, working_directory, runID):
""" Searches for filename match and returns instance
@@ -51,13 +51,16 @@ def get_experimental_run_lab(run_filename):

raise RuntimeError(f'{run_filename} does not specify a supported lab')

def write_debug_file(df, filename):
def write_debug_file(df, filename, write_index=True):
if os.path.isfile(filename):
os.remove(filename)
f = open(filename, 'a')
f.write(get_debug_header())
df.to_csv(f)
f.write(get_debug_header())
if not get_debug_simple():
f.write(get_debug_header())
df.to_csv(f, index=write_index)
f.write(get_debug_header())
else:
df.to_csv(f, index=write_index)
f.close()


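A standalone sketch of the two output modes of write_debug_file, with a placeholder string standing in for get_debug_header() and a hardcoded flag standing in for get_debug_simple():

import pandas as pd

df = pd.DataFrame({'name': ['runid_vial_example'], '_rxn_temperatureC': [45.0]})
header = 'debug header placeholder\n'  # stand-in for get_debug_header()
debug_simple = False                   # stand-in for get_debug_simple()

with open('REPORT_EXAMPLE.csv', 'w') as f:
    if not debug_simple:
        f.write(header)           # header/footer sandwich for human inspection
        df.to_csv(f, index=True)
        f.write(header)
    else:
        df.to_csv(f, index=True)  # bare csv, easier to load back in tests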
16 changes: 16 additions & 0 deletions utils/globals.py
@@ -23,6 +23,8 @@
LOG_DIRECTORY_SET = False
OFFLINE_FOLDER = None
OFFLINE_FOLDER_SET = False
DEBUG_SIMPLE = None
DEBUG_SIMPLE_SET = False

_DEBUG_HEADER = None
_DEBUG_SET = False
@@ -132,3 +134,17 @@ def get_offline_folder():
sys.exit(1)
return OFFLINE_FOLDER

def set_debug_simple(debug_simple_arg):
global DEBUG_SIMPLE, DEBUG_SIMPLE_SET
if DEBUG_SIMPLE_SET:
modlog.error('dev tried to set simple debug more than once!')
sys.exit(1)
DEBUG_SIMPLE = debug_simple_arg
DEBUG_SIMPLE_SET = True

def get_debug_simple():
if DEBUG_SIMPLE is None:
modlog.error('get_debug_simple called before set_debug_simple')
sys.exit(1)
return DEBUG_SIMPLE
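
Usage follows the same set-once/get pattern as the other globals in this module (a sketch; in practice main_pipeline performs the set):

from utils.globals import set_debug_simple, get_debug_simple

set_debug_simple(True)       # called once, early in main_pipeline
assert get_debug_simple()    # readable anywhere afterwards
# set_debug_simple(False)    # a second call would log an error and sys.exit(1)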
