improved v3 ETL options, additional error capture, updated tests
Ian Pendleton committed May 21, 2020
1 parent 1a5c57e · commit 6babc3e
Showing 9 changed files with 92 additions and 787 deletions.
1 change: 0 additions & 1 deletion dataset_rename.json
@@ -4,7 +4,6 @@
"comment3" : "if no grouping exists for a specified dataset, code will target 'default'",
"comment4" : "if a column does not have a suitable prefix it will be ommitted from the dataset",
"comment5" : "all columns (including undefined) can be viewed by toggling 'raw' from the command line",
"comment6" : "THIS RENAME WILL BE APPLIED TWICE, once immediately after generatign report, and again after all other functions",
"default" : {
"columns" : {}
},
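For reference, a minimal sketch of the shape this file takes, based on the comments above and the matching logic in renamer() below; the group key, dataset name, and column names here are hypothetical:

{
  "default" : {
    "columns" : {}
  },
  "group_example" : {
    "datasets" : ["dev"],
    "columns" : {
      "_raw_example_old" : "_rxn_example_new"
    }
  }
}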
7 changes: 6 additions & 1 deletion expworkup/handlers/calcs.py
@@ -168,7 +168,12 @@ def evaluation_pipeline(all_targets, debug_bool):
if description == 'null':
modlog.info(f'For {entry_name}, "description" was set to a default of "null"')

value_column = all_targets.apply(lambda x: df_simple_eval(command, variables, x), axis=1)
try:
value_column = all_targets.apply(lambda x: df_simple_eval(command, variables, x), axis=1)
except SyntaxError:
modlog.warning(f'For "{entry_name}", simpleeval failed to parse the specified command; please check the specification or debug the code!')
warnlog.warning(f'For "{entry_name}", simpleeval failed to parse the specified command; please check the specification or debug the code!')
value_column = pd.Series([np.nan]*len(all_targets), index=all_targets.index)

value_column.rename(header_name, inplace=True)
value_column.fillna(value=fill_value, inplace=True)
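A minimal, runnable sketch of the fallback pattern above, using a toy dataframe and approximating df_simple_eval with simpleeval's simple_eval (an assumption about the underlying evaluator); the column names and command are hypothetical:

import numpy as np
import pandas as pd
from simpleeval import simple_eval

all_targets = pd.DataFrame({'acid_molarity': [1.0, 2.0], 'base_molarity': [4.0, 5.0]})
command = 'acid_molarity / base_molarity'  # hypothetical calc command

try:
    # evaluate the command once per row, exposing row values by column name
    value_column = all_targets.apply(
        lambda row: simple_eval(command, names=row.to_dict()), axis=1)
except SyntaxError:
    # same recovery as above: emit an all-NaN column aligned to the source index
    value_column = pd.Series([np.nan] * len(all_targets), index=all_targets.index)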
2 changes: 1 addition & 1 deletion expworkup/handlers/cleaner.py
@@ -45,7 +45,7 @@ def cleaner(clean_df, raw_bool_cli):
calc_df = clean_df.filter(like='_calc_')
proto_df = clean_df.filter(like='_prototype_')

if raw_bool_cli == 1:
if raw_bool_cli:
squeaky_clean_df = clean_df
else:
squeaky_clean_df = pd.concat([out_df, rxn_df,
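The column selection in cleaner() relies on pandas' substring filtering; a small sketch with hypothetical column names:

import pandas as pd

clean_df = pd.DataFrame([[45.0, 'jdoe', 0.5]],
                        columns=['_rxn_temperatureC', '_raw_operator', '_calc_ratio'])
rxn_df = clean_df.filter(like='_rxn_')    # keeps columns whose name contains '_rxn_'
calc_df = clean_df.filter(like='_calc_')  # keeps ['_calc_ratio']
print(list(rxn_df.columns), list(calc_df.columns))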
63 changes: 46 additions & 17 deletions expworkup/jsonparser.py
@@ -20,7 +20,7 @@
modlog = logging.getLogger(f'mainlog.{__name__}')
warnlog = logging.getLogger(f'warning.{__name__}')

def renamer(dirty_df, dataset_list):
def renamer(dirty_df, dataset_list, raw_bool_cli):
"""Eats dirty datasets and renames them according to dataset_rename.json
Imports the user level json file and uses the dictionary to
@@ -38,6 +38,10 @@ def renamer(dirty_df, dataset_list):
dataset_rename.json is used to determine if a report_df
qualifies for a rename set. If all datasets in dataset_list
are not in a dataset_rename, this process will return dirty_df.
raw_bool_cli : cli argument,
if True, the extended dataframe is returned, including the
superfluous columns used in data handling
Returns
--------
@@ -47,21 +51,45 @@
that doesn't exist in dataset_rename.json, just make that combination
and all the selected renames.
"""
try:
with open("dataset_rename.json", "r") as read_file:
rename_dict = json.load(read_file)
except FileNotFoundError:
modlog.error("dataset_rename.json was not found, please redownload from ESCALATE_report")
import sys
sys.exit()

for key, name_dict in rename_dict.items():
if 'group' in key:
if all(elem in name_dict['datasets'] for elem in dataset_list):
for old_name, new_name in name_dict['columns'].items():
dirty_df.rename(columns={old_name : new_name}, inplace=True)

if not raw_bool_cli:
try:
with open("dataset_rename.json", "r") as read_file:
rename_dict = json.load(read_file)
except FileNotFoundError:
modlog.error("dataset_rename.json was not found, please redownload from ESCALATE_report")
import sys
sys.exit()

for key, name_dict in rename_dict.items():
if 'group' in key:
if all(elem in name_dict['datasets'] for elem in dataset_list):
for old_name, new_name in name_dict['columns'].items():
dirty_df.rename(columns={old_name : new_name}, inplace=True)
else:
modlog.info('Renaming was turned off for this run; columns will not all follow the naming scheme')
warnlog.info('Renaming was turned off for this run; columns will not all follow the naming scheme')

# alert the user to columns which do not fit the orderly naming scheme
nonconformist_columns = []
# runid_vial is protected for renaming downstream
expected_prefixes = ['_rxn_', '_out_', '_calc_', '_feat_', '_raw_', '_prototype_', 'runid_vial']
for x in dirty_df.columns:
if not any(y in x for y in expected_prefixes):
nonconformist_columns.append(x)

unnamed_export_file = 'UNAMED_REPORT_COLUMNS.txt'
# Remove any previous copy so stale output isn't confusing
if os.path.exists(unnamed_export_file):
os.remove(unnamed_export_file)
if len(nonconformist_columns) > 0:
modlog.info(f'Columns not fitting the naming scheme were written to: {unnamed_export_file}')
print(f'Columns not fitting the naming scheme were written to: {unnamed_export_file}')
print(' The USER can define the column names in dataset_rename.json')
with open(unnamed_export_file, 'w') as my_file:
for x in nonconformist_columns:
print(x, file=my_file)
clean_df = dirty_df

return(clean_df)

def unpackJSON(target_naming_scheme, chemdf_dict):
@@ -150,12 +178,13 @@ def json_pipeline(target_naming_scheme, raw_bool_cli, chemdf_dict, dataset_list)
modlog.info('%s loaded with JSONs for parsing, starting' %target_naming_scheme)

raw_df = unpackJSON(target_naming_scheme, chemdf_dict)
renamed_raw_df = renamer(raw_df, dataset_list)
renamed_raw_df = renamer(raw_df, dataset_list, raw_bool_cli)
report_df = cleaner(renamed_raw_df, raw_bool_cli)
report_df['name'] = raw_df['runid_vial']

report_df['name'] = raw_df['runid_vial']
report_df.replace('null', np.nan, inplace=True)
report_df.replace('', np.nan, inplace=True)
report_df.replace(' ', np.nan, inplace=True)

return(report_df)
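
The trailing replace calls above normalize the three sentinel tokens to NaN; a toy illustration:

import numpy as np
import pandas as pd

report_df = pd.DataFrame({'value': ['null', '', ' ', '0.5']})
for token in ('null', '', ' '):
    report_df.replace(token, np.nan, inplace=True)
print(report_df['value'].isna().sum())  # 3 -- only the real reading survives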

24 changes: 14 additions & 10 deletions runme.py
@@ -24,7 +24,7 @@
from utils import globals
from utils.globals import (
set_target_folder_name, set_log_folder, set_offline_folder,
get_target_folder, get_log_folder, get_offline_folder
set_debug_simple, get_target_folder, get_log_folder, get_offline_folder
)

__version__ = 1.1 #should match latest HISTORY.md entry
@@ -158,6 +158,7 @@ def main_pipeline(args):
initialize(args)
dataset_list = args.d
offline_toggle = args.offline
set_debug_simple(args.debugsimple)
raw_bool = args.raw
#Load logging information
set_log_folder(f'{args.local_directory}/logging') # folder for logs
Expand Down Expand Up @@ -201,7 +202,7 @@ def main_pipeline(args):
if args.debug:
# Export dataframes of initial parsing and chemical inventories for ETL to ESCALATEV3
report_csv_filename = f'REPORT_{get_target_folder().upper()}.csv'
write_debug_file(report_df, report_csv_filename)
write_debug_file(report_df, report_csv_filename, write_index=True) # keep write_index=True to generate the files used for testing
for name, chemicaldf in chemdf_dict.items():
inventory_name = f'REPORT_{name.upper()}_INVENTORY.csv'
write_debug_file(chemicaldf, inventory_name)
@@ -290,9 +291,18 @@ def parse_args(args):
||default = 4-Data-Iodides||" %possible_targets)
parser.add_argument('--raw', type=bool, default=False, choices=[True, False],
help='final dataframe is printed with all raw values\
included ||default = 1||')
included ||default = False||')
parser.add_argument('--disablecalcs', type=bool, default=False, choices=[True, False],
help='if true, disables escalate calculations (calc_command.json)')
help='if True, disables escalate calculations (calc_command.json) ||default = False||')
parser.add_argument('--debug', type=bool, default=False, choices=[True, False],
help="exports all dataframe intermediates as 'REPORT_'-prefixed\
csv files with default names")
parser.add_argument('--debugsimple', type=bool, default=False, choices=[True, False],
help="removes the header and footer from the 'REPORT_' \
csvfiles exported with the --debug option. If debug is false, this won't do anything.")
parser.add_argument('--offline', type=int, default=0, choices=[0,1,2],
help="|| Default = 0 || First iteration, set to '1' to save files locally \
second iteration, set to '2' to load local files and continue")
parser.add_argument('--verdata', type=str,
help='Enter numerical value such as "0001". Generates <0001>.perovskitedata.csv output\
in a form ready for upload to the versioned data repo ||default = None||')
@@ -302,12 +312,6 @@
parser.add_argument('--state', type=str,
help='title of the state set file to be used as the state set for \
this iteration of the challenge problem; no entry results in no processing')
parser.add_argument('--debug', type=bool, default=False, choices=[True, False],
help="exports all dataframe intermediates as 'REPORT_'-prefixed\
csv files with default names")
parser.add_argument('--offline', type=int, default=0, choices=[0,1,2],
help="|| Default = 0 || On the first iteration, set to '1' to save files locally; \
on the second iteration, set to '2' to load the local files and continue")
return parser.parse_args(args)


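One caveat about the boolean flags above: argparse applies type=bool to the raw string token, and bool() of any non-empty string is True, so even '--raw False' parses as True; only omitting the flag yields the False default. A minimal demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--raw', type=bool, default=False)
print(parser.parse_args(['--raw', 'False']).raw)  # True -- bool('False') is truthy
print(parser.parse_args([]).raw)                  # False -- only the default is False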
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -17,7 +17,7 @@

global TEST_TARGET
global ARGS_LIST
TEST_TARGET = 'tests/devreport_20200511.csv'
TEST_TARGET = 'tests/devreport_20200521.csv'
ARGS_LIST = ['testing', '-d', 'dev', '--raw', '1']

@pytest.fixture(scope='module')
751 changes: 0 additions & 751 deletions tests/devreport_20200511.csv

This file was deleted.

13 changes: 8 additions & 5 deletions utils/file_handling.py
@@ -3,7 +3,7 @@
import pandas as pd

from expworkup.devconfig import valid_input_files, workup_targets, lab_vars
from utils.globals import get_debug_header
from utils.globals import get_debug_header, get_debug_simple

def get_interface_filename(interface_type, working_directory, runID):
""" Searches for filename match and returns instance
@@ -51,13 +51,16 @@ def get_experimental_run_lab(run_filename):

raise RuntimeError(f'{run_filename} does not specify a supported lab')

def write_debug_file(df, filename):
def write_debug_file(df, filename, write_index=True):
if os.path.isfile(filename):
os.remove(filename)
f = open(filename, 'a')
f.write(get_debug_header())
df.to_csv(f)
f.write(get_debug_header())
if not get_debug_simple():
f.write(get_debug_header())
df.to_csv(f, index=write_index)
f.write(get_debug_header())
else:
df.to_csv(f, index=write_index)
f.close()


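A standalone sketch of the two output modes of write_debug_file, with a placeholder string standing in for get_debug_header() and a hardcoded flag standing in for get_debug_simple():

import pandas as pd

df = pd.DataFrame({'name': ['runid_vial_example'], '_rxn_temperatureC': [45.0]})
header = 'debug header placeholder\n'  # stand-in for get_debug_header()
debug_simple = False                   # stand-in for get_debug_simple()

with open('REPORT_EXAMPLE.csv', 'w') as f:
    if not debug_simple:
        f.write(header)           # header/footer sandwich for human inspection
        df.to_csv(f, index=True)
        f.write(header)
    else:
        df.to_csv(f, index=True)  # bare csv, easier to load back in tests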
16 changes: 16 additions & 0 deletions utils/globals.py
@@ -23,6 +23,8 @@
LOG_DIRECTORY_SET = False
OFFLINE_FOLDER = None
OFFLINE_FOLDER_SET = False
DEBUG_SIMPLE = None
DEBUG_SIMPLE_SET = False

_DEBUG_HEADER = None
_DEBUG_SET = False
@@ -132,3 +134,17 @@ def get_offline_folder():
sys.exit(1)
return OFFLINE_FOLDER

def set_debug_simple(debug_simple_arg):
global DEBUG_SIMPLE, DEBUG_SIMPLE_SET
if DEBUG_SIMPLE_SET:
modlog.error('dev tried to set simple debug more than once!')
sys.exit(1)
DEBUG_SIMPLE = debug_simple_arg
DEBUG_SIMPLE_SET = True

def get_debug_simple():
if DEBUG_SIMPLE is None:
modlog.error('get_debug_simple called before set_debug_simple')
sys.exit(1)
return DEBUG_SIMPLE
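
Usage follows the same set-once/get pattern as the other globals in this module (a sketch; in practice main_pipeline performs the set):

from utils.globals import set_debug_simple, get_debug_simple

set_debug_simple(True)       # called once, early in main_pipeline
assert get_debug_simple()    # readable anywhere afterwards
# set_debug_simple(False)    # a second call would log an error and sys.exit(1)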
