From 733e7320dbb90991ad9dd0fb214ef845669e95b0 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Tue, 11 Apr 2023 23:34:00 -0400 Subject: [PATCH 1/8] Implement check that all unique categorical column values are found in the data dictionary - removed asst of test runner output to a variable for tests that don't use the result object --- bagel/pheno_utils.py | 32 +++++++++++++ bagel/tests/data/example9.json | 86 ++++++++++++++++++++++++++++++++++ bagel/tests/data/example9.tsv | 9 ++++ bagel/tests/test_cli_pheno.py | 12 ++++- 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 bagel/tests/data/example9.json create mode 100644 bagel/tests/data/example9.tsv diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index 764c90ef..c8ed1b98 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -191,6 +191,28 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool: return all([key in pheno_df.columns for key in data_dict.keys()]) +def find_undefined_categorical_column_values( + data_dict: dict, pheno_df: pd.DataFrame +) -> dict: + """ + Returns a dictionary containing any categorical column names and specific column values not defined + in the corresponding data dictionary entry. + """ + all_undefined_values = {} + for col in data_dict.keys(): + if is_column_categorical(col, data_dict): + known_values = list(data_dict[col]["Levels"].keys()) + data_dict[ + col + ]["Annotations"].get("MissingValues", []) + unknown_values = set(pheno_df[col].unique()).difference( + known_values + ) + if unknown_values: + all_undefined_values[col] = unknown_values + + return all_undefined_values + + def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: """Determines whether input data are valid""" try: @@ -230,3 +252,13 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: "dictionary has to have a corresponding column with the same name in the " "phenotypic file" ) + + unknown_categorical_col_values = find_undefined_categorical_column_values( + data_dict, pheno_df + ) + if unknown_categorical_col_values: + raise LookupError( + "Categorical column(s) in the phenotypic .tsv have values not found in the provided data dictionary " + f"(shown as : {{}}): {unknown_categorical_col_values}. " + "Please check that the correct data dictionary has been selected." + ) diff --git a/bagel/tests/data/example9.json b/bagel/tests/data/example9.json new file mode 100644 index 00000000..5ab22598 --- /dev/null +++ b/bagel/tests/data/example9.json @@ -0,0 +1,86 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "bg:ParticipantID", + "Label": "Unique participant identifier" + } + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "bg:SessionID", + "Label": "Unique session identifier" + } + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "bg:diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "purl:NCIT_C94342", + "Label": "Healthy Control" + } + }, + "MissingValues": ["OTHER"] + } + }, + "tool_item1": { + "Description": "item 1 scores for an imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "bg:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogAtlas:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing"] + } + }, + "tool_item2": { + "Description": "item 2 scores for an imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "bg:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogAtlas:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing"] + } + }, + "other_tool_item1": { + "Description": "item 1 scores for a different imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "bg:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogAtlas:4321", + "Label": "A different imaginary tool" + }, + "MissingValues": ["none"] + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example9.tsv b/bagel/tests/data/example9.tsv new file mode 100644 index 00000000..10453df2 --- /dev/null +++ b/bagel/tests/data/example9.tsv @@ -0,0 +1,9 @@ +participant_id session_id group tool_item1 tool_item2 other_tool_item1 +sub-01 ses-01 PAT 11.0 "missing" "none" +sub-01 ses-02 PAT "missing" 12.0 "none" +sub-02 ses-01 OTHER "missing" "missing" "none" +sub-02 ses-02 OTHER "missing" "missing" "none" +sub-03 ses-01 CTRL 10.0 8.0 "ok" +sub-03 ses-02 CTRL 10.0 8.0 "bad" +sub-04 ses-01 SIB 12.0 9.0 "ok" +sub-04 ses-02 SIB 12.0 9.0 "bad" \ No newline at end of file diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 0d907e5e..19219b74 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -41,6 +41,11 @@ def test_pheno_valid_inputs_run_successfully( ), ("example7", LookupError, "not compatible"), ("example8", ValueError, "more than one column"), + ( + "example9", + LookupError, + "values not found in the provided data dictionary (shown as : {}): {'group': {'SIB'}}", + ), ], ) def test_invalid_inputs_are_handled_gracefully( @@ -116,7 +121,10 @@ def test_diagnosis_and_control_status_handled( ) assert "diagnosis" not in pheno["hasSamples"][1].keys() assert "diagnosis" not in pheno["hasSamples"][2].keys() - assert pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "purl:NCIT_C94342" + assert ( + pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] + == "purl:NCIT_C94342" + ) @pytest.mark.parametrize( @@ -125,7 +133,7 @@ def test_diagnosis_and_control_status_handled( def test_controlled_terms_have_identifiers( attribute, runner, test_data, tmp_path, load_test_json ): - result = runner.invoke( + runner.invoke( bagel, [ "pheno", From b2eba70df582ced44bf4fd56f2976b4d61503113 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 12 Apr 2023 03:48:55 -0400 Subject: [PATCH 2/8] Implement check for when annotated missing values are not found in the phenotypic file --- bagel/pheno_utils.py | 30 ++++++++++++ bagel/tests/data/example10.json | 86 +++++++++++++++++++++++++++++++++ bagel/tests/data/example10.tsv | 7 +++ bagel/tests/test_cli_pheno.py | 33 +++++++++++++ 4 files changed, 156 insertions(+) create mode 100644 bagel/tests/data/example10.json create mode 100644 bagel/tests/data/example10.tsv diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index c8ed1b98..b6ef58f3 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -1,3 +1,4 @@ +import warnings from collections import defaultdict from typing import Union @@ -213,6 +214,26 @@ def find_undefined_categorical_column_values( return all_undefined_values +def find_unused_missing_values( + data_dict: dict, pheno_df: pd.DataFrame +) -> dict: + """ + Checks if missing values annotated in the data dictionary appear at least once in the phenotypic file. + Returns a dictionary containing any column names and annotated missing values not found in the phenotypic + file column. + """ + all_unused_missing_vals = {} + for col, attr in data_dict.items(): + unused_missing_vals = [] + for missing_val in attr["Annotations"].get("MissingValues", []): + if missing_val not in pheno_df[col].unique(): + unused_missing_vals.append(missing_val) + if unused_missing_vals: + all_unused_missing_vals[col] = unused_missing_vals + + return all_unused_missing_vals + + def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: """Determines whether input data are valid""" try: @@ -262,3 +283,12 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: f"(shown as : {{}}): {unknown_categorical_col_values}. " "Please check that the correct data dictionary has been selected." ) + + unused_missing_values = find_unused_missing_values(data_dict, pheno_df) + if unused_missing_values: + warnings.warn( + "The following values annotated as missing values in the data dictionary were not found " + "in the corresponding phenotypic file column(s) (: []): " + f"{unused_missing_values}. If this is not intentional, please check your data dictionary " + "and phenotypic file." + ) diff --git a/bagel/tests/data/example10.json b/bagel/tests/data/example10.json new file mode 100644 index 00000000..b07e8f0d --- /dev/null +++ b/bagel/tests/data/example10.json @@ -0,0 +1,86 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "bg:ParticipantID", + "Label": "Unique participant identifier" + } + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "bg:SessionID", + "Label": "Unique session identifier" + } + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "bg:diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "purl:NCIT_C94342", + "Label": "Healthy Control" + } + }, + "MissingValues": ["OTHER", "MISSING"] + } + }, + "tool_item1": { + "Description": "item 1 scores for an imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "bg:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogAtlas:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing", "none", ""] + } + }, + "tool_item2": { + "Description": "item 2 scores for an imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "bg:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogAtlas:1234", + "Label": "Imaginary tool" + }, + "MissingValues": ["missing", "none", ""] + } + }, + "other_tool_item1": { + "Description": "item 1 scores for a different imaginary tool", + "Annotations": { + "IsAbout": { + "TermURL": "bg:Assessment", + "Label": "Assessment tool" + }, + "IsPartOf": { + "TermURL": "cogAtlas:4321", + "Label": "A different imaginary tool" + }, + "MissingValues": ["none"] + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example10.tsv b/bagel/tests/data/example10.tsv new file mode 100644 index 00000000..5bf84f13 --- /dev/null +++ b/bagel/tests/data/example10.tsv @@ -0,0 +1,7 @@ +participant_id session_id group tool_item1 tool_item2 other_tool_item1 +sub-01 ses-01 PAT 11.0 "missing" "none" +sub-01 ses-02 PAT "missing" 12.0 "none" +sub-02 ses-01 OTHER "missing" "missing" "none" +sub-02 ses-02 OTHER "missing" "missing" "none" +sub-03 ses-01 CTRL 10.0 8.0 "ok" +sub-03 ses-02 CTRL 10.0 8.0 "bad" diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 19219b74..6eb2a3c6 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -72,6 +72,39 @@ def test_invalid_inputs_are_handled_gracefully( assert expected_message in str(e.value) +def test_unused_missing_values_raises_warning( + runner, + test_data, + tmp_path, +): + """ + Tests that an informative warning is raised when annotated missing values are not found in the + phenotypic file. + """ + with pytest.warns(UserWarning) as w: + runner.invoke( + bagel, + [ + "pheno", + "--pheno", + test_data / "example10.tsv", + "--dictionary", + test_data / "example10.json", + "--output", + tmp_path, + "--name", + "testing dataset", + ], + catch_exceptions=False, + ) + + assert len(w) == 1 + assert ( + "missing values in the data dictionary were not found in the corresponding phenotypic file column(s) " + "(: []): {'group': ['MISSING'], 'tool_item1': ['none', ''], 'tool_item2': ['none', '']}" + ) in str(w[0].message.args[0]) + + def test_that_output_file_contains_name( runner, test_data, tmp_path, load_test_json ): From 9e01e5733e14d977fe5832b3be187154bff14548 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 12 Apr 2023 03:55:21 -0400 Subject: [PATCH 3/8] fix typos, update docstrings/naming conventions for clarity --- bagel/pheno_utils.py | 17 +++++++++-------- bagel/tests/test_cli_pheno.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index b6ef58f3..ebf14064 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -196,8 +196,9 @@ def find_undefined_categorical_column_values( data_dict: dict, pheno_df: pd.DataFrame ) -> dict: """ - Returns a dictionary containing any categorical column names and specific column values not defined - in the corresponding data dictionary entry. + Checks that all categorical column values have annotations. Returns a dictionary containing + any categorical column names and specific column values not defined in the corresponding data + dictionary entry. """ all_undefined_values = {} for col in data_dict.keys(): @@ -269,18 +270,18 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: raise LookupError( "The provided data dictionary and phenotypic file are individually valid, " "but are not compatible. Make sure that you selected the correct data " - "dictionary for your phenotyic file. Every column described in the data " + "dictionary for your phenotypic file. Every column described in the data " "dictionary has to have a corresponding column with the same name in the " "phenotypic file" ) - unknown_categorical_col_values = find_undefined_categorical_column_values( - data_dict, pheno_df + undefined_categorical_col_values = ( + find_undefined_categorical_column_values(data_dict, pheno_df) ) - if unknown_categorical_col_values: + if undefined_categorical_col_values: raise LookupError( - "Categorical column(s) in the phenotypic .tsv have values not found in the provided data dictionary " - f"(shown as : {{}}): {unknown_categorical_col_values}. " + "Categorical column(s) in the phenotypic file have values not found in the data dictionary " + f"(shown as : {{}}): {undefined_categorical_col_values}. " "Please check that the correct data dictionary has been selected." ) diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 6eb2a3c6..3d97e693 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -44,7 +44,7 @@ def test_pheno_valid_inputs_run_successfully( ( "example9", LookupError, - "values not found in the provided data dictionary (shown as : {}): {'group': {'SIB'}}", + "values not found in the data dictionary (shown as : {}): {'group': {'SIB'}}", ), ], ) From 8d6da45f9dd22fb6f6a31a7fe5a784bbdef82331 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 12 Apr 2023 04:14:10 -0400 Subject: [PATCH 4/8] refactor find_undefined_categorical_column_values --- bagel/pheno_utils.py | 22 ++++++++++++++-------- bagel/tests/test_cli_pheno.py | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index ebf14064..b219950b 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -201,14 +201,20 @@ def find_undefined_categorical_column_values( dictionary entry. """ all_undefined_values = {} - for col in data_dict.keys(): + for col, attr in data_dict.items(): if is_column_categorical(col, data_dict): - known_values = list(data_dict[col]["Levels"].keys()) + data_dict[ - col - ]["Annotations"].get("MissingValues", []) - unknown_values = set(pheno_df[col].unique()).difference( - known_values - ) + known_values = list(attr["Levels"].keys()) + attr[ + "Annotations" + ].get("MissingValues", []) + # NOTE: (also applies to find_unused_missing_values) The below comparison block could also be + # accomplished using difference of sets, however due to the unordered nature of the resultant set + # of unknown values, the order of specific values may be different than they appear in the input, + # leading to unexpected failed assertions / hard-to-formulate expected user messages in testing. + # To keep things simple, loops and basic conditionals are used here instead. + unknown_values = [] + for value in pheno_df[col].unique(): + if value not in known_values: + unknown_values.append(value) if unknown_values: all_undefined_values[col] = unknown_values @@ -281,7 +287,7 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: if undefined_categorical_col_values: raise LookupError( "Categorical column(s) in the phenotypic file have values not found in the data dictionary " - f"(shown as : {{}}): {undefined_categorical_col_values}. " + f"(shown as : []): {undefined_categorical_col_values}. " "Please check that the correct data dictionary has been selected." ) diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 3d97e693..a0734591 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -44,7 +44,7 @@ def test_pheno_valid_inputs_run_successfully( ( "example9", LookupError, - "values not found in the data dictionary (shown as : {}): {'group': {'SIB'}}", + "values not found in the data dictionary (shown as : []): {'group': ['SIB']}", ), ], ) From 72676659cb131ba4b129ab6b0e51e9da3038650f Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 12 Apr 2023 04:19:03 -0400 Subject: [PATCH 5/8] update test data README --- bagel/tests/data/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index b17ed18a..5d27db95 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -13,5 +13,7 @@ Example inputs to the CLI | invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail | | 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail | | 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* | +| 9 | valid, same as example 6 | invalid, based on example 6 but contains an unannotated value for `group` | fail | +| 10 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | valid, same as example 6 | pass, with warning | `* this is expected to fail until we enable multiple participant_ID handling`. \ No newline at end of file From 1584e1657c4bb8a02c9a3d998e6f71fd5d842564 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Wed, 12 Apr 2023 17:42:22 -0400 Subject: [PATCH 6/8] Apply suggestions for user messages from code review Co-authored-by: Sebastian Urchs --- bagel/pheno_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index b219950b..22e0ee15 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -286,9 +286,9 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: ) if undefined_categorical_col_values: raise LookupError( - "Categorical column(s) in the phenotypic file have values not found in the data dictionary " + "Categorical column(s) in the phenotypic file have values not annotated in the data dictionary " f"(shown as : []): {undefined_categorical_col_values}. " - "Please check that the correct data dictionary has been selected." + "Please check that the correct data dictionary has been selected or make sure to annotate the missing values." ) unused_missing_values = find_unused_missing_values(data_dict, pheno_df) From 8aecbd54629ac8eb92dca5e6f631f63fa8a27de1 Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Thu, 13 Apr 2023 02:11:12 -0400 Subject: [PATCH 7/8] update tests, remove unnecessary comment --- bagel/pheno_utils.py | 5 ----- bagel/tests/data/README.md | 4 ++-- bagel/tests/data/example10.json | 6 +++--- bagel/tests/data/example9.tsv | 4 ++-- bagel/tests/test_cli_pheno.py | 27 +++++++++++++++++---------- 5 files changed, 24 insertions(+), 22 deletions(-) diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index 22e0ee15..bccffa76 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -206,11 +206,6 @@ def find_undefined_categorical_column_values( known_values = list(attr["Levels"].keys()) + attr[ "Annotations" ].get("MissingValues", []) - # NOTE: (also applies to find_unused_missing_values) The below comparison block could also be - # accomplished using difference of sets, however due to the unordered nature of the resultant set - # of unknown values, the order of specific values may be different than they appear in the input, - # leading to unexpected failed assertions / hard-to-formulate expected user messages in testing. - # To keep things simple, loops and basic conditionals are used here instead. unknown_values = [] for value in pheno_df[col].unique(): if value not in known_values: diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index 5d27db95..f731ae67 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -13,7 +13,7 @@ Example inputs to the CLI | invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail | | 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail | | 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* | -| 9 | valid, same as example 6 | invalid, based on example 6 but contains an unannotated value for `group` | fail | -| 10 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | valid, same as example 6 | pass, with warning | +| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail | +| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning | `* this is expected to fail until we enable multiple participant_ID handling`. \ No newline at end of file diff --git a/bagel/tests/data/example10.json b/bagel/tests/data/example10.json index b07e8f0d..62a1b7cb 100644 --- a/bagel/tests/data/example10.json +++ b/bagel/tests/data/example10.json @@ -38,7 +38,7 @@ "Label": "Healthy Control" } }, - "MissingValues": ["OTHER", "MISSING"] + "MissingValues": ["OTHER", "NOT IN TSV"] } }, "tool_item1": { @@ -52,7 +52,7 @@ "TermURL": "cogAtlas:1234", "Label": "Imaginary tool" }, - "MissingValues": ["missing", "none", ""] + "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"] } }, "tool_item2": { @@ -66,7 +66,7 @@ "TermURL": "cogAtlas:1234", "Label": "Imaginary tool" }, - "MissingValues": ["missing", "none", ""] + "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"] } }, "other_tool_item1": { diff --git a/bagel/tests/data/example9.tsv b/bagel/tests/data/example9.tsv index 10453df2..1ce1ecbe 100644 --- a/bagel/tests/data/example9.tsv +++ b/bagel/tests/data/example9.tsv @@ -5,5 +5,5 @@ sub-02 ses-01 OTHER "missing" "missing" "none" sub-02 ses-02 OTHER "missing" "missing" "none" sub-03 ses-01 CTRL 10.0 8.0 "ok" sub-03 ses-02 CTRL 10.0 8.0 "bad" -sub-04 ses-01 SIB 12.0 9.0 "ok" -sub-04 ses-02 SIB 12.0 9.0 "bad" \ No newline at end of file +sub-04 ses-01 UNANNOTATED 12.0 9.0 "ok" +sub-04 ses-02 UNANNOTATED 12.0 9.0 "bad" \ No newline at end of file diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index a0734591..2ade0590 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -33,18 +33,21 @@ def test_pheno_valid_inputs_run_successfully( @pytest.mark.parametrize( "example,expected_exception,expected_message", [ - ("example3", ValueError, "not a valid Neurobagel data dictionary"), + ("example3", ValueError, ["not a valid Neurobagel data dictionary"]), ( "example_invalid", ValueError, - "not a valid Neurobagel data dictionary", + ["not a valid Neurobagel data dictionary"], ), - ("example7", LookupError, "not compatible"), - ("example8", ValueError, "more than one column"), + ("example7", LookupError, ["not compatible"]), + ("example8", ValueError, ["more than one column"]), ( "example9", LookupError, - "values not found in the data dictionary (shown as : []): {'group': ['SIB']}", + [ + "values not annotated in the data dictionary", + "'group': ['UNANNOTATED']", + ], ), ], ) @@ -69,7 +72,8 @@ def test_invalid_inputs_are_handled_gracefully( catch_exceptions=False, ) - assert expected_message in str(e.value) + for substring in expected_message: + assert substring in str(e.value) def test_unused_missing_values_raises_warning( @@ -99,10 +103,13 @@ def test_unused_missing_values_raises_warning( ) assert len(w) == 1 - assert ( - "missing values in the data dictionary were not found in the corresponding phenotypic file column(s) " - "(: []): {'group': ['MISSING'], 'tool_item1': ['none', ''], 'tool_item2': ['none', '']}" - ) in str(w[0].message.args[0]) + for warn_substring in [ + "missing values in the data dictionary were not found", + "'group': ['NOT IN TSV']", + "'tool_item1': ['NOT IN TSV 1', 'NOT IN TSV 2']", + "'tool_item2': ['NOT IN TSV 1', 'NOT IN TSV 2']", + ]: + assert warn_substring in str(w[0].message.args[0]) def test_that_output_file_contains_name( From ab590be8bb80d695be052ad0e8984e0d045b07bf Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Thu, 13 Apr 2023 02:22:20 -0400 Subject: [PATCH 8/8] simplify function/variable names --- bagel/pheno_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index bccffa76..6721cb9a 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -192,7 +192,7 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool: return all([key in pheno_df.columns for key in data_dict.keys()]) -def find_undefined_categorical_column_values( +def find_undefined_cat_col_values( data_dict: dict, pheno_df: pd.DataFrame ) -> dict: """ @@ -276,13 +276,13 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None: "phenotypic file" ) - undefined_categorical_col_values = ( - find_undefined_categorical_column_values(data_dict, pheno_df) + undefined_cat_col_values = find_undefined_cat_col_values( + data_dict, pheno_df ) - if undefined_categorical_col_values: + if undefined_cat_col_values: raise LookupError( "Categorical column(s) in the phenotypic file have values not annotated in the data dictionary " - f"(shown as : []): {undefined_categorical_col_values}. " + f"(shown as : []): {undefined_cat_col_values}. " "Please check that the correct data dictionary has been selected or make sure to annotate the missing values." )