Skip to content

Commit

Permalink
Merge pull request #115 from neurobagel/feat-109/discrete-values-check
Browse files Browse the repository at this point in the history
Implement checks for unannotated categorical column values + unused annotated missing values
  • Loading branch information
alyssadai authored Apr 13, 2023
2 parents 395a030 + ab590be commit d5762fc
Show file tree
Hide file tree
Showing 7 changed files with 310 additions and 8 deletions.
66 changes: 65 additions & 1 deletion bagel/pheno_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from collections import defaultdict
from typing import Union

Expand Down Expand Up @@ -191,6 +192,50 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
return all([key in pheno_df.columns for key in data_dict.keys()])


def find_undefined_cat_col_values(
    data_dict: dict, pheno_df: pd.DataFrame
) -> dict:
    """
    Collect categorical column values that the data dictionary does not annotate.

    For each data dictionary column that `is_column_categorical` reports as categorical,
    the unique values of the same-named phenotypic column are compared against the keys
    of the column's "Levels" plus any values listed under "Annotations" -> "MissingValues".

    Returns a dictionary mapping each affected column name to the list of values found in
    the phenotypic file but not in the data dictionary entry. Columns with no such values
    are omitted, so an empty dictionary means every categorical value is annotated.
    """
    undefined_by_column = {}
    for column_name, column_entry in data_dict.items():
        # Only categorical columns have a closed set of allowed values to check against.
        if not is_column_categorical(column_name, data_dict):
            continue

        annotated_missing = column_entry["Annotations"].get("MissingValues", [])
        allowed_values = list(column_entry["Levels"].keys()) + annotated_missing

        not_annotated = [
            observed
            for observed in pheno_df[column_name].unique()
            if observed not in allowed_values
        ]
        if not_annotated:
            undefined_by_column[column_name] = not_annotated

    return undefined_by_column


def find_unused_missing_values(
    data_dict: dict, pheno_df: pd.DataFrame
) -> dict:
    """
    Checks if missing values annotated in the data dictionary appear at least once in the phenotypic file.

    For every column described in the data dictionary, each value listed under
    "Annotations" -> "MissingValues" is looked up among the unique values of the same-named
    column in the phenotypic file.

    Parameters
    ----------
    data_dict : dict
        Data dictionary mapping column names to their entries. Each entry must have an
        "Annotations" key; "MissingValues" inside it is optional.
    pheno_df : pd.DataFrame
        Phenotypic table. It must contain a column for every data dictionary column that
        annotates at least one missing value.

    Returns
    -------
    dict
        Maps column names to the list of annotated missing values not found in that
        column, in the order they are annotated. Columns whose annotated missing values
        all occur (or that annotate none) are omitted.

    Raises
    ------
    KeyError
        If an entry has no "Annotations" key, or if a column that annotates missing
        values is absent from the phenotypic file.
    """
    all_unused_missing_vals = {}
    for col, attr in data_dict.items():
        missing_vals = attr["Annotations"].get("MissingValues", [])
        if not missing_vals:
            # Nothing annotated for this column, so the phenotypic column is not needed.
            continue

        # Compute the observed values once per column (not once per annotated value) and
        # use a set so each lookup is a plain hash check rather than an element-wise
        # array comparison.
        observed_values = set(pheno_df[col].unique())
        unused_missing_vals = [
            missing_val
            for missing_val in missing_vals
            if missing_val not in observed_values
        ]
        if unused_missing_vals:
            all_unused_missing_vals[col] = unused_missing_vals

    return all_unused_missing_vals


def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
"""Determines whether input data are valid"""
try:
Expand Down Expand Up @@ -226,7 +271,26 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
raise LookupError(
"The provided data dictionary and phenotypic file are individually valid, "
"but are not compatible. Make sure that you selected the correct data "
"dictionary for your phenotyic file. Every column described in the data "
"dictionary for your phenotypic file. Every column described in the data "
"dictionary has to have a corresponding column with the same name in the "
"phenotypic file"
)

undefined_cat_col_values = find_undefined_cat_col_values(
data_dict, pheno_df
)
if undefined_cat_col_values:
raise LookupError(
"Categorical column(s) in the phenotypic file have values not annotated in the data dictionary "
f"(shown as <column_name>: [<undefined values>]): {undefined_cat_col_values}. "
"Please check that the correct data dictionary has been selected or make sure to annotate the missing values."
)

unused_missing_values = find_unused_missing_values(data_dict, pheno_df)
if unused_missing_values:
warnings.warn(
"The following values annotated as missing values in the data dictionary were not found "
"in the corresponding phenotypic file column(s) (<column_name>: [<unused missing values>]): "
f"{unused_missing_values}. If this is not intentional, please check your data dictionary "
"and phenotypic file."
)
2 changes: 2 additions & 0 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,7 @@ Example inputs to the CLI
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 but has multiple participant_id columns | valid, based on ex2 but has multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |

`* this is expected to fail until we enable multiple participant_ID handling`.
86 changes: 86 additions & 0 deletions bagel/tests/data/example10.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "bg:diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
},
"MissingValues": ["OTHER", "NOT IN TSV"]
}
},
"tool_item1": {
"Description": "item 1 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
}
},
"tool_item2": {
"Description": "item 2 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
}
},
"other_tool_item1": {
"Description": "item 1 scores for a different imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
}
}
}
7 changes: 7 additions & 0 deletions bagel/tests/data/example10.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
participant_id session_id group tool_item1 tool_item2 other_tool_item1
sub-01 ses-01 PAT 11.0 "missing" "none"
sub-01 ses-02 PAT "missing" 12.0 "none"
sub-02 ses-01 OTHER "missing" "missing" "none"
sub-02 ses-02 OTHER "missing" "missing" "none"
sub-03 ses-01 CTRL 10.0 8.0 "ok"
sub-03 ses-02 CTRL 10.0 8.0 "bad"
86 changes: 86 additions & 0 deletions bagel/tests/data/example9.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "bg:diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
},
"MissingValues": ["OTHER"]
}
},
"tool_item1": {
"Description": "item 1 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"tool_item2": {
"Description": "item 2 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"other_tool_item1": {
"Description": "item 1 scores for a different imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
}
}
}
9 changes: 9 additions & 0 deletions bagel/tests/data/example9.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
participant_id session_id group tool_item1 tool_item2 other_tool_item1
sub-01 ses-01 PAT 11.0 "missing" "none"
sub-01 ses-02 PAT "missing" 12.0 "none"
sub-02 ses-01 OTHER "missing" "missing" "none"
sub-02 ses-02 OTHER "missing" "missing" "none"
sub-03 ses-01 CTRL 10.0 8.0 "ok"
sub-03 ses-02 CTRL 10.0 8.0 "bad"
sub-04 ses-01 UNANNOTATED 12.0 9.0 "ok"
sub-04 ses-02 UNANNOTATED 12.0 9.0 "bad"
62 changes: 55 additions & 7 deletions bagel/tests/test_cli_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,22 @@ def test_pheno_valid_inputs_run_successfully(
@pytest.mark.parametrize(
"example,expected_exception,expected_message",
[
("example3", ValueError, "not a valid Neurobagel data dictionary"),
("example3", ValueError, ["not a valid Neurobagel data dictionary"]),
(
"example_invalid",
ValueError,
"not a valid Neurobagel data dictionary",
["not a valid Neurobagel data dictionary"],
),
("example7", LookupError, ["not compatible"]),
("example8", ValueError, ["more than one column"]),
(
"example9",
LookupError,
[
"values not annotated in the data dictionary",
"'group': ['UNANNOTATED']",
],
),
("example7", LookupError, "not compatible"),
("example8", ValueError, "more than one column"),
],
)
def test_invalid_inputs_are_handled_gracefully(
Expand All @@ -64,7 +72,44 @@ def test_invalid_inputs_are_handled_gracefully(
catch_exceptions=False,
)

assert expected_message in str(e.value)
for substring in expected_message:
assert substring in str(e.value)


def test_unused_missing_values_raises_warning(
    runner,
    test_data,
    tmp_path,
):
    """
    Tests that running the pheno command on example10 emits exactly one UserWarning, and that
    the warning message names every annotated missing value absent from the phenotypic file.
    """
    cli_args = [
        "pheno",
        "--pheno",
        test_data / "example10.tsv",
        "--dictionary",
        test_data / "example10.json",
        "--output",
        tmp_path,
        "--name",
        "testing dataset",
    ]
    expected_substrings = [
        "missing values in the data dictionary were not found",
        "'group': ['NOT IN TSV']",
        "'tool_item1': ['NOT IN TSV 1', 'NOT IN TSV 2']",
        "'tool_item2': ['NOT IN TSV 1', 'NOT IN TSV 2']",
    ]

    with pytest.warns(UserWarning) as caught_warnings:
        runner.invoke(bagel, cli_args, catch_exceptions=False)

    # All unused missing values are expected to be reported in a single warning.
    assert len(caught_warnings) == 1
    warning_text = str(caught_warnings[0].message.args[0])
    for expected in expected_substrings:
        assert expected in warning_text


def test_that_output_file_contains_name(
Expand Down Expand Up @@ -116,7 +161,10 @@ def test_diagnosis_and_control_status_handled(
)
assert "diagnosis" not in pheno["hasSamples"][1].keys()
assert "diagnosis" not in pheno["hasSamples"][2].keys()
assert pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "purl:NCIT_C94342"
assert (
pheno["hasSamples"][2]["isSubjectGroup"]["identifier"]
== "purl:NCIT_C94342"
)


@pytest.mark.parametrize(
Expand All @@ -125,7 +173,7 @@ def test_diagnosis_and_control_status_handled(
def test_controlled_terms_have_identifiers(
attribute, runner, test_data, tmp_path, load_test_json
):
result = runner.invoke(
runner.invoke(
bagel,
[
"pheno",
Expand Down

0 comments on commit d5762fc

Please sign in to comment.