Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement checks for unannotated categorical column values + unused annotated missing values #115

Merged
merged 8 commits into from
Apr 13, 2023
66 changes: 65 additions & 1 deletion bagel/pheno_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from collections import defaultdict
from typing import Union

Expand Down Expand Up @@ -191,6 +192,50 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
return all([key in pheno_df.columns for key in data_dict.keys()])


def find_undefined_cat_col_values(
    data_dict: dict, pheno_df: pd.DataFrame
) -> dict:
    """
    Collects, for every categorical column of the data dictionary, the values present in the
    phenotypic table that are neither an annotated level nor an annotated missing value.

    Returns a dictionary mapping each affected column name to the list of its unannotated
    values. Columns whose values are all annotated are left out, so an empty dictionary
    means nothing is undefined.
    """
    undefined_by_column = {}
    for column_name, column_entry in data_dict.items():
        # Only categorical columns carry "Levels"; other columns are not checked here.
        if not is_column_categorical(column_name, data_dict):
            continue

        # A value counts as annotated if it is a level key or a declared missing value.
        annotated_values = list(column_entry["Levels"].keys()) + column_entry[
            "Annotations"
        ].get("MissingValues", [])

        unannotated_values = [
            observed
            for observed in pheno_df[column_name].unique()
            if observed not in annotated_values
        ]
        if unannotated_values:
            undefined_by_column[column_name] = unannotated_values

    return undefined_by_column


def find_unused_missing_values(
    data_dict: dict, pheno_df: pd.DataFrame
) -> dict:
    """
    Checks if missing values annotated in the data dictionary appear at least once in the phenotypic file.
    Returns a dictionary containing any column names and annotated missing values not found in the phenotypic
    file column.

    Every data dictionary entry must have an "Annotations" key (a KeyError is raised otherwise);
    its optional "MissingValues" list holds the values to look for. Columns without annotated
    missing values are skipped, and columns whose missing values all occur are omitted from the result.
    """
    all_unused_missing_vals = {}
    for col, attr in data_dict.items():
        annotated_missing_vals = attr["Annotations"].get("MissingValues", [])
        if not annotated_missing_vals:
            # Nothing to check; also avoids touching the phenotypic column needlessly.
            continue

        # Compute the column's distinct values once instead of once per missing value.
        # A set gives plain Python equality/hash semantics, avoiding NumPy's elementwise
        # comparison when a string is tested against a numeric column.
        observed_values = set(pheno_df[col].unique())

        unused_missing_vals = [
            missing_val
            for missing_val in annotated_missing_vals
            if missing_val not in observed_values
        ]
        if unused_missing_vals:
            all_unused_missing_vals[col] = unused_missing_vals

    return all_unused_missing_vals


def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
"""Determines whether input data are valid"""
try:
Expand Down Expand Up @@ -226,7 +271,26 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
raise LookupError(
"The provided data dictionary and phenotypic file are individually valid, "
"but are not compatible. Make sure that you selected the correct data "
"dictionary for your phenotyic file. Every column described in the data "
"dictionary for your phenotypic file. Every column described in the data "
"dictionary has to have a corresponding column with the same name in the "
"phenotypic file"
)

undefined_cat_col_values = find_undefined_cat_col_values(
data_dict, pheno_df
)
surchs marked this conversation as resolved.
Show resolved Hide resolved
if undefined_cat_col_values:
raise LookupError(
"Categorical column(s) in the phenotypic file have values not annotated in the data dictionary "
f"(shown as <column_name>: [<undefined values>]): {undefined_cat_col_values}. "
"Please check that the correct data dictionary has been selected or make sure to annotate the missing values."
)

unused_missing_values = find_unused_missing_values(data_dict, pheno_df)
if unused_missing_values:
warnings.warn(
"The following values annotated as missing values in the data dictionary were not found "
"in the corresponding phenotypic file column(s) (<column_name>: [<unused missing values>]): "
f"{unused_missing_values}. If this is not intentional, please check your data dictionary "
"and phenotypic file."
)
2 changes: 2 additions & 0 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,7 @@ Example inputs to the CLI
| invalid | valid, only exists to be used together with the (invalid) .json | invalid, missing the `"TermURL"` attribute for identifiers | fail |
| 7 | has fewer columns than are annotated in `.json` | same as example 1 | fail |
| 8 | valid, based on ex2 has multiple participant_id columns | valid, based on ex2 multiple participant_id column annotations | fail* |
| 9 | invalid, based on example 6 but contains an unannotated value for `group` | valid, based on example 6 | fail |
| 10 | valid, same as example 6 | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv | pass, with warning |

`* this is expected to fail until we enable multiple participant_ID handling`.
86 changes: 86 additions & 0 deletions bagel/tests/data/example10.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "bg:diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
},
"MissingValues": ["OTHER", "NOT IN TSV"]
}
},
"tool_item1": {
"Description": "item 1 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
}
},
"tool_item2": {
"Description": "item 2 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
}
},
"other_tool_item1": {
"Description": "item 1 scores for a different imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
}
}
}
7 changes: 7 additions & 0 deletions bagel/tests/data/example10.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
participant_id session_id group tool_item1 tool_item2 other_tool_item1
sub-01 ses-01 PAT 11.0 "missing" "none"
sub-01 ses-02 PAT "missing" 12.0 "none"
sub-02 ses-01 OTHER "missing" "missing" "none"
sub-02 ses-02 OTHER "missing" "missing" "none"
sub-03 ses-01 CTRL 10.0 8.0 "ok"
sub-03 ses-02 CTRL 10.0 8.0 "bad"
86 changes: 86 additions & 0 deletions bagel/tests/data/example9.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:ParticipantID",
"Label": "Unique participant identifier"
}
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "bg:SessionID",
"Label": "Unique session identifier"
}
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "bg:diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "purl:NCIT_C94342",
"Label": "Healthy Control"
}
},
"MissingValues": ["OTHER"]
}
},
"tool_item1": {
"Description": "item 1 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"tool_item2": {
"Description": "item 2 scores for an imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:1234",
"Label": "Imaginary tool"
},
"MissingValues": ["missing"]
}
},
"other_tool_item1": {
"Description": "item 1 scores for a different imaginary tool",
"Annotations": {
"IsAbout": {
"TermURL": "bg:Assessment",
"Label": "Assessment tool"
},
"IsPartOf": {
"TermURL": "cogAtlas:4321",
"Label": "A different imaginary tool"
},
"MissingValues": ["none"]
}
}
}
9 changes: 9 additions & 0 deletions bagel/tests/data/example9.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
participant_id session_id group tool_item1 tool_item2 other_tool_item1
sub-01 ses-01 PAT 11.0 "missing" "none"
sub-01 ses-02 PAT "missing" 12.0 "none"
sub-02 ses-01 OTHER "missing" "missing" "none"
sub-02 ses-02 OTHER "missing" "missing" "none"
sub-03 ses-01 CTRL 10.0 8.0 "ok"
sub-03 ses-02 CTRL 10.0 8.0 "bad"
sub-04 ses-01 UNANNOTATED 12.0 9.0 "ok"
sub-04 ses-02 UNANNOTATED 12.0 9.0 "bad"
62 changes: 55 additions & 7 deletions bagel/tests/test_cli_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,22 @@ def test_pheno_valid_inputs_run_successfully(
@pytest.mark.parametrize(
"example,expected_exception,expected_message",
[
("example3", ValueError, "not a valid Neurobagel data dictionary"),
("example3", ValueError, ["not a valid Neurobagel data dictionary"]),
(
"example_invalid",
ValueError,
"not a valid Neurobagel data dictionary",
["not a valid Neurobagel data dictionary"],
),
("example7", LookupError, ["not compatible"]),
("example8", ValueError, ["more than one column"]),
(
"example9",
LookupError,
[
"values not annotated in the data dictionary",
"'group': ['UNANNOTATED']",
],
),
("example7", LookupError, "not compatible"),
("example8", ValueError, "more than one column"),
],
)
def test_invalid_inputs_are_handled_gracefully(
Expand All @@ -64,7 +72,44 @@ def test_invalid_inputs_are_handled_gracefully(
catch_exceptions=False,
)

assert expected_message in str(e.value)
for substring in expected_message:
assert substring in str(e.value)


def test_unused_missing_values_raises_warning(
    runner,
    test_data,
    tmp_path,
):
    """
    Runs the pheno command on example10, whose data dictionary annotates missing values that
    never occur in the phenotypic file, and checks that exactly one warning is emitted and
    that its message names each affected column together with its unused missing values.
    """
    cli_args = [
        "pheno",
        "--pheno",
        test_data / "example10.tsv",
        "--dictionary",
        test_data / "example10.json",
        "--output",
        tmp_path,
        "--name",
        "testing dataset",
    ]
    expected_fragments = (
        "missing values in the data dictionary were not found",
        "'group': ['NOT IN TSV']",
        "'tool_item1': ['NOT IN TSV 1', 'NOT IN TSV 2']",
        "'tool_item2': ['NOT IN TSV 1', 'NOT IN TSV 2']",
    )

    with pytest.warns(UserWarning) as caught:
        runner.invoke(bagel, cli_args, catch_exceptions=False)

    # All unused missing values are expected to be reported in a single warning.
    assert len(caught) == 1
    for fragment in expected_fragments:
        assert fragment in str(caught[0].message.args[0])


def test_that_output_file_contains_name(
Expand Down Expand Up @@ -116,7 +161,10 @@ def test_diagnosis_and_control_status_handled(
)
assert "diagnosis" not in pheno["hasSamples"][1].keys()
assert "diagnosis" not in pheno["hasSamples"][2].keys()
assert pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "purl:NCIT_C94342"
assert (
pheno["hasSamples"][2]["isSubjectGroup"]["identifier"]
== "purl:NCIT_C94342"
)


@pytest.mark.parametrize(
Expand All @@ -125,7 +173,7 @@ def test_diagnosis_and_control_status_handled(
def test_controlled_terms_have_identifiers(
attribute, runner, test_data, tmp_path, load_test_json
):
result = runner.invoke(
runner.invoke(
alyssadai marked this conversation as resolved.
Show resolved Hide resolved
bagel,
[
"pheno",
Expand Down