neurobagel · alyssadai · Apr 13, 2023 · Apr 12, 2023 · Apr 12, 2023 · Apr 12, 2023
diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
@@ -1,3 +1,4 @@
+import warnings
 from collections import defaultdict
 from typing import Union
 
@@ -191,6 +192,50 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
     return all([key in pheno_df.columns for key in data_dict.keys()])
 
 
+def find_undefined_cat_col_values(
+    data_dict: dict, pheno_df: pd.DataFrame
+) -> dict:
+    """
+    Collects, for every categorical column, the values present in the phenotypic file that are
+    neither a key of the column's "Levels" nor listed in its annotated "MissingValues".
+
+    Returns a dictionary mapping each affected column name to the list of its undefined values;
+    the dictionary is empty when every categorical column value is accounted for.
+    """
+    undefined_by_column = {}
+    for name, entry in data_dict.items():
+        if not is_column_categorical(name, data_dict):
+            continue
+        levels = list(entry["Levels"].keys())
+        annotated_missing = entry["Annotations"].get("MissingValues", [])
+        allowed = levels + annotated_missing
+        undefined = [v for v in pheno_df[name].unique() if v not in allowed]
+        if undefined:
+            undefined_by_column[name] = undefined
+
+    return undefined_by_column
+
+
+def find_unused_missing_values(
+    data_dict: dict, pheno_df: pd.DataFrame
+) -> dict:
+    """
+    Checks if missing values annotated in the data dictionary appear at least once in the phenotypic file.
+    Returns a dictionary mapping each column name to the list of its annotated missing values (in annotation
+    order) that are not found in that phenotypic file column. Columns with no unused values are omitted.
+    """
+    all_unused_missing_vals = {}
+    for col, attr in data_dict.items():
+        missing_vals = attr["Annotations"].get("MissingValues", [])
+        # Collect the column's values once, and only when needed. Membership in a set is a plain
+        # hash/equality lookup, unlike `in` on the array from .unique() which compares elementwise.
+        observed = set(pheno_df[col].unique()) if missing_vals else set()
+        unused_missing_vals = [v for v in missing_vals if v not in observed]
+        if unused_missing_vals:
+            all_unused_missing_vals[col] = unused_missing_vals
+    return all_unused_missing_vals
+
+
 def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
     """Determines whether input data are valid"""
     try:
@@ -226,7 +271,26 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
         raise LookupError(
             "The provided data dictionary and phenotypic file are individually valid, "
             "but are not compatible. Make sure that you selected the correct data "
-            "dictionary for your phenotyic file. Every column described in the data "
+            "dictionary for your phenotypic file. Every column described in the data "
             "dictionary has to have a corresponding column with the same name in the "
             "phenotypic file"
         )
+
+    undefined_cat_col_values = find_undefined_cat_col_values(
+        data_dict, pheno_df
+    )
+    if undefined_cat_col_values:
+        raise LookupError(
+            "Categorical column(s) in the phenotypic file have values not annotated in the data dictionary "
+            f"(shown as <column_name>: [<undefined values>]): {undefined_cat_col_values}. "
+            "Please check that the correct data dictionary has been selected or make sure to annotate the missing values."
+        )
+
+    unused_missing_values = find_unused_missing_values(data_dict, pheno_df)
+    if unused_missing_values:
+        warnings.warn(
+            "The following values annotated as missing values in the data dictionary were not found "
+            "in the corresponding phenotypic file column(s) (<column_name>: [<unused missing values>]): "
+            f"{unused_missing_values}. If this is not intentional, please check your data dictionary "
+            "and phenotypic file."
+        )
diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
@@ -13,5 +13,7 @@ Example inputs to the CLI
 | invalid | valid, only exists to be used together with the (invalid) .json    | invalid, missing the `"TermURL"` attribute for identifiers                       | fail   |
 | 7       | has fewer columns than are annotated in `.json`                    | same as example 1                                                                | fail   |
 | 8       | valid, based on ex2 has multiple participant_id columns            | valid, based on ex2 multiple participant_id column annotations                   | fail*  |
+| 9       | invalid, based on example 6 but contains an unannotated value for `group`            | valid, based on example 6                   | fail  |
+| 10       | valid, same as example 6           | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv                   | pass, with warning  |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
diff --git a/bagel/tests/data/example10.json b/bagel/tests/data/example10.json
@@ -0,0 +1,86 @@
+{
+    "participant_id": {
+        "Description": "A participant ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:ParticipantID",
+                "Label": "Unique participant identifier"
+            }
+        }
+    },
+    "session_id": {
+        "Description": "A session ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:SessionID",
+                "Label": "Unique session identifier"
+            }
+        }
+    },
+    "group": {
+        "Description": "Group variable",
+        "Levels": {
+            "PAT": "Patient",
+            "CTRL": "Control subject"
+        },
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:diagnosis",
+                "Label": "Diagnosis"
+            },
+            "Levels": {
+                "PAT": {
+                    "TermURL": "snomed:49049000",
+                    "Label": "Parkinson's disease"
+                },
+                "CTRL": {
+                    "TermURL": "purl:NCIT_C94342",
+                    "Label": "Healthy Control"
+                }
+            },
+            "MissingValues": ["OTHER", "NOT IN TSV"]
+        }
+    },
+    "tool_item1": {
+        "Description": "item 1 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
+        }
+    },
+    "tool_item2": {
+        "Description": "item 2 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
+        }
+    },
+    "other_tool_item1": {
+        "Description": "item 1 scores for a different imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:4321",
+                "Label": "A different imaginary tool"
+            },
+            "MissingValues": ["none"]
+        }
+    }
+}
diff --git a/bagel/tests/data/example10.tsv b/bagel/tests/data/example10.tsv
@@ -0,0 +1,7 @@
+participant_id	session_id	group	tool_item1	tool_item2	other_tool_item1
+sub-01	ses-01	PAT	11.0	"missing"	"none"
+sub-01	ses-02	PAT	"missing"	12.0	"none"
+sub-02	ses-01	OTHER	"missing"	"missing"	"none"
+sub-02	ses-02	OTHER	"missing"	"missing"	"none"
+sub-03	ses-01	CTRL	10.0	8.0	"ok"
+sub-03	ses-02	CTRL	10.0	8.0	"bad"
diff --git a/bagel/tests/data/example9.json b/bagel/tests/data/example9.json
@@ -0,0 +1,86 @@
+{
+    "participant_id": {
+        "Description": "A participant ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:ParticipantID",
+                "Label": "Unique participant identifier"
+            }
+        }
+    },
+    "session_id": {
+        "Description": "A session ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:SessionID",
+                "Label": "Unique session identifier"
+            }
+        }
+    },
+    "group": {
+        "Description": "Group variable",
+        "Levels": {
+            "PAT": "Patient",
+            "CTRL": "Control subject"
+        },
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:diagnosis",
+                "Label": "Diagnosis"
+            },
+            "Levels": {
+                "PAT": {
+                    "TermURL": "snomed:49049000",
+                    "Label": "Parkinson's disease"
+                },
+                "CTRL": {
+                    "TermURL": "purl:NCIT_C94342",
+                    "Label": "Healthy Control"
+                }
+            },
+            "MissingValues": ["OTHER"]
+        }
+    },
+    "tool_item1": {
+        "Description": "item 1 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing"]
+        }
+    },
+    "tool_item2": {
+        "Description": "item 2 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing"]
+        }
+    },
+    "other_tool_item1": {
+        "Description": "item 1 scores for a different imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:4321",
+                "Label": "A different imaginary tool"
+            },
+            "MissingValues": ["none"]
+        }
+    }
+}
diff --git a/bagel/tests/data/example9.tsv b/bagel/tests/data/example9.tsv
@@ -0,0 +1,9 @@
+participant_id	session_id	group	tool_item1	tool_item2	other_tool_item1
+sub-01	ses-01	PAT	11.0	"missing"	"none"
+sub-01	ses-02	PAT	"missing"	12.0	"none"
+sub-02	ses-01	OTHER	"missing"	"missing"	"none"
+sub-02	ses-02	OTHER	"missing"	"missing"	"none"
+sub-03	ses-01	CTRL	10.0	8.0	"ok"
+sub-03	ses-02	CTRL	10.0	8.0	"bad"
+sub-04	ses-01	UNANNOTATED	12.0	9.0	"ok"
+sub-04	ses-02	UNANNOTATED	12.0	9.0	"bad"
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
@@ -33,14 +33,22 @@ def test_pheno_valid_inputs_run_successfully(
 @pytest.mark.parametrize(
     "example,expected_exception,expected_message",
     [
-        ("example3", ValueError, "not a valid Neurobagel data dictionary"),
+        ("example3", ValueError, ["not a valid Neurobagel data dictionary"]),
         (
             "example_invalid",
             ValueError,
-            "not a valid Neurobagel data dictionary",
+            ["not a valid Neurobagel data dictionary"],
+        ),
+        ("example7", LookupError, ["not compatible"]),
+        ("example8", ValueError, ["more than one column"]),
+        (
+            "example9",
+            LookupError,
+            [
+                "values not annotated in the data dictionary",
+                "'group': ['UNANNOTATED']",
+            ],
         ),
-        ("example7", LookupError, "not compatible"),
-        ("example8", ValueError, "more than one column"),
     ],
 )
 def test_invalid_inputs_are_handled_gracefully(
@@ -64,7 +72,44 @@ def test_invalid_inputs_are_handled_gracefully(
             catch_exceptions=False,
         )
 
-    assert expected_message in str(e.value)
+    for substring in expected_message:
+        assert substring in str(e.value)
+
+
+def test_unused_missing_values_raises_warning(runner, test_data, tmp_path):
+    """
+    Tests that exactly one informative UserWarning is raised when annotated missing values are not
+    found in the phenotypic file (example10), and that the warning message names each affected
+    column together with its unused missing values.
+    """
+    with pytest.warns(UserWarning) as w:
+        runner.invoke(
+            bagel,
+            [
+                "pheno",
+                "--pheno",
+                test_data / "example10.tsv",
+                "--dictionary",
+                test_data / "example10.json",
+                "--output",
+                tmp_path,
+                "--name",
+                "testing dataset",
+            ],
+            catch_exceptions=False,
+        )
+
+    # pytest.warns records every warning raised inside the block, so unrelated categories
+    # (e.g. deprecation warnings from dependencies) are filtered out before counting.
+    user_warnings = [rec for rec in w if issubclass(rec.category, UserWarning)]
+    assert len(user_warnings) == 1
+    for warn_substring in [
+        "missing values in the data dictionary were not found",
+        "'group': ['NOT IN TSV']",
+        "'tool_item1': ['NOT IN TSV 1', 'NOT IN TSV 2']",
+        "'tool_item2': ['NOT IN TSV 1', 'NOT IN TSV 2']",
+    ]:
+        assert warn_substring in str(user_warnings[0].message.args[0])
 
 
 def test_that_output_file_contains_name(
@@ -116,7 +161,10 @@ def test_diagnosis_and_control_status_handled(
     )
     assert "diagnosis" not in pheno["hasSamples"][1].keys()
     assert "diagnosis" not in pheno["hasSamples"][2].keys()
-    assert pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "purl:NCIT_C94342"
+    assert (
+        pheno["hasSamples"][2]["isSubjectGroup"]["identifier"]
+        == "purl:NCIT_C94342"
+    )
 
 
 @pytest.mark.parametrize(
@@ -125,7 +173,7 @@ def test_diagnosis_and_control_status_handled(
 def test_controlled_terms_have_identifiers(
     attribute, runner, test_data, tmp_path, load_test_json
 ):
-    result = runner.invoke(
+    runner.invoke(
         bagel,
         [
             "pheno",