From 733e7320dbb90991ad9dd0fb214ef845669e95b0 Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Tue, 11 Apr 2023 23:34:00 -0400
Subject: [PATCH 1/8] Implement check that all unique categorical column values
 are found in the data dictionary - removed assignment of test runner output to a
 variable for tests that don't use the result object

---
 bagel/pheno_utils.py           | 32 +++++++++++++
 bagel/tests/data/example9.json | 86 ++++++++++++++++++++++++++++++++++
 bagel/tests/data/example9.tsv  |  9 ++++
 bagel/tests/test_cli_pheno.py  | 12 ++++-
 4 files changed, 137 insertions(+), 2 deletions(-)
 create mode 100644 bagel/tests/data/example9.json
 create mode 100644 bagel/tests/data/example9.tsv
diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index 764c90ef..c8ed1b98 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -191,6 +191,28 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
     return all([key in pheno_df.columns for key in data_dict.keys()])
 
 
+def find_undefined_categorical_column_values(
+    data_dict: dict, pheno_df: pd.DataFrame
+) -> dict:
+    """
+    Returns a dictionary containing any categorical column names and specific column values not defined
+    in the corresponding data dictionary entry.
+    """
+    all_undefined_values = {}
+    for col in data_dict.keys():
+        if is_column_categorical(col, data_dict):
+            known_values = list(data_dict[col]["Levels"].keys()) + data_dict[
+                col
+            ]["Annotations"].get("MissingValues", [])
+            unknown_values = set(pheno_df[col].unique()).difference(
+                known_values
+            )
+            if unknown_values:
+                all_undefined_values[col] = unknown_values
+
+    return all_undefined_values
+
+
 def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
     """Determines whether input data are valid"""
     try:
@@ -230,3 +252,13 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
             "dictionary has to have a corresponding column with the same name in the "
             "phenotypic file"
         )
+
+    unknown_categorical_col_values = find_undefined_categorical_column_values(
+        data_dict, pheno_df
+    )
+    if unknown_categorical_col_values:
+        raise LookupError(
+            "Categorical column(s) in the phenotypic .tsv have values not found in the provided data dictionary "
+            f"(shown as <column_name>: {{<undefined values>}}): {unknown_categorical_col_values}. "
+            "Please check that the correct data dictionary has been selected."
+        )
diff --git a/bagel/tests/data/example9.json b/bagel/tests/data/example9.json
new file mode 100644
index 00000000..5ab22598
--- /dev/null
+++ b/bagel/tests/data/example9.json
@@ -0,0 +1,86 @@
+{
+    "participant_id": {
+        "Description": "A participant ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:ParticipantID",
+                "Label": "Unique participant identifier"
+            }
+        }
+    },
+    "session_id": {
+        "Description": "A session ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:SessionID",
+                "Label": "Unique session identifier"
+            }
+        }
+    },
+    "group": {
+        "Description": "Group variable",
+        "Levels": {
+            "PAT": "Patient",
+            "CTRL": "Control subject"
+        },
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:diagnosis",
+                "Label": "Diagnosis"
+            },
+            "Levels": {
+                "PAT": {
+                    "TermURL": "snomed:49049000",
+                    "Label": "Parkinson's disease"
+                },
+                "CTRL": {
+                    "TermURL": "purl:NCIT_C94342",
+                    "Label": "Healthy Control"
+                }
+            },
+            "MissingValues": ["OTHER"]
+        }
+    },
+    "tool_item1": {
+        "Description": "item 1 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing"]
+        }
+    },
+    "tool_item2": {
+        "Description": "item 2 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing"]
+        }
+    },
+    "other_tool_item1": {
+        "Description": "item 1 scores for a different imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:4321",
+                "Label": "A different imaginary tool"
+            },
+            "MissingValues": ["none"]
+        }
+    }
+}
\ No newline at end of file
diff --git a/bagel/tests/data/example9.tsv b/bagel/tests/data/example9.tsv
new file mode 100644
index 00000000..10453df2
--- /dev/null
+++ b/bagel/tests/data/example9.tsv
@@ -0,0 +1,9 @@
+participant_id	session_id	group	tool_item1	tool_item2	other_tool_item1
+sub-01	ses-01	PAT	11.0	"missing"	"none"
+sub-01	ses-02	PAT	"missing"	12.0	"none"
+sub-02	ses-01	OTHER	"missing"	"missing"	"none"
+sub-02	ses-02	OTHER	"missing"	"missing"	"none"
+sub-03	ses-01	CTRL	10.0	8.0	"ok"
+sub-03	ses-02	CTRL	10.0	8.0	"bad"
+sub-04	ses-01	SIB	12.0	9.0	"ok"
+sub-04	ses-02	SIB	12.0	9.0	"bad"
\ No newline at end of file
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
index 0d907e5e..19219b74 100644
--- a/bagel/tests/test_cli_pheno.py
+++ b/bagel/tests/test_cli_pheno.py
@@ -41,6 +41,11 @@ def test_pheno_valid_inputs_run_successfully(
         ),
         ("example7", LookupError, "not compatible"),
         ("example8", ValueError, "more than one column"),
+        (
+            "example9",
+            LookupError,
+            "values not found in the provided data dictionary (shown as <column_name>: {<undefined values>}): {'group': {'SIB'}}",
+        ),
     ],
 )
 def test_invalid_inputs_are_handled_gracefully(
@@ -116,7 +121,10 @@ def test_diagnosis_and_control_status_handled(
     )
     assert "diagnosis" not in pheno["hasSamples"][1].keys()
     assert "diagnosis" not in pheno["hasSamples"][2].keys()
-    assert pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "purl:NCIT_C94342"
+    assert (
+        pheno["hasSamples"][2]["isSubjectGroup"]["identifier"]
+        == "purl:NCIT_C94342"
+    )
 
 
 @pytest.mark.parametrize(
@@ -125,7 +133,7 @@ def test_diagnosis_and_control_status_handled(
 def test_controlled_terms_have_identifiers(
     attribute, runner, test_data, tmp_path, load_test_json
 ):
-    result = runner.invoke(
+    runner.invoke(
         bagel,
         [
             "pheno",

From b2eba70df582ced44bf4fd56f2976b4d61503113 Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Wed, 12 Apr 2023 03:48:55 -0400
Subject: [PATCH 2/8] Implement check for when annotated missing values are not
 found in the phenotypic file

---
 bagel/pheno_utils.py            | 30 ++++++++++++
 bagel/tests/data/example10.json | 86 +++++++++++++++++++++++++++++++++
 bagel/tests/data/example10.tsv  |  7 +++
 bagel/tests/test_cli_pheno.py   | 33 +++++++++++++
 4 files changed, 156 insertions(+)
 create mode 100644 bagel/tests/data/example10.json
 create mode 100644 bagel/tests/data/example10.tsv

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index c8ed1b98..b6ef58f3 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -1,3 +1,4 @@
+import warnings
 from collections import defaultdict
 from typing import Union
 
@@ -213,6 +214,26 @@ def find_undefined_categorical_column_values(
     return all_undefined_values
 
 
+def find_unused_missing_values(
+    data_dict: dict, pheno_df: pd.DataFrame
+) -> dict:
+    """
+    Checks if missing values annotated in the data dictionary appear at least once in the phenotypic file.
+    Returns a dictionary containing any column names and annotated missing values not found in the phenotypic
+    file column.
+    """
+    all_unused_missing_vals = {}
+    for col, attr in data_dict.items():
+        unused_missing_vals = []
+        for missing_val in attr["Annotations"].get("MissingValues", []):
+            if missing_val not in pheno_df[col].unique():
+                unused_missing_vals.append(missing_val)
+        if unused_missing_vals:
+            all_unused_missing_vals[col] = unused_missing_vals
+
+    return all_unused_missing_vals
+
+
 def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
     """Determines whether input data are valid"""
     try:
@@ -262,3 +283,12 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
             f"(shown as <column_name>: {{<undefined values>}}): {unknown_categorical_col_values}. "
             "Please check that the correct data dictionary has been selected."
         )
+
+    unused_missing_values = find_unused_missing_values(data_dict, pheno_df)
+    if unused_missing_values:
+        warnings.warn(
+            "The following values annotated as missing values in the data dictionary were not found "
+            "in the corresponding phenotypic file column(s) (<column_name>: [<unused missing values>]): "
+            f"{unused_missing_values}. If this is not intentional, please check your data dictionary "
+            "and phenotypic file."
+        )
diff --git a/bagel/tests/data/example10.json b/bagel/tests/data/example10.json
new file mode 100644
index 00000000..b07e8f0d
--- /dev/null
+++ b/bagel/tests/data/example10.json
@@ -0,0 +1,86 @@
+{
+    "participant_id": {
+        "Description": "A participant ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:ParticipantID",
+                "Label": "Unique participant identifier"
+            }
+        }
+    },
+    "session_id": {
+        "Description": "A session ID",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:SessionID",
+                "Label": "Unique session identifier"
+            }
+        }
+    },
+    "group": {
+        "Description": "Group variable",
+        "Levels": {
+            "PAT": "Patient",
+            "CTRL": "Control subject"
+        },
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:diagnosis",
+                "Label": "Diagnosis"
+            },
+            "Levels": {
+                "PAT": {
+                    "TermURL": "snomed:49049000",
+                    "Label": "Parkinson's disease"
+                },
+                "CTRL": {
+                    "TermURL": "purl:NCIT_C94342",
+                    "Label": "Healthy Control"
+                }
+            },
+            "MissingValues": ["OTHER", "MISSING"]
+        }
+    },
+    "tool_item1": {
+        "Description": "item 1 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing", "none", ""]
+        }
+    },
+    "tool_item2": {
+        "Description": "item 2 scores for an imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:1234",
+                "Label": "Imaginary tool"
+            },
+            "MissingValues": ["missing", "none", ""]
+        }
+    },
+    "other_tool_item1": {
+        "Description": "item 1 scores for a different imaginary tool",
+        "Annotations": {
+            "IsAbout": {
+                "TermURL": "bg:Assessment",
+                "Label": "Assessment tool"
+            },
+            "IsPartOf": {
+                "TermURL": "cogAtlas:4321",
+                "Label": "A different imaginary tool"
+            },
+            "MissingValues": ["none"]
+        }
+    }
+}
\ No newline at end of file
diff --git a/bagel/tests/data/example10.tsv b/bagel/tests/data/example10.tsv
new file mode 100644
index 00000000..5bf84f13
--- /dev/null
+++ b/bagel/tests/data/example10.tsv
@@ -0,0 +1,7 @@
+participant_id	session_id	group	tool_item1	tool_item2	other_tool_item1
+sub-01	ses-01	PAT	11.0	"missing"	"none"
+sub-01	ses-02	PAT	"missing"	12.0	"none"
+sub-02	ses-01	OTHER	"missing"	"missing"	"none"
+sub-02	ses-02	OTHER	"missing"	"missing"	"none"
+sub-03	ses-01	CTRL	10.0	8.0	"ok"
+sub-03	ses-02	CTRL	10.0	8.0	"bad"
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
index 19219b74..6eb2a3c6 100644
--- a/bagel/tests/test_cli_pheno.py
+++ b/bagel/tests/test_cli_pheno.py
@@ -72,6 +72,39 @@ def test_invalid_inputs_are_handled_gracefully(
     assert expected_message in str(e.value)
 
 
+def test_unused_missing_values_raises_warning(
+    runner,
+    test_data,
+    tmp_path,
+):
+    """
+    Tests that an informative warning is raised when annotated missing values are not found in the
+    phenotypic file.
+    """
+    with pytest.warns(UserWarning) as w:
+        runner.invoke(
+            bagel,
+            [
+                "pheno",
+                "--pheno",
+                test_data / "example10.tsv",
+                "--dictionary",
+                test_data / "example10.json",
+                "--output",
+                tmp_path,
+                "--name",
+                "testing dataset",
+            ],
+            catch_exceptions=False,
+        )
+
+    assert len(w) == 1
+    assert (
+        "missing values in the data dictionary were not found in the corresponding phenotypic file column(s) "
+        "(<column_name>: [<unused missing values>]): {'group': ['MISSING'], 'tool_item1': ['none', ''], 'tool_item2': ['none', '']}"
+    ) in str(w[0].message.args[0])
+
+
 def test_that_output_file_contains_name(
     runner, test_data, tmp_path, load_test_json
 ):

From 9e01e5733e14d977fe5832b3be187154bff14548 Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Wed, 12 Apr 2023 03:55:21 -0400
Subject: [PATCH 3/8] fix typos, update docstrings/naming conventions for
 clarity

---
 bagel/pheno_utils.py          | 17 +++++++++--------
 bagel/tests/test_cli_pheno.py |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index b6ef58f3..ebf14064 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -196,8 +196,9 @@ def find_undefined_categorical_column_values(
     data_dict: dict, pheno_df: pd.DataFrame
 ) -> dict:
     """
-    Returns a dictionary containing any categorical column names and specific column values not defined
-    in the corresponding data dictionary entry.
+    Checks that all categorical column values have annotations. Returns a dictionary containing
+    any categorical column names and specific column values not defined in the corresponding data
+    dictionary entry.
     """
     all_undefined_values = {}
     for col in data_dict.keys():
@@ -269,18 +270,18 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
         raise LookupError(
             "The provided data dictionary and phenotypic file are individually valid, "
             "but are not compatible. Make sure that you selected the correct data "
-            "dictionary for your phenotyic file. Every column described in the data "
+            "dictionary for your phenotypic file. Every column described in the data "
             "dictionary has to have a corresponding column with the same name in the "
             "phenotypic file"
         )
 
-    unknown_categorical_col_values = find_undefined_categorical_column_values(
-        data_dict, pheno_df
+    undefined_categorical_col_values = (
+        find_undefined_categorical_column_values(data_dict, pheno_df)
     )
-    if unknown_categorical_col_values:
+    if undefined_categorical_col_values:
         raise LookupError(
-            "Categorical column(s) in the phenotypic .tsv have values not found in the provided data dictionary "
-            f"(shown as <column_name>: {{<undefined values>}}): {unknown_categorical_col_values}. "
+            "Categorical column(s) in the phenotypic file have values not found in the data dictionary "
+            f"(shown as <column_name>: {{<undefined values>}}): {undefined_categorical_col_values}. "
             "Please check that the correct data dictionary has been selected."
         )
 
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
index 6eb2a3c6..3d97e693 100644
--- a/bagel/tests/test_cli_pheno.py
+++ b/bagel/tests/test_cli_pheno.py
@@ -44,7 +44,7 @@ def test_pheno_valid_inputs_run_successfully(
         (
             "example9",
             LookupError,
-            "values not found in the provided data dictionary (shown as <column_name>: {<undefined values>}): {'group': {'SIB'}}",
+            "values not found in the data dictionary (shown as <column_name>: {<undefined values>}): {'group': {'SIB'}}",
         ),
     ],
 )

From 8d6da45f9dd22fb6f6a31a7fe5a784bbdef82331 Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Wed, 12 Apr 2023 04:14:10 -0400
Subject: [PATCH 4/8] refactor find_undefined_categorical_column_values

---
 bagel/pheno_utils.py          | 22 ++++++++++++++--------
 bagel/tests/test_cli_pheno.py |  2 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index ebf14064..b219950b 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -201,14 +201,20 @@ def find_undefined_categorical_column_values(
     dictionary entry.
     """
     all_undefined_values = {}
-    for col in data_dict.keys():
+    for col, attr in data_dict.items():
         if is_column_categorical(col, data_dict):
-            known_values = list(data_dict[col]["Levels"].keys()) + data_dict[
-                col
-            ]["Annotations"].get("MissingValues", [])
-            unknown_values = set(pheno_df[col].unique()).difference(
-                known_values
-            )
+            known_values = list(attr["Levels"].keys()) + attr[
+                "Annotations"
+            ].get("MissingValues", [])
+            # NOTE: (also applies to find_unused_missing_values) The below comparison block could also be
+            # accomplished using difference of sets, however due to the unordered nature of the resultant set
+            # of unknown values, the order of specific values may be different than they appear in the input,
+            # leading to unexpected failed assertions / hard-to-formulate expected user messages in testing.
+            # To keep things simple, loops and basic conditionals are used here instead.
+            unknown_values = []
+            for value in pheno_df[col].unique():
+                if value not in known_values:
+                    unknown_values.append(value)
             if unknown_values:
                 all_undefined_values[col] = unknown_values
 
@@ -281,7 +287,7 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
     if undefined_categorical_col_values:
         raise LookupError(
             "Categorical column(s) in the phenotypic file have values not found in the data dictionary "
-            f"(shown as <column_name>: {{<undefined values>}}): {undefined_categorical_col_values}. "
+            f"(shown as <column_name>: [<undefined values>]): {undefined_categorical_col_values}. "
             "Please check that the correct data dictionary has been selected."
         )
 
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
index 3d97e693..a0734591 100644
--- a/bagel/tests/test_cli_pheno.py
+++ b/bagel/tests/test_cli_pheno.py
@@ -44,7 +44,7 @@ def test_pheno_valid_inputs_run_successfully(
         (
             "example9",
             LookupError,
-            "values not found in the data dictionary (shown as <column_name>: {<undefined values>}): {'group': {'SIB'}}",
+            "values not found in the data dictionary (shown as <column_name>: [<undefined values>]): {'group': ['SIB']}",
         ),
     ],
 )

From 72676659cb131ba4b129ab6b0e51e9da3038650f Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Wed, 12 Apr 2023 04:19:03 -0400
Subject: [PATCH 5/8] update test data README

---
 bagel/tests/data/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
index b17ed18a..5d27db95 100644
--- a/bagel/tests/data/README.md
+++ b/bagel/tests/data/README.md
@@ -13,5 +13,7 @@ Example inputs to the CLI
 | invalid | valid, only exists to be used together with the (invalid) .json    | invalid, missing the `"TermURL"` attribute for identifiers                       | fail   |
 | 7       | has fewer columns than are annotated in `.json`                    | same as example 1                                                                | fail   |
 | 8       | valid, based on ex2 has multiple participant_id columns            | valid, based on ex2 multiple participant_id column annotations                   | fail*  |
+| 9       | valid, same as example 6            | invalid, based on example 6 but contains an unannotated value for `group`                   | fail  |
+| 10       | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv           | valid, same as example 6                   | pass, with warning  |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
\ No newline at end of file

From 1584e1657c4bb8a02c9a3d998e6f71fd5d842564 Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Wed, 12 Apr 2023 17:42:22 -0400
Subject: [PATCH 6/8] Apply suggestions for user messages from code review

Co-authored-by: Sebastian Urchs <surchs@users.noreply.github.com>
---
 bagel/pheno_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index b219950b..22e0ee15 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -286,9 +286,9 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
     )
     if undefined_categorical_col_values:
         raise LookupError(
-            "Categorical column(s) in the phenotypic file have values not found in the data dictionary "
+            "Categorical column(s) in the phenotypic file have values not annotated in the data dictionary "
             f"(shown as <column_name>: [<undefined values>]): {undefined_categorical_col_values}. "
-            "Please check that the correct data dictionary has been selected."
+            "Please check that the correct data dictionary has been selected or make sure to annotate the missing values."
         )
 
     unused_missing_values = find_unused_missing_values(data_dict, pheno_df)

From 8aecbd54629ac8eb92dca5e6f631f63fa8a27de1 Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Thu, 13 Apr 2023 02:11:12 -0400
Subject: [PATCH 7/8] update tests, remove unnecessary comment

---
 bagel/pheno_utils.py            |  5 -----
 bagel/tests/data/README.md      |  4 ++--
 bagel/tests/data/example10.json |  6 +++---
 bagel/tests/data/example9.tsv   |  4 ++--
 bagel/tests/test_cli_pheno.py   | 27 +++++++++++++++++----------
 5 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index 22e0ee15..bccffa76 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -206,11 +206,6 @@ def find_undefined_categorical_column_values(
             known_values = list(attr["Levels"].keys()) + attr[
                 "Annotations"
             ].get("MissingValues", [])
-            # NOTE: (also applies to find_unused_missing_values) The below comparison block could also be
-            # accomplished using difference of sets, however due to the unordered nature of the resultant set
-            # of unknown values, the order of specific values may be different than they appear in the input,
-            # leading to unexpected failed assertions / hard-to-formulate expected user messages in testing.
-            # To keep things simple, loops and basic conditionals are used here instead.
             unknown_values = []
             for value in pheno_df[col].unique():
                 if value not in known_values:
diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
index 5d27db95..f731ae67 100644
--- a/bagel/tests/data/README.md
+++ b/bagel/tests/data/README.md
@@ -13,7 +13,7 @@ Example inputs to the CLI
 | invalid | valid, only exists to be used together with the (invalid) .json    | invalid, missing the `"TermURL"` attribute for identifiers                       | fail   |
 | 7       | has fewer columns than are annotated in `.json`                    | same as example 1                                                                | fail   |
 | 8       | valid, based on ex2 has multiple participant_id columns            | valid, based on ex2 multiple participant_id column annotations                   | fail*  |
-| 9       | valid, same as example 6            | invalid, based on example 6 but contains an unannotated value for `group`                   | fail  |
-| 10       | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv           | valid, same as example 6                   | pass, with warning  |
+| 9       | invalid, based on example 6 but contains an unannotated value for `group`            | valid, based on example 6                   | fail  |
+| 10       | valid, same as example 6           | valid, based on example 6 but contains extra `"MissingValues"` not found in the .tsv                   | pass, with warning  |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
\ No newline at end of file
diff --git a/bagel/tests/data/example10.json b/bagel/tests/data/example10.json
index b07e8f0d..62a1b7cb 100644
--- a/bagel/tests/data/example10.json
+++ b/bagel/tests/data/example10.json
@@ -38,7 +38,7 @@
                     "Label": "Healthy Control"
                 }
             },
-            "MissingValues": ["OTHER", "MISSING"]
+            "MissingValues": ["OTHER", "NOT IN TSV"]
         }
     },
     "tool_item1": {
@@ -52,7 +52,7 @@
                 "TermURL": "cogAtlas:1234",
                 "Label": "Imaginary tool"
             },
-            "MissingValues": ["missing", "none", ""]
+            "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
         }
     },
     "tool_item2": {
@@ -66,7 +66,7 @@
                 "TermURL": "cogAtlas:1234",
                 "Label": "Imaginary tool"
             },
-            "MissingValues": ["missing", "none", ""]
+            "MissingValues": ["missing", "NOT IN TSV 1", "NOT IN TSV 2"]
         }
     },
     "other_tool_item1": {
diff --git a/bagel/tests/data/example9.tsv b/bagel/tests/data/example9.tsv
index 10453df2..1ce1ecbe 100644
--- a/bagel/tests/data/example9.tsv
+++ b/bagel/tests/data/example9.tsv
@@ -5,5 +5,5 @@ sub-02	ses-01	OTHER	"missing"	"missing"	"none"
 sub-02	ses-02	OTHER	"missing"	"missing"	"none"
 sub-03	ses-01	CTRL	10.0	8.0	"ok"
 sub-03	ses-02	CTRL	10.0	8.0	"bad"
-sub-04	ses-01	SIB	12.0	9.0	"ok"
-sub-04	ses-02	SIB	12.0	9.0	"bad"
\ No newline at end of file
+sub-04	ses-01	UNANNOTATED	12.0	9.0	"ok"
+sub-04	ses-02	UNANNOTATED	12.0	9.0	"bad"
\ No newline at end of file
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
index a0734591..2ade0590 100644
--- a/bagel/tests/test_cli_pheno.py
+++ b/bagel/tests/test_cli_pheno.py
@@ -33,18 +33,21 @@ def test_pheno_valid_inputs_run_successfully(
 @pytest.mark.parametrize(
     "example,expected_exception,expected_message",
     [
-        ("example3", ValueError, "not a valid Neurobagel data dictionary"),
+        ("example3", ValueError, ["not a valid Neurobagel data dictionary"]),
         (
             "example_invalid",
             ValueError,
-            "not a valid Neurobagel data dictionary",
+            ["not a valid Neurobagel data dictionary"],
         ),
-        ("example7", LookupError, "not compatible"),
-        ("example8", ValueError, "more than one column"),
+        ("example7", LookupError, ["not compatible"]),
+        ("example8", ValueError, ["more than one column"]),
         (
             "example9",
             LookupError,
-            "values not found in the data dictionary (shown as <column_name>: [<undefined values>]): {'group': ['SIB']}",
+            [
+                "values not annotated in the data dictionary",
+                "'group': ['UNANNOTATED']",
+            ],
         ),
     ],
 )
@@ -69,7 +72,8 @@ def test_invalid_inputs_are_handled_gracefully(
             catch_exceptions=False,
         )
 
-    assert expected_message in str(e.value)
+    for substring in expected_message:
+        assert substring in str(e.value)
 
 
 def test_unused_missing_values_raises_warning(
@@ -99,10 +103,13 @@ def test_unused_missing_values_raises_warning(
         )
 
     assert len(w) == 1
-    assert (
-        "missing values in the data dictionary were not found in the corresponding phenotypic file column(s) "
-        "(<column_name>: [<unused missing values>]): {'group': ['MISSING'], 'tool_item1': ['none', ''], 'tool_item2': ['none', '']}"
-    ) in str(w[0].message.args[0])
+    for warn_substring in [
+        "missing values in the data dictionary were not found",
+        "'group': ['NOT IN TSV']",
+        "'tool_item1': ['NOT IN TSV 1', 'NOT IN TSV 2']",
+        "'tool_item2': ['NOT IN TSV 1', 'NOT IN TSV 2']",
+    ]:
+        assert warn_substring in str(w[0].message.args[0])
 
 
 def test_that_output_file_contains_name(

From ab590be8bb80d695be052ad0e8984e0d045b07bf Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Thu, 13 Apr 2023 02:22:20 -0400
Subject: [PATCH 8/8] simplify function/variable names

---
 bagel/pheno_utils.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index bccffa76..6721cb9a 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -192,7 +192,7 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:
     return all([key in pheno_df.columns for key in data_dict.keys()])
 
 
-def find_undefined_categorical_column_values(
+def find_undefined_cat_col_values(
     data_dict: dict, pheno_df: pd.DataFrame
 ) -> dict:
     """
@@ -276,13 +276,13 @@ def validate_inputs(data_dict: dict, pheno_df: pd.DataFrame) -> None:
             "phenotypic file"
         )
 
-    undefined_categorical_col_values = (
-        find_undefined_categorical_column_values(data_dict, pheno_df)
+    undefined_cat_col_values = find_undefined_cat_col_values(
+        data_dict, pheno_df
     )
-    if undefined_categorical_col_values:
+    if undefined_cat_col_values:
         raise LookupError(
             "Categorical column(s) in the phenotypic file have values not annotated in the data dictionary "
-            f"(shown as <column_name>: [<undefined values>]): {undefined_categorical_col_values}. "
+            f"(shown as <column_name>: [<undefined values>]): {undefined_cat_col_values}. "
             "Please check that the correct data dictionary has been selected or make sure to annotate the missing values."
         )