smart-on-fhir · mikix · Dec 29, 2023 · Dec 27, 2023
diff --git a/chart_review/external.py b/chart_review/external.py
@@ -1,36 +1,54 @@
-"""Match external document references & symptoms to Label Studio data"""
+"""Match external document references & labels to Label Studio data"""
 
 import csv
+import enum
 import os
 import sys
 from typing import Optional
 
 from chart_review import simplify
 
 
-def _load_csv_symptoms(filename: str) -> dict[str, list[str]]:
+@enum.unique
+class IdentifierType(enum.Enum):
+    """Which kind of ID the first column of an external csv holds"""
+
+    DOCREF = 1
+    ENCOUNTER = 2
+
+
+def _load_csv_labels(filename: str) -> tuple[IdentifierType, dict[str, list[str]]]:
     """
-    Loads a csv and returns a list of symptoms per docref.
+    Loads a csv and returns a list of labels per row ID.
+
+    CSV format is two columns, where the first is docref/encounter id and the second is a single
+    label.
+
+    The header of the first column picks the ID type: a name containing "doc" means DocRef IDs,
+    a name containing "enc" means Encounter IDs. Anything else (including a missing header)
+    falls back to DocRef IDs, with a warning printed to stderr.
+
+    Blank lines are skipped. A row whose label column is empty or missing still records its ID,
+    just with no labels.
 
-    CSV format is two columns, where the first is docref id and the second is a single symptom.
-    Returns docref_id -> list of symptoms for that ID
+    Returns id_type, {row_id -> list of labels for that ID}
     """
-    docref_to_symptoms = {}
+    id_to_labels = {}
 
     with open(filename, "r", newline="", encoding="utf8") as csvfile:
         reader = csv.reader(csvfile)
-        next(reader, None)  # skip header row
-        for row in reader:  # row should be [docref_id, symptom]
-            docref_id = row[0]
-            symptom_list = docref_to_symptoms.setdefault(docref_id, [])
-            if row[1]:  # allow for no labels for a docref (no positive symptoms found)
-                symptom_list.append(row[1])
+
+        # An empty file has no header at all (None) and a blank first line parses as [].
+        # Treat both as an unnamed ID column instead of crashing on header[0].
+        header = next(reader, None) or [""]  # should be [row_id, label]
+        id_header = header[0].lower()
+        if "doc" in id_header:
+            id_type = IdentifierType.DOCREF
+        elif "enc" in id_header:
+            id_type = IdentifierType.ENCOUNTER
+        else:
+            print(f"Unrecognized ID column '{header[0]}'. Will assume DocRef ID.", file=sys.stderr)
+            id_type = IdentifierType.DOCREF
+
+        for row in reader:
+            if not row:  # csv.reader yields [] for blank lines
+                continue
+            row_id = row[0]
+            label_list = id_to_labels.setdefault(row_id, [])
+            # allow for no labels for a row (no positive labels found)
+            if len(row) > 1 and row[1]:
+                label_list.append(row[1])
 
-    return docref_to_symptoms
+    return id_type, id_to_labels
 
 
 def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> Optional[int]:
-    """Looks at the metadata in Label Studio and grabs the note ID that holds the provided docref"""
+    """Looks at the metadata in LS and grabs the note ID that holds the provided docref"""
     for row in exported_json:
         mappings = row.get("data", {}).get("docref_mappings", {})
         for key, value in mappings.items():
@@ -41,35 +59,58 @@ def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) ->
     return None
 
 
+def _encounter_id_to_label_studio_id(exported_json: list[dict], enc_id: str) -> Optional[int]:
+    """Finds the LS note whose encounter metadata matches enc_id and returns that note's ID"""
+    for note in exported_json:
+        metadata = note.get("data", {})
+        # Both the real and the anonymized encounter ID are accepted. A real ID is very unlikely
+        # to be formatted like one of our long anonymous ID hashes, so collisions are not a worry.
+        known_ids = (metadata.get("enc_id"), metadata.get("anon_id"))
+        if enc_id not in known_ids:
+            continue
+        return int(note["id"])
+    return None
+
+
+def _row_id_to_label_studio_id(
+    exported_json: list[dict], id_type: IdentifierType, row_id: str
+) -> Optional[int]:
+    """Resolves a csv row ID to the LS note ID, searching by the given kind of ID"""
+    # Anything that is not explicitly an encounter ID is looked up as a docref ID.
+    lookup = (
+        _encounter_id_to_label_studio_id
+        if id_type == IdentifierType.ENCOUNTER
+        else _docref_id_to_label_studio_id
+    )
+    return lookup(exported_json, row_id)
+
+
 def merge_external(
     simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict
 ) -> dict:
-    """Loads an external csv file annotator and merges them into an existing simple dict"""
+    """
+    Loads an external csv file annotator and merges its labels into an existing simple dict.
+
+    The csv named by config["filename"] (relative to project_dir) is read, each of its row IDs
+    is resolved to a Label Studio note ID using the metadata in exported_json, and the labels
+    are recorded under the annotator `name`. Row IDs that match no note are skipped.
+
+    Exits the process if config has no filename, or if the first exported note lacks the
+    metadata needed to resolve the kind of ID (DocRef or Encounter) that the csv uses.
+
+    Returns the result of merging the external annotations into `simple`.
+    """
     if filename := config.get("filename"):
         full_filename = os.path.join(project_dir, filename)
-        symptom_map = _load_csv_symptoms(full_filename)
+        id_type, label_map = _load_csv_labels(full_filename)
     else:
         sys.exit(f"Did not understand config for external annotator '{name}'")
 
     # Inspect exported json to see if it has the metadata we'll need.
+    # Encounter IDs are matched against enc_id/anon_id and DocRef IDs against docref_mappings,
+    # so only require the fields that this csv's ID type is actually looked up in.
     for row in exported_json:
-        if "docref_mappings" not in row.get("data", {}):
+        row_data = row.get("data", {})
+        if id_type == IdentifierType.ENCOUNTER:
+            has_metadata = "enc_id" in row_data or "anon_id" in row_data
+        else:
+            has_metadata = "docref_mappings" in row_data
+        if not has_metadata:
             sys.exit(
-                f"Your Label Studio export does not include DocRef ID mapping metadata!\n"
+                f"Your Label Studio export does not include DocRef/Encounter ID mapping metadata!\n"
                 f"Consider re-uploading your notes using Cumulus ETL's chart-review command."
             )
         break  # just inspect one
 
-    # Convert each docref_id into an LS id:
+    # Convert each row id into an LS id:
     external_simple = {"files": {}, "annotations": {}}
-    for docref_id, symptom_list in symptom_map.items():
-        ls_id = _docref_id_to_label_studio_id(exported_json, docref_id)
+    for row_id, label_list in label_map.items():
+        ls_id = _row_id_to_label_studio_id(exported_json, id_type, row_id)
         if ls_id is None:
             continue
 
         external_simple["files"][ls_id] = ls_id
         annotation_list = external_simple["annotations"].setdefault(ls_id, {}).setdefault(name, [])
-        annotation_list.append({"labels": symptom_list})
+        annotation_list.append({"labels": label_list})
 
     # Merge into existing simple dictionary
     return simplify.merge_simple(simple, external_simple)
diff --git a/tests/data/external/config.yaml b/tests/data/external/config.yaml
@@ -1,4 +1,6 @@
 annotators:
   human: 1
-  icd10:
-    filename: icd.csv
+  icd10-doc:
+    filename: doc.csv
+  icd10-enc:
+    filename: enc.csv
diff --git a/tests/data/external/icd.csv → tests/data/external/doc.csv b/tests/data/external/icd.csv → tests/data/external/doc.csv
@@ -1,6 +1,7 @@
-docref_id,symptom
+blarg_id,symptom
 "ABC","happy"
 "ABC","tired"
+"ABC-Enc","ignored"
 "Anon-ABC","hungry"
 "Unmatched","lost"
 "No-Symptoms",
diff --git a/tests/data/external/enc.csv b/tests/data/external/enc.csv
@@ -0,0 +1,7 @@
+ENC_ID,SYMPTOM
+"ABC-Enc","happy"
+"ABC","ignored"
+"ABC-Enc","tired"
+"Anon-ABC-Enc","hungry"
+"Unmatched","lost"
+"No-Symptoms-Enc",
diff --git a/tests/data/external/labelstudio-export.json b/tests/data/external/labelstudio-export.json
@@ -26,6 +26,8 @@
       }
     ],
     "data": {
+      "enc_id": "ABC-Enc",
+      "anon_id": "Anon-ABC-Enc",
       "docref_mappings": {
         "ABC": "Anon-ABC"
       }
@@ -41,6 +43,8 @@
       }
     ],
     "data": {
+      "enc_id": "Not-In-External-Enc",
+      "anon_id": "Not-In-External-Enc",
       "docref_mappings": {
         "Not-In-External": "Not-In-External"
       }
@@ -56,6 +60,8 @@
       }
     ],
     "data": {
+      "enc_id": "No-Symptoms-Enc",
+      "anon_id": "Anon-No-Symptoms-Enc",
       "docref_mappings": {
         "No-Symptoms": "Anon-No-Symptoms"
       }

diff --git a/tests/test_external.py b/tests/test_external.py
@@ -33,7 +33,8 @@ def test_basic_read(self):
                             ],
                             # icd10 labels are split into two lists,
                             # because we used two different docrefs (anon & real)
-                            "icd10": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
+                            "icd10-doc": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
+                            "icd10-enc": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
                         },
                         # This was a note that didn't appear in the icd10 external annotations
                         # (and also didn't have a positive label by the human reviewer).
@@ -45,7 +46,8 @@ def test_basic_read(self):
                         # but no labels for this note"
                         3: {
                             "human": [],
-                            "icd10": [],
+                            "icd10-doc": [],
+                            "icd10-enc": [],
                         },
                     },
                 },
@@ -56,7 +58,8 @@ def test_basic_read(self):
             self.assertEqual(
                 {
                     "human": [1, 2, 3],
-                    "icd10": [1, 3],
+                    "icd10-doc": [1, 3],
+                    "icd10-enc": [1, 3],
                 },
                 reader.note_range,
             )