diff --git a/chart_review/external.py b/chart_review/external.py
index d72a7b4..5c6f4d7 100644
--- a/chart_review/external.py
+++ b/chart_review/external.py
@@ -1,6 +1,7 @@
-"""Match external document references & symptoms to Label Studio data"""
+"""Match external document references & labels to Label Studio data"""
 
 import csv
+import enum
 import os
 import sys
 from typing import Optional
@@ -8,29 +9,46 @@
 from chart_review import simplify
 
 
-def _load_csv_symptoms(filename: str) -> dict[str, list[str]]:
+class IdentifierType(enum.Enum):
+    DOCREF = enum.auto()
+    ENCOUNTER = enum.auto()
+
+
+def _load_csv_labels(filename: str) -> tuple[IdentifierType, dict[str, list[str]]]:
     """
-    Loads a csv and returns a list of symptoms per docref.
+    Loads a csv and returns a list of labels per row.
+
+    CSV format is two columns, where the first is docref/encounter id and the second is a single
+    label.
 
-    CSV format is two columns, where the first is docref id and the second is a single symptom.
-    Returns docref_id -> list of symptoms for that ID
+    Returns id_type, {row_id -> list of labels for that ID}
     """
-    docref_to_symptoms = {}
+    id_to_labels = {}
 
     with open(filename, "r", newline="", encoding="utf8") as csvfile:
         reader = csv.reader(csvfile)
-        next(reader, None)  # skip header row
-        for row in reader:  # row should be [docref_id, symptom]
+
+        header = next(reader, None) or [""]  # should be [row_id, label]; [""] if file is empty
+        id_header = header[0].lower()
+        if "doc" in id_header:
+            id_type = IdentifierType.DOCREF
+        elif "enc" in id_header:
+            id_type = IdentifierType.ENCOUNTER
+        else:
+            print(f"Unrecognized ID column '{header[0]}'. Will assume DocRef ID.", file=sys.stderr)
+            id_type = IdentifierType.DOCREF
+
+        for row in reader:
-            docref_id = row[0]
-            symptom_list = docref_to_symptoms.setdefault(docref_id, [])
-            if row[1]:  # allow for no labels for a docref (no positive symptoms found)
-                symptom_list.append(row[1])
+            row_id = row[0]
+            label_list = id_to_labels.setdefault(row_id, [])
+            if row[1]:  # allow for no labels for a row (no positive labels found)
+                label_list.append(row[1])
 
-    return docref_to_symptoms
+    return id_type, id_to_labels
 
 
 def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> Optional[int]:
-    """Looks at the metadata in Label Studio and grabs the note ID that holds the provided docref"""
+    """Looks at the metadata in LS and grabs the note ID that holds the provided docref"""
     for row in exported_json:
         mappings = row.get("data", {}).get("docref_mappings", {})
         for key, value in mappings.items():
@@ -41,13 +59,36 @@ def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) ->
     return None
 
 
+def _encounter_id_to_label_studio_id(exported_json: list[dict], enc_id: str) -> Optional[int]:
+    """Looks at the metadata in LS and grabs the note ID that holds the provided encounter"""
+    for row in exported_json:
+        row_data = row.get("data", {})
+        row_enc_id = row_data.get("enc_id")
+        row_anon_id = row_data.get("anon_id")
+        # Allow either an anonymous ID or the real ID -- collisions seem very unlikely
+        # (i.e. real IDs aren't going to be formatted like our long anonymous ID hash)
+        if row_enc_id == enc_id or row_anon_id == enc_id:
+            return int(row["id"])
+    return None
+
+
+def _row_id_to_label_studio_id(
+    exported_json: list[dict], id_type: IdentifierType, row_id: str
+) -> Optional[int]:
+    """Looks at the metadata in LS and grabs the note ID that holds the provided ID"""
+    if id_type == IdentifierType.ENCOUNTER:
+        return _encounter_id_to_label_studio_id(exported_json, row_id)
+    else:
+        return _docref_id_to_label_studio_id(exported_json, row_id)
+
+
 def merge_external(
     simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict
 ) -> dict:
     """Loads an external csv file annotator and merges them into an existing simple dict"""
     if filename := config.get("filename"):
         full_filename = os.path.join(project_dir, filename)
-        symptom_map = _load_csv_symptoms(full_filename)
+        id_type, label_map = _load_csv_labels(full_filename)
     else:
         sys.exit(f"Did not understand config for external annotator '{name}'")
 
@@ -55,21 +96,21 @@ def merge_external(
     for row in exported_json:
         if "docref_mappings" not in row.get("data", {}):
             sys.exit(
-                f"Your Label Studio export does not include DocRef ID mapping metadata!\n"
+                f"Your Label Studio export does not include DocRef/Encounter ID mapping metadata!\n"
                 f"Consider re-uploading your notes using Cumulus ETL's chart-review command."
             )
         break  # just inspect one
 
-    # Convert each docref_id into an LS id:
+    # Convert each row id into an LS id:
     external_simple = {"files": {}, "annotations": {}}
-    for docref_id, symptom_list in symptom_map.items():
-        ls_id = _docref_id_to_label_studio_id(exported_json, docref_id)
+    for row_id, label_list in label_map.items():
+        ls_id = _row_id_to_label_studio_id(exported_json, id_type, row_id)
         if ls_id is None:
             continue
 
         external_simple["files"][ls_id] = ls_id
         annotation_list = external_simple["annotations"].setdefault(ls_id, {}).setdefault(name, [])
-        annotation_list.append({"labels": symptom_list})
+        annotation_list.append({"labels": label_list})
 
     # Merge into existing simple dictionary
     return simplify.merge_simple(simple, external_simple)
diff --git a/tests/data/external/config.yaml b/tests/data/external/config.yaml
index f8e1d1e..7d4fe09 100644
--- a/tests/data/external/config.yaml
+++ b/tests/data/external/config.yaml
@@ -1,4 +1,6 @@
 annotators:
   human: 1
-  icd10:
-    filename: icd.csv
+  icd10-doc:
+    filename: doc.csv
+  icd10-enc:
+    filename: enc.csv
diff --git a/tests/data/external/icd.csv b/tests/data/external/doc.csv
similarity index 68%
rename from tests/data/external/icd.csv
rename to tests/data/external/doc.csv
index 634e41a..81822d6 100644
--- a/tests/data/external/icd.csv
+++ b/tests/data/external/doc.csv
@@ -1,6 +1,7 @@
-docref_id,symptom
+blarg_id,symptom
 "ABC","happy"
 "ABC","tired"
+"ABC-Enc","ignored"
 "Anon-ABC","hungry"
 "Unmatched","lost"
 "No-Symptoms",
diff --git a/tests/data/external/enc.csv b/tests/data/external/enc.csv
new file mode 100644
index 0000000..1d71323
--- /dev/null
+++ b/tests/data/external/enc.csv
@@ -0,0 +1,7 @@
+ENC_ID,SYMPTOM
+"ABC-Enc","happy"
+"ABC","ignored"
+"ABC-Enc","tired"
+"Anon-ABC-Enc","hungry"
+"Unmatched","lost"
+"No-Symptoms-Enc",
diff --git a/tests/data/external/labelstudio-export.json b/tests/data/external/labelstudio-export.json
index 717adad..cdd6580 100644
--- a/tests/data/external/labelstudio-export.json
+++ b/tests/data/external/labelstudio-export.json
@@ -26,6 +26,8 @@
       }
     ],
     "data": {
+      "enc_id": "ABC-Enc",
+      "anon_id": "Anon-ABC-Enc",
       "docref_mappings": {
         "ABC": "Anon-ABC"
       }
@@ -41,6 +43,8 @@
       }
     ],
     "data": {
+      "enc_id": "Not-In-External-Enc",
+      "anon_id": "Not-In-External-Enc",
       "docref_mappings": {
         "Not-In-External": "Not-In-External"
       }
@@ -56,6 +60,8 @@
       }
     ],
     "data": {
+      "enc_id": "No-Symptoms-Enc",
+      "anon_id": "Anon-No-Symptoms-Enc",
       "docref_mappings": {
         "No-Symptoms": "Anon-No-Symptoms"
       }
diff --git a/tests/test_external.py b/tests/test_external.py
index 61362f2..a41d66f 100644
--- a/tests/test_external.py
+++ b/tests/test_external.py
@@ -33,7 +33,8 @@ def test_basic_read(self):
                         ],
                         # icd10 labels are split into two lists,
                         # because we used two different docrefs (anon & real)
-                        "icd10": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
+                        "icd10-doc": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
+                        "icd10-enc": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
                     },
                     # This was a note that didn't appear in the icd10 external annotations
                     # (and also didn't have a positive label by the human reviewer).
@@ -45,7 +46,8 @@ def test_basic_read(self):
                     # but no labels for this note"
                     3: {
                         "human": [],
-                        "icd10": [],
+                        "icd10-doc": [],
+                        "icd10-enc": [],
                     },
                 },
             },
@@ -56,7 +58,8 @@ def test_basic_read(self):
         self.assertEqual(
             {
                 "human": [1, 2, 3],
-                "icd10": [1, 3],
+                "icd10-doc": [1, 3],
+                "icd10-enc": [1, 3],
             },
             reader.note_range,
         )