Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support reading encounter-level external labels #9

Merged
merged 1 commit into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 60 additions & 19 deletions chart_review/external.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,54 @@
"""Match external document references & symptoms to Label Studio data"""
"""Match external document references & labels to Label Studio data"""

import csv
import enum
import os
import sys
from typing import Optional

from chart_review import simplify


def _load_csv_symptoms(filename: str) -> dict[str, list[str]]:
class IdentifierType(enum.Enum):
    """Kind of identifier held in the first column of an external label CSV."""

    DOCREF = enum.auto()  # first column holds docref IDs
    ENCOUNTER = enum.auto()  # first column holds encounter IDs


def _load_csv_labels(filename: str) -> tuple[IdentifierType, dict[str, list[str]]]:
    """
    Loads a csv and returns a list of labels per row ID.

    CSV format is two columns, where the first is docref/encounter id and the second is a single
    label. The header of the first column decides the ID type: a name containing "doc" means
    DocRef IDs, a name containing "enc" means Encounter IDs (both case-insensitive). Anything
    else (including a missing or empty header) prints a warning to stderr and falls back to
    DocRef IDs.

    Blank lines are skipped. A row with an empty or missing second column still registers its ID,
    just with no labels (i.e. "this row was examined, but nothing positive was found").

    :param filename: path of the csv file to read
    :returns: id_type, {row_id -> list of labels for that ID}
    """
    id_to_labels: dict[str, list[str]] = {}

    with open(filename, "r", newline="", encoding="utf8") as csvfile:
        reader = csv.reader(csvfile)

        header = next(reader, None)  # should be [row_id, label]
        # An empty file gives us no header at all, and a leading blank line gives us an empty
        # one. Treat both like an unrecognized column name rather than crashing.
        id_column = header[0] if header else ""
        id_header = id_column.lower()
        if "doc" in id_header:
            id_type = IdentifierType.DOCREF
        elif "enc" in id_header:
            id_type = IdentifierType.ENCOUNTER
        else:
            print(f"Unrecognized ID column '{id_column}'. Will assume DocRef ID.", file=sys.stderr)
            id_type = IdentifierType.DOCREF

        for row in reader:
            if not row:  # csv.reader yields an empty list for blank lines
                continue
            label_list = id_to_labels.setdefault(row[0], [])
            # allow for no labels for a row (no positive labels found)
            if len(row) > 1 and row[1]:
                label_list.append(row[1])

    return id_type, id_to_labels


def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) -> Optional[int]:
"""Looks at the metadata in Label Studio and grabs the note ID that holds the provided docref"""
"""Looks at the metadata in LS and grabs the note ID that holds the provided docref"""
for row in exported_json:
mappings = row.get("data", {}).get("docref_mappings", {})
for key, value in mappings.items():
Expand All @@ -41,35 +59,58 @@ def _docref_id_to_label_studio_id(exported_json: list[dict], docref_id: str) ->
return None


def _encounter_id_to_label_studio_id(exported_json: list[dict], enc_id: str) -> Optional[int]:
    """
    Finds the Label Studio note ID for the first exported note tied to the given encounter.

    Each note's "data" metadata is checked for an "enc_id" or "anon_id" equal to `enc_id`.
    Returns that note's "id" as an int, or None if no exported note matches.
    """

    def _holds_encounter(note: dict) -> bool:
        metadata = note.get("data", {})
        real_id = metadata.get("enc_id")
        anon_id = metadata.get("anon_id")
        # Either flavor of ID is accepted. A real ID is very unlikely to collide with an
        # anonymous one (real IDs won't look like our long anonymous ID hash).
        return real_id == enc_id or anon_id == enc_id

    found_ids = (int(note["id"]) for note in exported_json if _holds_encounter(note))
    return next(found_ids, None)


def _row_id_to_label_studio_id(
    exported_json: list[dict], id_type: IdentifierType, row_id: str
) -> Optional[int]:
    """
    Finds the Label Studio note ID for an external row ID of the given type.

    Encounter IDs go through the encounter lookup; every other ID type goes through the
    docref lookup. Returns whatever the chosen lookup returns.
    """
    lookup = (
        _encounter_id_to_label_studio_id
        if id_type == IdentifierType.ENCOUNTER
        else _docref_id_to_label_studio_id
    )
    return lookup(exported_json, row_id)


def merge_external(
    simple: dict, exported_json: list[dict], project_dir: str, name: str, config: dict
) -> dict:
    """
    Reads one external annotator's csv file and folds its labels into a simple dict.

    :param simple: the existing simple dict to merge into
    :param exported_json: the Label Studio export, used to translate csv IDs into note IDs
    :param project_dir: folder that the configured filename is relative to
    :param name: the annotator name to file the external labels under
    :param config: this annotator's config; its "filename" entry names the csv file
    :returns: the result of merging the external annotations into `simple`

    Exits the program if the config has no filename, or if the first exported note lacks
    the "docref_mappings" metadata.
    """
    filename = config.get("filename")
    if not filename:
        sys.exit(f"Did not understand config for external annotator '{name}'")
    id_type, label_map = _load_csv_labels(os.path.join(project_dir, filename))

    # Peek at just the first exported note to see if it has the metadata we'll need.
    for first_note in exported_json[:1]:
        if "docref_mappings" not in first_note.get("data", {}):
            sys.exit(
                f"Your Label Studio export does not include DocRef/Encounter ID mapping metadata!\n"
                f"Consider re-uploading your notes using Cumulus ETL's chart-review command."
            )

    # Translate each csv row ID into a Label Studio note ID, dropping IDs with no matching note.
    files = {}
    annotations = {}
    for row_id, label_list in label_map.items():
        ls_id = _row_id_to_label_studio_id(exported_json, id_type, row_id)
        if ls_id is None:
            continue

        files[ls_id] = ls_id
        per_annotator = annotations.setdefault(ls_id, {})
        per_annotator.setdefault(name, []).append({"labels": label_list})

    # Merge into existing simple dictionary
    return simplify.merge_simple(simple, {"files": files, "annotations": annotations})
6 changes: 4 additions & 2 deletions tests/data/external/config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
annotators:
human: 1
icd10:
filename: icd.csv
icd10-doc:
filename: doc.csv
icd10-enc:
filename: enc.csv
3 changes: 2 additions & 1 deletion tests/data/external/icd.csv → tests/data/external/doc.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
docref_id,symptom
blarg_id,symptom
"ABC","happy"
"ABC","tired"
"ABC-Enc","ignored"
"Anon-ABC","hungry"
"Unmatched","lost"
"No-Symptoms",
7 changes: 7 additions & 0 deletions tests/data/external/enc.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
ENC_ID,SYMPTOM
"ABC-Enc","happy"
"ABC","ignored"
"ABC-Enc","tired"
"Anon-ABC-Enc","hungry"
"Unmatched","lost"
"No-Symptoms-Enc",
6 changes: 6 additions & 0 deletions tests/data/external/labelstudio-export.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
}
],
"data": {
"enc_id": "ABC-Enc",
"anon_id": "Anon-ABC-Enc",
"docref_mappings": {
"ABC": "Anon-ABC"
}
Expand All @@ -41,6 +43,8 @@
}
],
"data": {
"enc_id": "Not-In-External-Enc",
"anon_id": "Not-In-External-Enc",
"docref_mappings": {
"Not-In-External": "Not-In-External"
}
Expand All @@ -56,6 +60,8 @@
}
],
"data": {
"enc_id": "No-Symptoms-Enc",
"anon_id": "Anon-No-Symptoms-Enc",
"docref_mappings": {
"No-Symptoms": "Anon-No-Symptoms"
}
Expand Down
9 changes: 6 additions & 3 deletions tests/test_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def test_basic_read(self):
],
# icd10 labels are split into two lists,
# because we used two different docrefs (anon & real)
"icd10": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
"icd10-doc": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
"icd10-enc": [{"labels": ["happy", "tired"]}, {"labels": ["hungry"]}],
},
# This was a note that didn't appear in the icd10 external annotations
# (and also didn't have a positive label by the human reviewer).
Expand All @@ -45,7 +46,8 @@ def test_basic_read(self):
# but no labels for this note"
3: {
"human": [],
"icd10": [],
"icd10-doc": [],
"icd10-enc": [],
},
},
},
Expand All @@ -56,7 +58,8 @@ def test_basic_read(self):
self.assertEqual(
{
"human": [1, 2, 3],
"icd10": [1, 3],
"icd10-doc": [1, 3],
"icd10-enc": [1, 3],
},
reader.note_range,
)