Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: make i2b2 transforms faster by skipping fhirclient #158

Merged
1 commit merged on Jan 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions cumulus/loaders/i2b2/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from functools import partial
from typing import Callable, Iterable, List, TypeVar

from fhirclient.models.resource import Resource

from cumulus import store
from cumulus.loaders.base import Loader
from cumulus.loaders.i2b2 import extract, schema, transform
Expand All @@ -16,7 +14,7 @@
AnyDimension = TypeVar("AnyDimension", bound=schema.Dimension)
I2b2ExtractorCallable = Callable[[], Iterable[schema.Dimension]]
CsvToI2b2Callable = Callable[[str], Iterable[schema.Dimension]]
I2b2ToFhirCallable = Callable[[AnyDimension], Resource]
I2b2ToFhirCallable = Callable[[AnyDimension], dict]


class I2b2Loader(Loader):
Expand Down Expand Up @@ -110,10 +108,10 @@ def _loop(self, i2b2_entries: Iterable[schema.Dimension], to_fhir: I2b2ToFhirCal
# Now write each FHIR resource line by line to the output
# (we do this all line by line via generators to avoid loading everything in memory at once)
for resource in fhir_resources:
if resource.id in ids:
if resource["id"] in ids:
continue
ids.add(resource.id)
json.dump(resource.as_json(), output_file)
ids.add(resource["id"])
json.dump(resource, output_file)
output_file.write("\n")

###################################################################################################################
Expand Down
337 changes: 120 additions & 217 deletions cumulus/loaders/i2b2/transform.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","category":[{"coding":[{"code":"laboratory","system":"http:\/\/terminology.hl7.org\/CodeSystem\/observation-category"}]}],"code":{"coding":[{"code":"94500-6","system":"http:\/\/loinc.org"}]},"effectiveDateTime":"2020-03-20","encounter":{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"},"status":"unknown","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"valueCodeableConcept":{"coding":[{"code":"272519000","display":"Absent","system":"http:\/\/snomed.info\/sct"}]},"resourceType":"Observation"}
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","category":[{"coding":[{"code":"laboratory","system":"http:\/\/terminology.hl7.org\/CodeSystem\/observation-category"}]}],"code":{"coding":[{"code":"LAB:1","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"effectiveDateTime":"2020-03-20","encounter":{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"},"status":"unknown","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"valueCodeableConcept":{"coding":[{"code":"See Image","display":"See Image","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"Observation"}
4 changes: 2 additions & 2 deletions tests/data/simple/i2b2-input/observation_fact_lab_views.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"ENCOUNTER_NUM","PATIENT_NUM","CONCEPT_CD","PROVIDER_ID","START_DATE","MODIFIER_CD","INSTANCE_NUM","VALTYPE_CD","TVAL_CHAR","NVAL_NUM","VALUEFLAG_CD","QUANTITY_NUM","UNITS_CD","END_DATE","LOCATION_CD","OBSERVATION_BLOB","CONFIDENCE_NUM","UPDATE_DATE","DOWNLOAD_DATE","IMPORT_DATE","SOURCESYSTEM_CD","UPLOAD_ID","TEXT_SEARCH_INDEX"
22,323456,LAB:1043473617,"52",2020-03-19 01:50:00,@,42,T,See Image,,@,,NOT DEFINED IN SOURCE,2020-03-19 01:50:00,,"",,,,2021-05-29 00:00:00,LAB,,
25,323456,LAB:1043473617,"52",2020-03-20 01:50:00,@,43,T,See Image,,@,,NOT DEFINED IN SOURCE,2020-03-20 01:50:00,,"",,,,2021-05-30 00:00:00,LAB,,
22,323456,LAB:1043473617,"52",2020-03-19 01:50:00,@,42,T,Absent,,@,,NOT DEFINED IN SOURCE,2020-03-19 01:50:00,,"",,,,2021-05-29 00:00:00,LAB,,
25,323456,LAB:1,"52",2020-03-20 01:50:00,@,43,T,See Image,,@,,NOT DEFINED IN SOURCE,2020-03-20 01:50:00,,"",,,,2021-05-30 00:00:00,LAB,,
2 changes: 1 addition & 1 deletion tests/data/simple/ndjson-input/Observation.ndjson
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"id":"42","category":[{"coding":[{"code":"laboratory","system":"http://terminology.hl7.org/CodeSystem/observation-category"}]}],"code":{"coding":[{"code":"94500-6","system":"http:\/\/loinc.org"}]},"effectiveDateTime":"2020-03-19","encounter":{"reference":"Encounter\/22"},"status":"unknown","subject":{"reference":"Patient\/323456"},"valueCodeableConcept":{"coding":[{"code":"272519000","display":"Absent","system":"http:\/\/snomed.info\/sct"}]},"resourceType":"Observation"}
{"id":"43","category":[{"coding":[{"code":"laboratory","system":"http://terminology.hl7.org/CodeSystem/observation-category"}]}],"code":{"coding":[{"code":"94500-6","system":"http:\/\/loinc.org"}]},"effectiveDateTime":"2020-03-20","encounter":{"reference":"Encounter\/25"},"status":"unknown","subject":{"reference":"Patient\/323456"},"valueCodeableConcept":{"coding":[{"code":"272519000","display":"Absent","system":"http:\/\/snomed.info\/sct"}]},"resourceType":"Observation"}
{"id":"43","category":[{"coding":[{"code":"laboratory","system":"http://terminology.hl7.org/CodeSystem/observation-category"}]}],"code":{"coding":[{"code":"LAB:1","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"effectiveDateTime":"2020-03-20","encounter":{"reference":"Encounter\/25"},"status":"unknown","subject":{"reference":"Patient\/323456"},"valueCodeableConcept":{"coding":[{"code":"See Image","display":"See Image","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"Observation"}
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"id":"76da69dede003b4ceff5dc4921f838f3f8e583ef1e999cedc4bbe30c4d6d0940","category":[{"coding":[{"code":"laboratory","system":"http:\/\/terminology.hl7.org\/CodeSystem\/observation-category"}]}],"code":{"coding":[{"code":"94500-6","system":"http:\/\/loinc.org"}]},"effectiveDateTime":"2020-03-19","encounter":{"reference":"Encounter\/175e9941-2607-ad5f-76ab-14759da618fd"},"status":"unknown","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"valueCodeableConcept":{"coding":[{"code":"272519000","display":"Absent","system":"http:\/\/snomed.info\/sct"}]},"resourceType":"Observation"}
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","category":[{"coding":[{"code":"laboratory","system":"http:\/\/terminology.hl7.org\/CodeSystem\/observation-category"}]}],"code":{"coding":[{"code":"94500-6","system":"http:\/\/loinc.org"}]},"effectiveDateTime":"2020-03-20","encounter":{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"},"status":"unknown","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"valueCodeableConcept":{"coding":[{"code":"272519000","display":"Absent","system":"http:\/\/snomed.info\/sct"}]},"resourceType":"Observation"}
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","category":[{"coding":[{"code":"laboratory","system":"http:\/\/terminology.hl7.org\/CodeSystem\/observation-category"}]}],"code":{"coding":[{"code":"LAB:1","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"effectiveDateTime":"2020-03-20","encounter":{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"},"status":"unknown","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"valueCodeableConcept":{"coding":[{"code":"See Image","display":"See Image","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"Observation"}
10 changes: 5 additions & 5 deletions tests/i2b2_mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def patient_dim() -> transform.PatientDimension:
)


def patient() -> transform.Patient:
def patient() -> dict:
return transform.to_fhir_patient(patient_dim())


Expand All @@ -33,7 +33,7 @@ def encounter_dim() -> transform.VisitDimension:
)


def encounter() -> transform.Encounter:
def encounter() -> dict:
return transform.to_fhir_encounter(encounter_dim())


Expand All @@ -49,7 +49,7 @@ def condition_dim() -> transform.ObservationFact:
)


def condition() -> transform.Condition:
def condition() -> dict:
return transform.to_fhir_condition(condition_dim())


Expand All @@ -67,7 +67,7 @@ def documentreference_dim() -> transform.ObservationFact:
)


def documentreference() -> transform.DocumentReference:
def documentreference() -> dict:
return transform.to_fhir_documentreference(documentreference_dim())


Expand All @@ -85,5 +85,5 @@ def observation_dim() -> transform.ObservationFact:
)


def observation() -> transform.Observation:
def observation() -> dict:
return transform.to_fhir_observation_lab(observation_dim())
12 changes: 3 additions & 9 deletions tests/test_i2b2_oracle_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,6 @@ def test_loader(self, mock_extract):
set(os.listdir(tmpdir.name)),
)

self.assertEqual(
i2b2_mock_data.condition().as_json(), common.read_json(os.path.join(tmpdir.name, "Condition.ndjson"))
)
self.assertEqual(
i2b2_mock_data.encounter().as_json(), common.read_json(os.path.join(tmpdir.name, "Encounter.ndjson"))
)
self.assertEqual(
i2b2_mock_data.patient().as_json(), common.read_json(os.path.join(tmpdir.name, "Patient.ndjson"))
)
self.assertEqual(i2b2_mock_data.condition(), common.read_json(os.path.join(tmpdir.name, "Condition.ndjson")))
self.assertEqual(i2b2_mock_data.encounter(), common.read_json(os.path.join(tmpdir.name, "Encounter.ndjson")))
self.assertEqual(i2b2_mock_data.patient(), common.read_json(os.path.join(tmpdir.name, "Patient.ndjson")))
95 changes: 31 additions & 64 deletions tests/test_i2b2_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import unittest

import ddt
from fhirclient.models.fhirdate import FHIRDate

from cumulus.loaders.i2b2 import transform
from tests import i2b2_mock_data
Expand All @@ -13,19 +12,15 @@
class TestI2b2Transform(unittest.TestCase):
"""Test case for converting from i2b2 to FHIR"""

# Pylint doesn't like subscripting some lists in our created objects, not sure why yet.
# pylint: disable=unsubscriptable-object

def test_to_fhir_patient(self):
subject = i2b2_mock_data.patient()
# print(json.dumps(subject, indent=4))

# print(json.dumps(pat_fhir.as_json(), indent=4))

self.assertEqual(str(12345), subject.id)
self.assertEqual("2005-06-07", subject.birthDate.isostring)
self.assertEqual("female", subject.gender)
self.assertEqual(str(12345), subject["id"])
self.assertEqual("2005-06-07", subject["birthDate"])
self.assertEqual("female", subject["gender"])
# pylint: disable-next=unsubscriptable-object
self.assertEqual("02115", subject.address[0].postalCode)
self.assertEqual("02115", subject["address"][0]["postalCode"])

@ddt.data(
("Black or African American", "race", "urn:oid:2.16.840.1.113883.6.238", "2054-5"),
Expand All @@ -52,77 +47,49 @@ def test_patient_race_vs_ethnicity(self, race_cd, url, system, code):
},
],
},
patient.extension[0].as_json(),
patient["extension"][0],
)

def test_to_fhir_encounter(self):
encounter = i2b2_mock_data.encounter()
# print(json.dumps(encounter.as_json(), indent=4))
# print(json.dumps(encounter, indent=4))

self.assertEqual("67890", encounter.id)
self.assertEqual("Patient/12345", encounter.subject.reference)
self.assertEqual("2016-01-01", encounter.period.start.isostring)
self.assertEqual("2016-01-04", encounter.period.end.isostring)
self.assertEqual(3, encounter.length.value)
self.assertEqual("67890", encounter["id"])
self.assertEqual("Patient/12345", encounter["subject"]["reference"])
self.assertEqual("2016-01-01", encounter["period"]["start"])
self.assertEqual("2016-01-04", encounter["period"]["end"])
self.assertEqual(3, encounter["length"]["value"])

def test_to_fhir_condition(self):
condition = i2b2_mock_data.condition()
# print(json.dumps(condition, indent=4))

# print(json.dumps(condition.as_json(), indent=4))
self.assertEqual("Patient/12345", condition.subject.reference)
self.assertEqual("Encounter/67890", condition.encounter.reference)
self.assertEqual(str("U07.1"), condition.code.coding[0].code)
self.assertEqual(str("http://hl7.org/fhir/sid/icd-10-cm"), condition.code.coding[0].system)
self.assertEqual("Patient/12345", condition["subject"]["reference"])
self.assertEqual("Encounter/67890", condition["encounter"]["reference"])
self.assertEqual("U07.1", condition["code"]["coding"][0]["code"])
self.assertEqual("http://hl7.org/fhir/sid/icd-10-cm", condition["code"]["coding"][0]["system"])

def test_to_fhir_documentreference(self):
docref = i2b2_mock_data.documentreference()
# print(json.dumps(docref, indent=4))

# print(json.dumps(docref.as_json(), indent=4))

self.assertEqual("Patient/12345", docref.subject.reference)
self.assertEqual(1, len(docref.context.encounter))
self.assertEqual("Encounter/67890", docref.context.encounter[0].reference)
self.assertEqual("NOTE:149798455", docref.type.coding[0].code)
self.assertEqual("Emergency note", docref.type.coding[0].display)
self.assertEqual("Patient/12345", docref["subject"]["reference"])
self.assertEqual(1, len(docref["context"]["encounter"]))
self.assertEqual("Encounter/67890", docref["context"]["encounter"][0]["reference"])
self.assertEqual("NOTE:149798455", docref["type"]["coding"][0]["code"])
self.assertEqual("Emergency note", docref["type"]["coding"][0]["display"])

def test_to_fhir_observation_lab(self):
lab_fhir = i2b2_mock_data.observation()
# print(json.dumps(lab_fhir, indent=4))

# print(json.dumps(lab_i2b2.__dict__, indent=4))
# print(json.dumps(lab_fhir.as_json(), indent=4))

self.assertEqual("Patient/12345", lab_fhir.subject.reference)
self.assertEqual("Encounter/67890", lab_fhir.encounter.reference)

self.assertEqual("94500-6", lab_fhir.code.coding[0].code)
self.assertEqual("http://loinc.org", lab_fhir.code.coding[0].system)

self.assertEqual("260385009", lab_fhir.valueCodeableConcept.coding[0].code)
self.assertEqual("Negative", lab_fhir.valueCodeableConcept.coding[0].display)

self.assertEqual(FHIRDate("2021-01-02").date, lab_fhir.effectiveDateTime.date)

def test_parse_fhir_date(self):

timestamp = "2020-01-02 12:00:00.000"
timestamp = timestamp[:10]

self.assertEqual("2020-01-02", FHIRDate(timestamp).isostring)

timestamp = "2020-01-02 12:00:00.000"

self.assertEqual("2020-01-02", transform.parse_fhir_date(timestamp).isostring)

timezone = "2020-01-02T16:00:00+00:00"

self.assertEqual("2020-01-02", transform.parse_fhir_date(timezone).isostring)

datepart = "2020-01-02"
self.assertEqual("Patient/12345", lab_fhir["subject"]["reference"])
self.assertEqual("Encounter/67890", lab_fhir["encounter"]["reference"])

self.assertEqual("2020-01-02", transform.parse_fhir_date(datepart).isostring)
self.assertEqual("94500-6", lab_fhir["code"]["coding"][0]["code"])
self.assertEqual("http://loinc.org", lab_fhir["code"]["coding"][0]["system"])

def test_ref_subject(self):
self.assertEqual({"reference": "Patient/123"}, transform.ref_subject("123").as_json())
self.assertEqual("260385009", lab_fhir["valueCodeableConcept"]["coding"][0]["code"])
self.assertEqual("Negative", lab_fhir["valueCodeableConcept"]["coding"][0]["display"])

def test_ref_encounter(self):
self.assertEqual({"reference": "Encounter/123"}, transform.ref_encounter("123").as_json())
self.assertEqual("2021-01-02", lab_fhir["effectiveDateTime"])
14 changes: 7 additions & 7 deletions tests/test_scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class TestScrubber(unittest.TestCase):

def test_patient(self):
"""Verify a basic patient (saved ids)"""
patient = i2b2_mock_data.patient().as_json()
patient = i2b2_mock_data.patient()
self.assertEqual("12345", patient["id"])

scrubber = Scrubber()
Expand All @@ -26,7 +26,7 @@ def test_patient(self):

def test_encounter(self):
"""Verify a basic encounter (saved ids)"""
encounter = i2b2_mock_data.encounter().as_json()
encounter = i2b2_mock_data.encounter()
self.assertEqual("Patient/12345", encounter["subject"]["reference"])
self.assertEqual("67890", encounter["id"])

Expand All @@ -37,7 +37,7 @@ def test_encounter(self):

def test_condition(self):
"""Verify a basic condition (hashed ids)"""
condition = i2b2_mock_data.condition().as_json()
condition = i2b2_mock_data.condition()
self.assertEqual("4567", condition["id"])
self.assertEqual("Patient/12345", condition["subject"]["reference"])
self.assertEqual("Encounter/67890", condition["encounter"]["reference"])
Expand All @@ -52,7 +52,7 @@ def test_condition(self):

def test_documentreference(self):
"""Test DocumentReference, which is interesting because of its list of encounters and attachments"""
docref = i2b2_mock_data.documentreference().as_json()
docref = i2b2_mock_data.documentreference()
self.assertEqual("345", docref["id"])
self.assertEqual("Patient/12345", docref["subject"]["reference"])
self.assertEqual(1, len(docref["context"]["encounter"]))
Expand All @@ -72,7 +72,7 @@ def test_documentreference(self):

def test_unknown_modifier_extension(self):
"""Confirm we skip resources with unknown modifier extensions"""
patient = i2b2_mock_data.patient().as_json()
patient = i2b2_mock_data.patient()
scrubber = Scrubber()

patient["modifierExtension"] = []
Expand Down Expand Up @@ -104,7 +104,7 @@ def test_load_and_save(self):

# Confirm we loaded that encounter correctly
scrubber = Scrubber(tmpdir)
encounter = i2b2_mock_data.encounter().as_json() # patient is 12345
encounter = i2b2_mock_data.encounter() # patient is 12345
encounter["id"] = "1"
self.assertTrue(scrubber.scrub_resource(encounter))
self.assertEqual(encounter["id"], db.encounter("1"))
Expand All @@ -131,7 +131,7 @@ def test_load_and_save(self):
def test_meta_security_cleared(self):
"""Verify that we drop the Meta.security field"""
scrubber = Scrubber()
condition = i2b2_mock_data.condition().as_json()
condition = i2b2_mock_data.condition()

# With another property
condition["meta"] = {"security": [{"code": "REDACTED"}], "versionId": "a"}
Expand Down
Loading