Merge pull request #221 from TranslatorSRI/generate-curie-breakdown-report

This PR adds per-file reports for the compendia files; these reports are written into the `reports/content/` directory. We also generate an overall content report covering all the compendia files. Since these are JSON files, I've added `jq` to the Babel image for use in debugging.
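For orientation, the sketch below shows the shape of one per-file report as a Python dict literal. The key names follow the `json.dump()` call in `compendia_per_file_reports.py` below, but the compendium name (`Disease`), the paths, and every count are invented for illustration:

# Hypothetical contents of a per-file report (keys sorted, since the report is
# written with sort_keys=True); all names, paths, and numbers are invented.
example_report = {
    "compendium_path": "babel_outputs/compendia/Disease.txt",
    "count_by_biolink_type": {"biolink:Disease": 2},
    "count_by_prefix": {"DOID": 1, "MONDO": 2},
    "count_lines": 2,
    "counters": {
        "clique_count": 2,
        "cliques_by_description_count": {"0": 1, "1": 1},
        "cliques_by_id_count": {"1": 1, "2": 1},
        "cliques_by_label_count": {"1": 1, "2": 1},
        "cliques_by_unique_description_count": {"0": 1, "1": 1},
        "cliques_by_unique_label_count": {"1": 2},
    },
    "name": "Disease",
    "report_path": "babel_outputs/reports/content/compendia/Disease.json",
}

Inside the image, a report like this can be spot-checked with `jq`, e.g. `jq '.count_by_prefix' Disease.json`.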
gaurav authored Jan 25, 2024
2 parents 0735fe0 + e4d63a5 commit 83907b1
Showing 5 changed files with 298 additions and 7 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -28,6 +28,7 @@ RUN apt-get install -y htop
RUN apt-get install -y screen
RUN apt-get install -y vim
RUN apt-get install -y rsync
+RUN apt-get install -y jq

# Copy directory into Docker.
COPY --chown=nru . ${ROOT}
23 changes: 18 additions & 5 deletions Snakefile
@@ -13,9 +13,25 @@ include: "src/snakefiles/taxon.snakefile"
include: "src/snakefiles/genefamily.snakefile"
include: "src/snakefiles/leftover_umls.snakefile"
include: "src/snakefiles/macromolecular_complex.snakefile"
+
+include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"

rule all:
    input:
+        # See rule all_outputs later in this file for how we generate all the outputs.
+        config['output_directory'] + '/reports/outputs_done',
+        # reports_done are generated by the rules in src/snakefiles/
+        config['output_directory'] + '/reports/reports_done',
+
+        # Build all the exports.
+        config['output_directory'] + '/kgx/done',
+    output:
+        x = config['output_directory'] + '/reports/all_done'
+    shell:
+        "echo 'done' >> {output.x}"
+
+rule all_outputs:
+    input:
        config['output_directory'] + '/reports/anatomy_done',
        config['output_directory'] + '/reports/chemicals_done',
@@ -29,21 +45,18 @@
        config['output_directory'] + '/reports/umls_done',
        config['output_directory'] + '/reports/macromolecular_complex_done',
        config['output_directory'] + '/reports/drugchemical_done',
-        # Check if we have exported the compendia as KGX.
-        config['output_directory'] + '/kgx/done',
    output:
-        x = config['output_directory'] + '/reports/all_done'
+        x = config['output_directory'] + '/reports/outputs_done'
    shell:
        "echo 'done' >> {output.x}"
-

rule clean_compendia:
    params:
        dir=config['output_directory']
    shell:
        "rm {params.dir}/compendia/*; rm {params.dir}/synonyms/*"

-rule clean_data:
+rule clean_downloads:
    params:
        dir=config['download_directory']
    shell:
6 changes: 4 additions & 2 deletions config.json
@@ -28,8 +28,6 @@
"protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS"],
"protein_outputs": ["Protein.txt"],

"geneprotein_output": "GeneProtein.txt",

"disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],
"disease_ids": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","OMIM","EFO"],
"disease_concords": ["HP","MONDO","UMLS","DOID","EFO"],
@@ -46,6 +44,7 @@
"chemical_concords": ["wikipedia_mesh_chebi","PUBCHEM_MESH","mesh_cas","mesh_unii","PUBCHEM_CAS","GTOPDB","CHEBI","UMLS","DrugCentral","RXNORM"],
"chemical_ids": ["CHEMBL.COMPOUND","GTOPDB","KEGG.COMPOUND","CHEBI","UNII","HMDB","PUBCHEM.COMPOUND","DrugCentral","DRUGBANK","MESH","UMLS","RXNORM"],
"chemical_outputs": ["MolecularMixture.txt", "SmallMolecule.txt", "Polypeptide.txt", "ComplexMolecularMixture.txt", "ChemicalEntity.txt", "ChemicalMixture.txt", "Drug.txt"],
"drugchemicalconflated_synonym_outputs": ["DrugChemicalConflated.txt"],

"taxon_labels": ["NCBITaxon","MESH","UMLS"],
"taxon_synonyms": ["NCBITaxon","UMLS"],
@@ -66,6 +65,9 @@
"http://www.informatics.jax.org/marker/MGI:": "MGI"
},

"geneprotein_outputs": ["GeneProtein.txt"],
"drugchemical_outputs": ["DrugChemical.txt"],

"preferred_name_boost_prefixes": {
"biolink:ChemicalEntity": [
"DRUGBANK",
170 changes: 170 additions & 0 deletions src/reports/compendia_per_file_reports.py
@@ -0,0 +1,170 @@
"""
compendia_per_file_reports.py - Generate reports for the individual files in the compendia directory.
"""
import itertools
import json
import logging
import os
from collections import defaultdict
from datetime import datetime


def get_datetime_as_string():
"""
Returns the current date and time as a string.
:return: The current date and time as an ISO8601 string.
"""

# Return the current date and time in ISO8601 time with time zone.
return datetime.now().isoformat()


def assert_files_in_directory(dir, files, report_file):
"""
Asserts that the list of files in a given directory are the list of files provided.
:param dir: The directory to check files in.
:param files: The files to compare the list against.
:param report_file: Write a report to this file. We assume that this file is not intended
to be read, but is created so that we can check this assertion has been checked.
"""

logging.info(f"Expect files in directory {dir} to be equal to {files}")
file_list = os.listdir(dir)
assert set(file_list) == set(files)

# If we passed, write the output to the check_file.
with open(report_file, "w") as f:
f.write(f"Confirmed that {dir} contains only the files {files} at {get_datetime_as_string()}\n")


def generate_content_report_for_compendium(compendium_path, report_path):
"""
Generate a report of CURIE prefixes per file.
:param compendium_path: The path of the compendium file to read.
:param report_path: The path to write the CURIE prefixes per file report as a JSON file.
"""

with open(report_path, "w") as report_file:
with open(compendium_path, "r") as compendium_file:
# This is a JSONL file, so we need to read each line as a JSON object.
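            # For illustration, a single (hypothetical) clique line might look like:
            #   {"type": "biolink:Disease",
            #    "identifiers": [
            #      {"i": "MONDO:0005148", "l": "type 2 diabetes mellitus", "d": ["A description..."]},
            #      {"i": "DOID:9352", "l": "type 2 diabetes mellitus"}]}
            # where 'i' is the CURIE, 'l' an optional label, and 'd' an optional list of descriptions.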

            # Track CURIE breakdowns for this compendium.
            count_by_prefix = defaultdict(int)
            count_by_biolink_type = defaultdict(int)
            counters = {
                'clique_count': 0,
                'cliques_by_id_count': defaultdict(int),
                'cliques_by_label_count': defaultdict(int),
                'cliques_by_unique_label_count': defaultdict(int),
                'cliques_by_description_count': defaultdict(int),
                'cliques_by_unique_description_count': defaultdict(int),
            }
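            # Note that the cliques_by_*_count counters are histograms keyed by count:
            # e.g. counters['cliques_by_id_count'][3] == 17 would mean that 17 cliques
            # in this compendium contain exactly three identifiers each.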

            # Since this is time-consuming, let's log a count as we go.
            count_lines = 0

            # Iterate through the compendium file.
            for line in compendium_file:
                count_lines += 1

                # Report updates every 10 million lines.
                if count_lines % 10000000 == 0:
                    logging.info(f"Processed {count_lines:,} lines in {compendium_path}")

                # Parse each line as a JSON object.
                clique = json.loads(line)

                # Track the CURIEs we're looking for.
                identifiers = clique.get('identifiers', [])
                ids = list(map(lambda x: x['i'], identifiers))

                # Update counts by Biolink type.
                count_by_biolink_type[clique.get('type', '')] += 1

                # Update counts by prefix.
                for curie in ids:
                    prefix = curie.split(':')[0]
                    count_by_prefix[prefix] += 1

                # Update the clique-level counters.
                counters['clique_count'] += 1
                counters['cliques_by_id_count'][len(ids)] += 1
                labels = list(filter(lambda x: x.strip() != '', map(lambda x: x.get('l', ''), identifiers)))
                counters['cliques_by_label_count'][len(labels)] += 1
                unique_labels = set(labels)
                counters['cliques_by_unique_label_count'][len(unique_labels)] += 1

                # Since descriptions are currently lists, we have to first flatten the list with
                # itertools.chain.from_iterable() before we can count them.
                descriptions = list(filter(lambda x: x.strip() != '', itertools.chain.from_iterable(map(lambda x: x.get('d', []), identifiers))))
                counters['cliques_by_description_count'][len(descriptions)] += 1
                unique_descriptions = set(descriptions)
                counters['cliques_by_unique_description_count'][len(unique_descriptions)] += 1

        json.dump({
            'name': os.path.splitext(os.path.basename(compendium_path))[0],
            'compendium_path': compendium_path,
            'report_path': report_path,
            'count_lines': count_lines,
            'count_by_biolink_type': count_by_biolink_type,
            'count_by_prefix': count_by_prefix,
            'counters': counters,
        }, report_file, sort_keys=True, indent=2)


def summarize_content_report_for_compendia(compendia_report_paths, summary_path):
"""
Summarize all the content reports generated by generate_content_report_for_compendium().
:param compendia_report_paths: A list of file paths for the compendia reports generated by generate_content_report_for_compendium()
:param summary_path: The path to write the summary report.
"""

# Start writing the summary file.
with open(summary_path, "w") as summaryfile:
# Summarized information from the reports.
biolink_types = defaultdict(dict)
prefixes = defaultdict(dict)
counters = {}
count_lines = 0

# Read all the summary reports -- these are small, so we can just read them all in.
for report_path in compendia_report_paths:
with open(report_path, "r") as report_file:
report = json.load(report_file)

# name = report['name']
count_lines += report['count_lines']

# Add up Biolink type information.
for biolink_type, count in report['count_by_biolink_type'].items():
biolink_types[biolink_type] = biolink_types.get(biolink_type, 0) + count

# Add up prefix information.
for prefix, count in report['count_by_prefix'].items():
prefixes[prefix] = prefixes.get(prefix, 0) + count

# Every counter is either an int or a dict. If a dict, we need to add up
# all the counters.
for counter, value in report['counters'].items():
if type(value) is int:
counters[counter] = counters.get(counter, 0) + value
elif type(value) is dict:
if counter not in counters:
counters[counter] = defaultdict(int)
for key, count in value.items():
counters[counter][key] = counters[counter].get(key, 0) + count
else:
raise ValueError(f"Counter {counter} has unexpected value in {value}.")

# Write the summary report.
json.dump({
'report_path': summary_path,
'biolink_types': biolink_types,
'prefixes': prefixes,
'counters': counters,
'count_lines': count_lines,
}, summaryfile, sort_keys=True, indent=2)
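Taken together, a minimal driver for these two functions might look like the sketch below; the compendium names and paths are hypothetical, and the real wiring lives in `reports.snakefile`, shown next:

from src.reports.compendia_per_file_reports import (
    generate_content_report_for_compendium,
    summarize_content_report_for_compendia,
)

# Hypothetical paths -- in practice these are derived from config['output_directory'].
report_paths = []
for name in ["AnatomicalEntity", "Disease"]:
    report_path = f"reports/content/compendia/{name}.json"
    generate_content_report_for_compendium(f"compendia/{name}.txt", report_path)
    report_paths.append(report_path)

summarize_content_report_for_compendia(report_paths, "reports/content/compendia_report.json")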
105 changes: 105 additions & 0 deletions src/snakefiles/reports.snakefile
@@ -0,0 +1,105 @@
import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
    generate_content_report_for_compendium, summarize_content_report_for_compendia

# Some paths we will use multiple times in these reports.
compendia_path = config['output_directory'] + '/compendia'
synonyms_path = config['output_directory'] + '/synonyms'
conflations_path = config['output_directory'] + '/conflation'

# Expected compendia files.
compendia_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected synonym files.
synonyms_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['drugchemicalconflated_synonym_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected conflation files.
conflation_files = config['geneprotein_outputs'] + config['drugchemical_outputs']

# Make sure we have all the expected Compendia files.
rule check_compendia_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory']+'/reports/check_compendia_files.done'
    run:
        assert_files_in_directory(compendia_path,
                                  compendia_files,
                                  output.donefile)

# Make sure we have all the expected Synonyms files.
rule check_synonyms_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory']+'/reports/check_synonyms_files.done'
    run:
        assert_files_in_directory(synonyms_path,
                                  synonyms_files,
                                  output.donefile)

# Make sure we have all the expected Conflation files.
rule check_conflation_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory']+'/reports/check_conflation_files.done'
    run:
        assert_files_in_directory(conflations_path,
                                  conflation_files,
                                  output.donefile)

# Generate a report of CURIE prefixes by file.
expected_content_reports = []
for compendium_filename in compendia_files:
    # Remove the extension from compendium_filename using os.path.
    compendium_basename = os.path.splitext(compendium_filename)[0]
    report_filename = f"{config['output_directory']}/reports/content/compendia/{compendium_basename}.json"

    expected_content_reports.append(report_filename)

    rule:
        name: f"generate_content_report_for_compendium_{compendium_basename}"
        input:
            compendium_file = f"{config['output_directory']}/compendia/{compendium_filename}",
        output:
            report_file = report_filename
        run:
            generate_content_report_for_compendium(input.compendium_file, output.report_file)


rule generate_summary_content_report_for_compendia:
    input:
        expected_content_reports = expected_content_reports,
    output:
        report_path = config['output_directory']+'/reports/content/compendia_report.json',
    run:
        summarize_content_report_for_compendia(input.expected_content_reports, output.report_path)


# Check that all the reports were built correctly.
rule all_reports:
    input:
        config['output_directory']+'/reports/content/compendia_report.json',
        config['output_directory']+'/reports/check_compendia_files.done',
        config['output_directory']+'/reports/check_synonyms_files.done',
        config['output_directory']+'/reports/check_conflation_files.done',
    output:
        x = config['output_directory']+'/reports/reports_done',
    shell:
        "echo 'done' >> {output.x}"
