Merge pull request #221 from TranslatorSRI/generate-curie-breakdown-report

This PR adds per-file reports for the compendia files; these reports are written into the `reports/content/` directory. We also generate an overall content report covering all the compendia files. Since these are JSON files, I've added `jq` to the Babel image for use in debugging.
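For orientation, the sketch below shows the shape of one per-file report as a Python dict literal. The key names follow the `json.dump()` call in `compendia_per_file_reports.py` below, but the compendium name (`Disease`), the paths, and every count are invented for illustration:

# Hypothetical contents of a per-file report (keys sorted, since the report is
# written with sort_keys=True); all names, paths, and numbers are invented.
example_report = {
    "compendium_path": "babel_outputs/compendia/Disease.txt",
    "count_by_biolink_type": {"biolink:Disease": 2},
    "count_by_prefix": {"DOID": 1, "MONDO": 2},
    "count_lines": 2,
    "counters": {
        "clique_count": 2,
        "cliques_by_description_count": {"0": 1, "1": 1},
        "cliques_by_id_count": {"1": 1, "2": 1},
        "cliques_by_label_count": {"1": 1, "2": 1},
        "cliques_by_unique_description_count": {"0": 1, "1": 1},
        "cliques_by_unique_label_count": {"1": 2},
    },
    "name": "Disease",
    "report_path": "babel_outputs/reports/content/compendia/Disease.json",
}

Inside the image, a report like this can be spot-checked with `jq`, e.g. `jq '.count_by_prefix' Disease.json`.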
gaurav authored Jan 25, 2024
2 parents 0735fe0 + e4d63a5 commit 83907b1
Showing 5 changed files with 298 additions and 7 deletions.
1 change: 1 addition & 0 deletions Dockerfile
@@ -28,6 +28,7 @@ RUN apt-get install -y htop
RUN apt-get install -y screen
RUN apt-get install -y vim
RUN apt-get install -y rsync
+RUN apt-get install -y jq

# Copy directory into Docker.
COPY --chown=nru . ${ROOT}
23 changes: 18 additions & 5 deletions Snakefile
@@ -13,9 +13,25 @@ include: "src/snakefiles/taxon.snakefile"
include: "src/snakefiles/genefamily.snakefile"
include: "src/snakefiles/leftover_umls.snakefile"
include: "src/snakefiles/macromolecular_complex.snakefile"
+
+include: "src/snakefiles/reports.snakefile"
include: "src/snakefiles/exports.snakefile"

rule all:
    input:
+        # See rule all_outputs later in this file for how we generate all the outputs.
+        config['output_directory'] + '/reports/outputs_done',
+        # reports_done are generated by the rules in src/snakefiles/
+        config['output_directory'] + '/reports/reports_done',
+
+        # Build all the exports.
+        config['output_directory'] + '/kgx/done',
+    output:
+        x = config['output_directory'] + '/reports/all_done'
+    shell:
+        "echo 'done' >> {output.x}"
+
+rule all_outputs:
+    input:
        config['output_directory'] + '/reports/anatomy_done',
        config['output_directory'] + '/reports/chemicals_done',
@@ -29,21 +45,18 @@
        config['output_directory'] + '/reports/umls_done',
        config['output_directory'] + '/reports/macromolecular_complex_done',
        config['output_directory'] + '/reports/drugchemical_done',
-        # Check if we have exported the compendia as KGX.
-        config['output_directory'] + '/kgx/done',
    output:
-        x = config['output_directory'] + '/reports/all_done'
+        x = config['output_directory'] + '/reports/outputs_done'
    shell:
        "echo 'done' >> {output.x}"
-

rule clean_compendia:
    params:
        dir=config['output_directory']
    shell:
        "rm {params.dir}/compendia/*; rm {params.dir}/synonyms/*"

-rule clean_data:
+rule clean_downloads:
    params:
        dir=config['download_directory']
    shell:
6 changes: 4 additions & 2 deletions config.json
@@ -28,8 +28,6 @@
"protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS"],
"protein_outputs": ["Protein.txt"],

"geneprotein_output": "GeneProtein.txt",

"disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],
"disease_ids": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","OMIM","EFO"],
"disease_concords": ["HP","MONDO","UMLS","DOID","EFO"],
@@ -46,6 +44,7 @@
"chemical_concords": ["wikipedia_mesh_chebi","PUBCHEM_MESH","mesh_cas","mesh_unii","PUBCHEM_CAS","GTOPDB","CHEBI","UMLS","DrugCentral","RXNORM"],
"chemical_ids": ["CHEMBL.COMPOUND","GTOPDB","KEGG.COMPOUND","CHEBI","UNII","HMDB","PUBCHEM.COMPOUND","DrugCentral","DRUGBANK","MESH","UMLS","RXNORM"],
"chemical_outputs": ["MolecularMixture.txt", "SmallMolecule.txt", "Polypeptide.txt", "ComplexMolecularMixture.txt", "ChemicalEntity.txt", "ChemicalMixture.txt", "Drug.txt"],
"drugchemicalconflated_synonym_outputs": ["DrugChemicalConflated.txt"],

"taxon_labels": ["NCBITaxon","MESH","UMLS"],
"taxon_synonyms": ["NCBITaxon","UMLS"],
@@ -66,6 +65,9 @@
"http://www.informatics.jax.org/marker/MGI:": "MGI"
},

"geneprotein_outputs": ["GeneProtein.txt"],
"drugchemical_outputs": ["DrugChemical.txt"],

"preferred_name_boost_prefixes": {
"biolink:ChemicalEntity": [
"DRUGBANK",
170 changes: 170 additions & 0 deletions src/reports/compendia_per_file_reports.py
@@ -0,0 +1,170 @@
"""
compendia_per_file_reports.py - Generate reports for the individual files in the compendia directory.
"""
import itertools
import json
import logging
import os
from collections import defaultdict
from datetime import datetime


def get_datetime_as_string():
"""
Returns the current date and time as a string.
:return: The current date and time as an ISO8601 string.
"""

# Return the current date and time in ISO8601 time with time zone.
return datetime.now().isoformat()


def assert_files_in_directory(dir, files, report_file):
"""
Asserts that the list of files in a given directory are the list of files provided.
:param dir: The directory to check files in.
:param files: The files to compare the list against.
:param report_file: Write a report to this file. We assume that this file is not intended
to be read, but is created so that we can check this assertion has been checked.
"""

logging.info(f"Expect files in directory {dir} to be equal to {files}")
file_list = os.listdir(dir)
assert set(file_list) == set(files)

# If we passed, write the output to the check_file.
with open(report_file, "w") as f:
f.write(f"Confirmed that {dir} contains only the files {files} at {get_datetime_as_string()}\n")


def generate_content_report_for_compendium(compendium_path, report_path):
"""
Generate a report of CURIE prefixes per file.
:param compendium_path: The path of the compendium file to read.
:param report_path: The path to write the CURIE prefixes per file report as a JSON file.
"""

with open(report_path, "w") as report_file:
with open(compendium_path, "r") as compendium_file:
# This is a JSONL file, so we need to read each line as a JSON object.
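            # For illustration, a single (hypothetical) clique line might look like:
            #   {"type": "biolink:Disease",
            #    "identifiers": [
            #      {"i": "MONDO:0005148", "l": "type 2 diabetes mellitus", "d": ["A description..."]},
            #      {"i": "DOID:9352", "l": "type 2 diabetes mellitus"}]}
            # where 'i' is the CURIE, 'l' an optional label, and 'd' an optional list of descriptions.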

            # Track CURIE breakdowns for this compendium.
            count_by_prefix = defaultdict(int)
            count_by_biolink_type = defaultdict(int)
            counters = {
                'clique_count': 0,
                'cliques_by_id_count': defaultdict(int),
                'cliques_by_label_count': defaultdict(int),
                'cliques_by_unique_label_count': defaultdict(int),
                'cliques_by_description_count': defaultdict(int),
                'cliques_by_unique_description_count': defaultdict(int),
            }
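            # Note that the cliques_by_*_count counters are histograms keyed by count:
            # e.g. counters['cliques_by_id_count'][3] == 17 would mean that 17 cliques
            # in this compendium contain exactly three identifiers each.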

            # Since this is time-consuming, let's log a count as we go.
            count_lines = 0

            # Iterate through the compendium file.
            for line in compendium_file:
                count_lines += 1

                # Report updates every 10 million lines.
                if count_lines % 10000000 == 0:
                    logging.info(f"Processed {count_lines:,} lines in {compendium_path}")

                # Parse each line as a JSON object.
                clique = json.loads(line)

                # Track the CURIEs we're looking for.
                identifiers = clique.get('identifiers', [])
                ids = list(map(lambda x: x['i'], identifiers))

                # Update counts by Biolink type.
                count_by_biolink_type[clique.get('type', '')] += 1

                # Update counts by prefix.
                for curie in ids:
                    prefix = curie.split(':')[0]
                    count_by_prefix[prefix] += 1

                # Update the clique-level counters.
                counters['clique_count'] += 1
                counters['cliques_by_id_count'][len(ids)] += 1
                labels = list(filter(lambda x: x.strip() != '', map(lambda x: x.get('l', ''), identifiers)))
                counters['cliques_by_label_count'][len(labels)] += 1
                unique_labels = set(labels)
                counters['cliques_by_unique_label_count'][len(unique_labels)] += 1

                # Since descriptions are currently lists, we have to first flatten the list with
                # itertools.chain.from_iterable() before we can count them.
                descriptions = list(filter(lambda x: x.strip() != '', itertools.chain.from_iterable(map(lambda x: x.get('d', []), identifiers))))
                counters['cliques_by_description_count'][len(descriptions)] += 1
                unique_descriptions = set(descriptions)
                counters['cliques_by_unique_description_count'][len(unique_descriptions)] += 1

        json.dump({
            'name': os.path.splitext(os.path.basename(compendium_path))[0],
            'compendium_path': compendium_path,
            'report_path': report_path,
            'count_lines': count_lines,
            'count_by_biolink_type': count_by_biolink_type,
            'count_by_prefix': count_by_prefix,
            'counters': counters,
        }, report_file, sort_keys=True, indent=2)


def summarize_content_report_for_compendia(compendia_report_paths, summary_path):
"""
Summarize all the content reports generated by generate_content_report_for_compendium().
:param compendia_report_paths: A list of file paths for the compendia reports generated by generate_content_report_for_compendium()
:param summary_path: The path to write the summary report.
"""

# Start writing the summary file.
with open(summary_path, "w") as summaryfile:
# Summarized information from the reports.
biolink_types = defaultdict(dict)
prefixes = defaultdict(dict)
counters = {}
count_lines = 0

# Read all the summary reports -- these are small, so we can just read them all in.
for report_path in compendia_report_paths:
with open(report_path, "r") as report_file:
report = json.load(report_file)

# name = report['name']
count_lines += report['count_lines']

# Add up Biolink type information.
for biolink_type, count in report['count_by_biolink_type'].items():
biolink_types[biolink_type] = biolink_types.get(biolink_type, 0) + count

# Add up prefix information.
for prefix, count in report['count_by_prefix'].items():
prefixes[prefix] = prefixes.get(prefix, 0) + count

# Every counter is either an int or a dict. If a dict, we need to add up
# all the counters.
for counter, value in report['counters'].items():
if type(value) is int:
counters[counter] = counters.get(counter, 0) + value
elif type(value) is dict:
if counter not in counters:
counters[counter] = defaultdict(int)
for key, count in value.items():
counters[counter][key] = counters[counter].get(key, 0) + count
else:
raise ValueError(f"Counter {counter} has unexpected value in {value}.")

# Write the summary report.
json.dump({
'report_path': summary_path,
'biolink_types': biolink_types,
'prefixes': prefixes,
'counters': counters,
'count_lines': count_lines,
}, summaryfile, sort_keys=True, indent=2)
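Taken together, a minimal driver for these two functions might look like the sketch below; the compendium names and paths are hypothetical, and the real wiring lives in `reports.snakefile`, shown next:

from src.reports.compendia_per_file_reports import (
    generate_content_report_for_compendium,
    summarize_content_report_for_compendia,
)

# Hypothetical paths -- in practice these are derived from config['output_directory'].
report_paths = []
for name in ["AnatomicalEntity", "Disease"]:
    report_path = f"reports/content/compendia/{name}.json"
    generate_content_report_for_compendium(f"compendia/{name}.txt", report_path)
    report_paths.append(report_path)

summarize_content_report_for_compendia(report_paths, "reports/content/compendia_report.json")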
105 changes: 105 additions & 0 deletions src/snakefiles/reports.snakefile
@@ -0,0 +1,105 @@
import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
    generate_content_report_for_compendium, summarize_content_report_for_compendia

# Some paths we will use multiple times in these reports.
compendia_path = config['output_directory'] + '/compendia'
synonyms_path = config['output_directory'] + '/synonyms'
conflations_path = config['output_directory'] + '/conflation'

# Expected compendia files.
compendia_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected synonym files.
synonyms_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['drugchemicalconflated_synonym_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected conflation files.
conflation_files = config['geneprotein_outputs'] + config['drugchemical_outputs']

# Make sure we have all the expected Compendia files.
rule check_compendia_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory']+'/reports/check_compendia_files.done'
    run:
        assert_files_in_directory(compendia_path,
                                  compendia_files,
                                  output.donefile)

# Make sure we have all the expected Synonyms files.
rule check_synonyms_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory']+'/reports/check_synonyms_files.done'
    run:
        assert_files_in_directory(synonyms_path,
                                  synonyms_files,
                                  output.donefile)

# Make sure we have all the expected Conflation files.
rule check_conflation_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory']+'/reports/check_conflation_files.done'
    run:
        assert_files_in_directory(conflations_path,
                                  conflation_files,
                                  output.donefile)

# Generate a report of CURIE prefixes by file.
expected_content_reports = []
for compendium_filename in compendia_files:
    # Remove the extension from compendium_filename using os.path.
    compendium_basename = os.path.splitext(compendium_filename)[0]
    report_filename = f"{config['output_directory']}/reports/content/compendia/{compendium_basename}.json"

    expected_content_reports.append(report_filename)

    rule:
        name: f"generate_content_report_for_compendium_{compendium_basename}"
        input:
            compendium_file = f"{config['output_directory']}/compendia/{compendium_filename}",
        output:
            report_file = report_filename
        run:
            generate_content_report_for_compendium(input.compendium_file, output.report_file)


rule generate_summary_content_report_for_compendia:
    input:
        expected_content_reports = expected_content_reports,
    output:
        report_path = config['output_directory']+'/reports/content/compendia_report.json',
    run:
        summarize_content_report_for_compendia(input.expected_content_reports, output.report_path)


# Check that all the reports were built correctly.
rule all_reports:
    input:
        config['output_directory']+'/reports/content/compendia_report.json',
        config['output_directory']+'/reports/check_compendia_files.done',
        config['output_directory']+'/reports/check_synonyms_files.done',
        config['output_directory']+'/reports/check_conflation_files.done',
    output:
        x = config['output_directory']+'/reports/reports_done',
    shell:
        "echo 'done' >> {output.x}"
