
Generate per-file reports for compendia files #221

Merged: 25 commits, Jan 25, 2024
Commits
3e72ee0
Added `jq` to prereqs to help with debugging.
gaurav Jan 13, 2024
36a8af5
Added an `all_reports` file that `all` is dependent on.
gaurav Jan 13, 2024
d158c43
Added reports to check output directory file list.
gaurav Jan 13, 2024
4fcdb68
Fixed Snakefile includes and targets.
gaurav Jan 13, 2024
c5d1d5f
Fixed output_dir to output_directory.
gaurav Jan 13, 2024
6d3e12f
Updated synonyms expected file list.
gaurav Jan 13, 2024
1c8e6f9
Updated all_reports with the new checks.
gaurav Jan 13, 2024
3442f1b
First stab at generate_curie_prefixes_per_file_report().
gaurav Jan 13, 2024
026d6b9
Added some funky code to calculate all the reports.
gaurav Jan 13, 2024
7cf17c2
Fixed imports.
gaurav Jan 13, 2024
f682ad0
Reorganized counter code.
gaurav Jan 13, 2024
b01070a
Fix: str.trim() -> str.strip().
gaurav Jan 13, 2024
4037db1
Small improvements to the report generator.
gaurav Jan 13, 2024
3c6bea1
Added an overall summary report for compendia.
gaurav Jan 14, 2024
63b5bcc
Update path.
gaurav Jan 14, 2024
82a80dc
Fix bug, double-check step.
gaurav Jan 14, 2024
9385ac9
Removed incorrect target.
gaurav Jan 14, 2024
12f9dc1
Fixed bug.
gaurav Jan 14, 2024
203cd3a
Fixed incorrect variable.
gaurav Jan 14, 2024
e500356
Counts should always be three digit numbers.
gaurav Jan 14, 2024
854e841
Improved logging.
gaurav Jan 14, 2024
1eaebfc
Added commas to output.
gaurav Jan 14, 2024
3083a5c
First step in improving counters.
gaurav Jan 20, 2024
3bf8470
Fixed up the summarization code as well.
gaurav Jan 20, 2024
e4d63a5
Fixed bug in counter combination.
gaurav Jan 20, 2024
1 change: 1 addition & 0 deletions Dockerfile
@@ -28,6 +28,7 @@ RUN apt-get install -y htop
 RUN apt-get install -y screen
 RUN apt-get install -y vim
 RUN apt-get install -y rsync
+RUN apt-get install -y jq

 # Copy directory into Docker.
 COPY --chown=nru . ${ROOT}
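
The `jq` addition supports the debugging workflow named in the first commit: each compendium is a JSONL file, so an individual clique can be pretty-printed straight from the shell. A rough Python equivalent of that spot-check, using a hypothetical output path, is:

import json

# Sketch of the spot-check jq enables: pretty-print the first clique of a
# compendium JSONL file. The path below is hypothetical.
with open("babel_outputs/compendia/Disease.txt") as compendium:
    clique = json.loads(compendium.readline())

print(json.dumps(clique, indent=2, sort_keys=True))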
23 changes: 18 additions & 5 deletions Snakefile
@@ -13,9 +13,25 @@ include: "src/snakefiles/taxon.snakefile"
 include: "src/snakefiles/genefamily.snakefile"
 include: "src/snakefiles/leftover_umls.snakefile"
 include: "src/snakefiles/macromolecular_complex.snakefile"
+
+include: "src/snakefiles/reports.snakefile"
 include: "src/snakefiles/exports.snakefile"

 rule all:
     input:
+        # See rule all_outputs later in this file for how we generate all the outputs.
+        config['output_directory'] + '/reports/outputs_done',
+        # reports_done are generated by the rules in src/snakefiles/
+        config['output_directory'] + '/reports/reports_done',
+
+        # Build all the exports.
+        config['output_directory'] + '/kgx/done',
+    output:
+        x = config['output_directory'] + '/reports/all_done'
+    shell:
+        "echo 'done' >> {output.x}"
+
+rule all_outputs:
+    input:
         config['output_directory'] + '/reports/anatomy_done',
         config['output_directory'] + '/reports/chemicals_done',
@@ -29,21 +45,18 @@ rule all:
         config['output_directory'] + '/reports/umls_done',
         config['output_directory'] + '/reports/macromolecular_complex_done',
         config['output_directory'] + '/reports/drugchemical_done',
-        # Check if we have exported the compendia as KGX.
-        config['output_directory'] + '/kgx/done',
     output:
-        x = config['output_directory'] + '/reports/all_done'
+        x = config['output_directory'] + '/reports/outputs_done'
     shell:
         "echo 'done' >> {output.x}"

-
 rule clean_compendia:
     params:
         dir=config['output_directory']
     shell:
         "rm {params.dir}/compendia/*; rm {params.dir}/synonyms/*"

-rule clean_data:
+rule clean_downloads:
     params:
         dir=config['download_directory']
     shell:
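
The restructured `rule all` relies on sentinel files: each aggregate rule appends to a `*_done` file instead of tracking its real outputs, so downstream rules can depend on a single path. Reduced to its essentials, with hypothetical file names, the pattern is:

rule stage_done:
    input:
        # The real products of this stage.
        "outputs/real_output_1.txt",
        "outputs/real_output_2.txt",
    output:
        # A sentinel file other rules can depend on.
        x = "reports/stage_done"
    shell:
        "echo 'done' >> {output.x}"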
6 changes: 4 additions & 2 deletions config.json
@@ -28,8 +28,6 @@
   "protein_concords": ["UniProtKB","PR","NCIT_UniProtKB","NCIT_UMLS"],
   "protein_outputs": ["Protein.txt"],

-  "geneprotein_output": "GeneProtein.txt",
-
   "disease_labelsandsynonyms": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","SNOMEDCT","EFO"],
   "disease_ids": ["MONDO","DOID","Orphanet","HP","MESH","NCIT","UMLS","OMIM","EFO"],
   "disease_concords": ["HP","MONDO","UMLS","DOID","EFO"],
@@ -46,6 +44,7 @@
   "chemical_concords": ["wikipedia_mesh_chebi","PUBCHEM_MESH","mesh_cas","mesh_unii","PUBCHEM_CAS","GTOPDB","CHEBI","UMLS","DrugCentral","RXNORM"],
   "chemical_ids": ["CHEMBL.COMPOUND","GTOPDB","KEGG.COMPOUND","CHEBI","UNII","HMDB","PUBCHEM.COMPOUND","DrugCentral","DRUGBANK","MESH","UMLS","RXNORM"],
   "chemical_outputs": ["MolecularMixture.txt", "SmallMolecule.txt", "Polypeptide.txt", "ComplexMolecularMixture.txt", "ChemicalEntity.txt", "ChemicalMixture.txt", "Drug.txt"],
+  "drugchemicalconflated_synonym_outputs": ["DrugChemicalConflated.txt"],

   "taxon_labels": ["NCBITaxon","MESH","UMLS"],
   "taxon_synonyms": ["NCBITaxon","UMLS"],
@@ -60,6 +59,9 @@
   "umls_outputs": ["umls.txt"],
   "macromolecularcomplex_outputs": ["MacromolecularComplex.txt"],

+  "geneprotein_outputs": ["GeneProtein.txt"],
+  "drugchemical_outputs": ["DrugChemical.txt"],
+
   "preferred_name_boost_prefixes": {
     "biolink:ChemicalEntity": [
       "DRUGBANK",
170 changes: 170 additions & 0 deletions src/reports/compendia_per_file_reports.py
@@ -0,0 +1,170 @@
"""
compendia_per_file_reports.py - Generate reports for the individual files in the compendia directory.
"""
import itertools
import json
import logging
import os
from collections import defaultdict
from datetime import datetime


def get_datetime_as_string():
"""
Returns the current date and time as a string.

:return: The current date and time as an ISO8601 string.
"""

# Return the current date and time in ISO8601 time with time zone.
return datetime.now().isoformat()


def assert_files_in_directory(dir, files, report_file):
"""
Asserts that the list of files in a given directory are the list of files provided.

:param dir: The directory to check files in.
:param files: The files to compare the list against.
:param report_file: Write a report to this file. We assume that this file is not intended
to be read, but is created so that we can check this assertion has been checked.
"""

logging.info(f"Expect files in directory {dir} to be equal to {files}")
file_list = os.listdir(dir)
assert set(file_list) == set(files)

# If we passed, write the output to the check_file.
with open(report_file, "w") as f:
f.write(f"Confirmed that {dir} contains only the files {files} at {get_datetime_as_string()}\n")


def generate_content_report_for_compendium(compendium_path, report_path):
"""
Generate a report of CURIE prefixes per file.

:param compendium_path: The path of the compendium file to read.
:param report_path: The path to write the CURIE prefixes per file report as a JSON file.
"""

with open(report_path, "w") as report_file:
with open(compendium_path, "r") as compendium_file:
# This is a JSONL file, so we need to read each line as a JSON object.

# Track CURIE breakdowns for this compendium.
count_by_prefix = defaultdict(int)
count_by_biolink_type = defaultdict(int)
counters = {
'clique_count': 0,
'cliques_by_id_count': defaultdict(int),
'cliques_by_label_count': defaultdict(int),
'cliques_by_unique_label_count': defaultdict(int),
'cliques_by_description_count': defaultdict(int),
'cliques_by_unique_description_count': defaultdict(int),
}

# Since this is time-consuming, let's log a count as we go.
count_lines = 0

# Iterate through the compendium file.
for line in compendium_file:
count_lines += 1

# Report updates every 10 million lines.
if count_lines % 10000000 == 0:
logging.info(f"Processed {count_lines:,} lines in {compendium_path}")

# Parse each line as a JSON object.
clique = json.loads(line)

# Track the CURIEs we're looking for.
identifiers = clique.get('identifiers', [])
ids = list(map(lambda x: x['i'], identifiers))

# Update counts by Biolink type.
count_by_biolink_type[clique.get('type', '')] += 1

# Update counts by prefix.
for curie in ids:
prefix = curie.split(':')[0]
count_by_prefix[prefix] += 1

# Update counts by flags.
counters['clique_count'] += 1
counters['cliques_by_id_count'][len(ids)] += 1
labels = list(filter(lambda x: x.strip() != '', map(lambda x: x.get('l', ''), identifiers)))
counters['cliques_by_label_count'][len(labels)] += 1
unique_labels = set(labels)
counters['cliques_by_unique_label_count'][len(unique_labels)] += 1

# Since descriptions are currently lists, we have to first flatten the list with
# itertools.chain.from_iterable() before we can count them.
descriptions = list(filter(lambda x: x.strip() != '', itertools.chain.from_iterable(map(lambda x: x.get('d', ''), identifiers))))
counters['cliques_by_description_count'][len(descriptions)] += 1
unique_descriptions = set(descriptions)
counters['cliques_by_unique_description_count'][len(unique_descriptions)] += 1

json.dump({
'name': os.path.splitext(os.path.basename(compendium_path))[0],
'compendium_path': compendium_path,
'report_path': report_path,
'count_lines': count_lines,
'count_by_biolink_type': count_by_biolink_type,
'count_by_prefix': count_by_prefix,
'counters': counters,
}, report_file, sort_keys=True, indent=2)


def summarize_content_report_for_compendia(compendia_report_paths, summary_path):
"""
Summarize all the content reports generated by generate_content_report_for_compendium().

:param compendia_report_paths: A list of file paths for the compendia reports generated by generate_content_report_for_compendium()
:param summary_path: The path to write the summary report.
"""

# Start writing the summary file.
with open(summary_path, "w") as summaryfile:
# Summarized information from the reports.
biolink_types = defaultdict(dict)
prefixes = defaultdict(dict)
counters = {}
count_lines = 0

# Read all the summary reports -- these are small, so we can just read them all in.
for report_path in compendia_report_paths:
with open(report_path, "r") as report_file:
report = json.load(report_file)

# name = report['name']
count_lines += report['count_lines']

# Add up Biolink type information.
for biolink_type, count in report['count_by_biolink_type'].items():
biolink_types[biolink_type] = biolink_types.get(biolink_type, 0) + count

# Add up prefix information.
for prefix, count in report['count_by_prefix'].items():
prefixes[prefix] = prefixes.get(prefix, 0) + count

# Every counter is either an int or a dict. If a dict, we need to add up
# all the counters.
for counter, value in report['counters'].items():
if type(value) is int:
counters[counter] = counters.get(counter, 0) + value
elif type(value) is dict:
if counter not in counters:
counters[counter] = defaultdict(int)
for key, count in value.items():
counters[counter][key] = counters[counter].get(key, 0) + count
else:
raise ValueError(f"Counter {counter} has unexpected value in {value}.")

# Write the summary report.
json.dump({
'report_path': summary_path,
'biolink_types': biolink_types,
'prefixes': prefixes,
'counters': counters,
'count_lines': count_lines,
}, summaryfile, sort_keys=True, indent=2)
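
Taken together, the two functions form a small map/reduce pair: generate_content_report_for_compendium() emits one JSON report per compendium, and summarize_content_report_for_compendia() folds those reports into a single summary. A minimal sketch of the flow outside Snakemake, with hypothetical paths (in the pipeline the real paths are assembled from config.json by reports.snakefile):

import os

from src.reports.compendia_per_file_reports import (
    generate_content_report_for_compendium,
    summarize_content_report_for_compendia,
)

# Hypothetical locations; the pipeline derives these from config.json.
compendia = [
    "babel_outputs/compendia/Disease.txt",
    "babel_outputs/compendia/Protein.txt",
]
report_dir = "babel_outputs/reports/content/compendia"
os.makedirs(report_dir, exist_ok=True)

# Stage 1: one content report (JSON) per compendium file.
report_paths = []
for compendium_path in compendia:
    basename = os.path.splitext(os.path.basename(compendium_path))[0]
    report_path = os.path.join(report_dir, f"{basename}.json")
    generate_content_report_for_compendium(compendium_path, report_path)
    report_paths.append(report_path)

# Stage 2: roll the per-file reports up into a single summary report.
summarize_content_report_for_compendia(
    report_paths, "babel_outputs/reports/content/compendia_report.json")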
105 changes: 105 additions & 0 deletions src/snakefiles/reports.snakefile
@@ -0,0 +1,105 @@
import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
    generate_content_report_for_compendium, summarize_content_report_for_compendia

# Some paths we will use multiple times in these reports.
compendia_path = config['output_directory'] + '/compendia'
synonyms_path = config['output_directory'] + '/synonyms'
conflations_path = config['output_directory'] + '/conflation'

# Expected compendia files.
compendia_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected synonym files.
synonyms_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['drugchemicalconflated_synonym_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected conflation files.
conflation_files = config['geneprotein_outputs'] + config['drugchemical_outputs']

# Make sure we have all the expected Compendia files.
rule check_compendia_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory'] + '/reports/check_compendia_files.done'
    run:
        assert_files_in_directory(compendia_path,
                                  compendia_files,
                                  output.donefile)

# Make sure we have all the expected Synonyms files.
rule check_synonyms_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory'] + '/reports/check_synonyms_files.done'
    run:
        assert_files_in_directory(synonyms_path,
                                  synonyms_files,
                                  output.donefile)

# Make sure we have all the expected Conflation files.
rule check_conflation_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory'] + '/reports/check_conflation_files.done'
    run:
        assert_files_in_directory(conflations_path,
                                  conflation_files,
                                  output.donefile)

# Generate a report of CURIE prefixes by file.
expected_content_reports = []
for compendium_filename in compendia_files:
    # Remove the extension from compendium_filename using os.path.
    compendium_basename = os.path.splitext(compendium_filename)[0]
    report_filename = f"{config['output_directory']}/reports/content/compendia/{compendium_basename}.json"

    expected_content_reports.append(report_filename)

    rule:
        name: f"generate_content_report_for_compendium_{compendium_basename}"
        input:
            compendium_file = f"{config['output_directory']}/compendia/{compendium_filename}",
        output:
            report_file = report_filename
        run:
            generate_content_report_for_compendium(input.compendium_file, output.report_file)


rule generate_summary_content_report_for_compendia:
    input:
        expected_content_reports = expected_content_reports,
    output:
        report_path = config['output_directory'] + '/reports/content/compendia_report.json',
    run:
        summarize_content_report_for_compendia(input.expected_content_reports, output.report_path)


# Check that all the reports were built correctly.
rule all_reports:
    input:
        config['output_directory'] + '/reports/content/compendia_report.json',
        config['output_directory'] + '/reports/check_compendia_files.done',
        config['output_directory'] + '/reports/check_synonyms_files.done',
        config['output_directory'] + '/reports/check_conflation_files.done',
    output:
        x = config['output_directory'] + '/reports/reports_done',
    shell:
        "echo 'done' >> {output.x}"