Merge pull request #221 from TranslatorSRI/generate-curie-breakdown-report

This PR adds per-file reports for the compendia files, which are written into the `reports/content/` directory. We also generate an overall content report for the compendia files. Since these are JSON files, I've added `jq` to the Babel image for use in debugging.
Showing 5 changed files with 298 additions and 7 deletions.
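Since the reports are plain JSON, they can be inspected directly with `jq` once a build has run; for example, to see the per-prefix CURIE counts in one report (the report path below is illustrative):

    jq '.count_by_prefix' babel_outputs/reports/content/compendia/Disease.json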
src/reports/compendia_per_file_reports.py (new file, +170 lines):
""" | ||
compendia_per_file_reports.py - Generate reports for the individual files in the compendia directory. | ||
""" | ||
import itertools | ||
import json | ||
import logging | ||
import os | ||
from collections import defaultdict | ||
from datetime import datetime | ||
|
||
|
||
def get_datetime_as_string(): | ||
""" | ||
Returns the current date and time as a string. | ||
:return: The current date and time as an ISO8601 string. | ||
""" | ||
|
||
# Return the current date and time in ISO8601 time with time zone. | ||
return datetime.now().isoformat() | ||


def assert_files_in_directory(dir, files, report_file):
    """
    Asserts that the list of files in a given directory is exactly the list of files provided.

    :param dir: The directory to check files in.
    :param files: The files to compare the directory listing against.
    :param report_file: Write a report to this file. We assume that this file is not intended
        to be read, but is created so that we can confirm that this assertion has been checked.
    """
    logging.info(f"Expect files in directory {dir} to be equal to {files}")
    file_list = os.listdir(dir)
    assert set(file_list) == set(files)

    # If we passed, write the output to the report file.
    with open(report_file, "w") as f:
        f.write(f"Confirmed that {dir} contains only the files {files} at {get_datetime_as_string()}\n")
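
# Example usage: the Snakemake rules below call this function with paths built
# from config['output_directory']; the directory, file list, and report path
# here are illustrative only.
#
#     assert_files_in_directory(
#         "babel_outputs/compendia",
#         ["AnatomicalEntity.txt", "Disease.txt"],
#         "babel_outputs/reports/check_compendia_files.done",
#     )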


def generate_content_report_for_compendium(compendium_path, report_path):
    """
    Generate a report of CURIE prefixes per file.

    :param compendium_path: The path of the compendium file to read.
    :param report_path: The path to write the CURIE prefixes per file report as a JSON file.
    """
    with open(report_path, "w") as report_file:
        with open(compendium_path, "r") as compendium_file:
            # This is a JSONL file, so we need to read each line as a JSON object.

            # Track CURIE breakdowns for this compendium.
            count_by_prefix = defaultdict(int)
            count_by_biolink_type = defaultdict(int)
            counters = {
                'clique_count': 0,
                'cliques_by_id_count': defaultdict(int),
                'cliques_by_label_count': defaultdict(int),
                'cliques_by_unique_label_count': defaultdict(int),
                'cliques_by_description_count': defaultdict(int),
                'cliques_by_unique_description_count': defaultdict(int),
            }

            # Since this is time-consuming, let's log a count as we go.
            count_lines = 0

            # Iterate through the compendium file.
            for line in compendium_file:
                count_lines += 1

                # Report updates every 10 million lines.
                if count_lines % 10_000_000 == 0:
                    logging.info(f"Processed {count_lines:,} lines in {compendium_path}")

                # Parse each line as a JSON object.
                clique = json.loads(line)

                # Track the CURIEs we're looking for.
                identifiers = clique.get('identifiers', [])
                ids = list(map(lambda x: x['i'], identifiers))

                # Update counts by Biolink type.
                count_by_biolink_type[clique.get('type', '')] += 1

                # Update counts by prefix.
                for curie in ids:
                    prefix = curie.split(':')[0]
                    count_by_prefix[prefix] += 1

                # Update the per-clique counters.
                counters['clique_count'] += 1
                counters['cliques_by_id_count'][len(ids)] += 1
                labels = list(filter(lambda x: x.strip() != '', map(lambda x: x.get('l', ''), identifiers)))
                counters['cliques_by_label_count'][len(labels)] += 1
                unique_labels = set(labels)
                counters['cliques_by_unique_label_count'][len(unique_labels)] += 1

                # Since descriptions are currently lists, we have to first flatten the list with
                # itertools.chain.from_iterable() before we can count them.
                descriptions = list(filter(lambda x: x.strip() != '',
                                           itertools.chain.from_iterable(map(lambda x: x.get('d', []), identifiers))))
                counters['cliques_by_description_count'][len(descriptions)] += 1
                unique_descriptions = set(descriptions)
                counters['cliques_by_unique_description_count'][len(unique_descriptions)] += 1

            json.dump({
                'name': os.path.splitext(os.path.basename(compendium_path))[0],
                'compendium_path': compendium_path,
                'report_path': report_path,
                'count_lines': count_lines,
                'count_by_biolink_type': count_by_biolink_type,
                'count_by_prefix': count_by_prefix,
                'counters': counters,
            }, report_file, sort_keys=True, indent=2)
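
# For reference, each per-compendium report written above is a single JSON
# object, roughly of this shape (the values here are made up for illustration):
#
#     {
#       "compendium_path": "babel_outputs/compendia/Disease.txt",
#       "count_by_biolink_type": {"biolink:Disease": 12345},
#       "count_by_prefix": {"MONDO": 23456, "UMLS": 11111},
#       "count_lines": 12345,
#       "counters": {"clique_count": 12345, "cliques_by_id_count": {"1": 6000, "2": 4000}},
#       "name": "Disease",
#       "report_path": "babel_outputs/reports/content/compendia/Disease.json"
#     }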


def summarize_content_report_for_compendia(compendia_report_paths, summary_path):
    """
    Summarize all the content reports generated by generate_content_report_for_compendium().

    :param compendia_report_paths: A list of file paths for the compendia reports generated by
        generate_content_report_for_compendium().
    :param summary_path: The path to write the summary report to.
    """
    # Start writing the summary file.
    with open(summary_path, "w") as summaryfile:
        # Summarized information from the reports.
        biolink_types = defaultdict(int)
        prefixes = defaultdict(int)
        counters = {}
        count_lines = 0

        # Read all the content reports -- these are small, so we can just read them all in.
        for report_path in compendia_report_paths:
            with open(report_path, "r") as report_file:
                report = json.load(report_file)

                count_lines += report['count_lines']

                # Add up Biolink type information.
                for biolink_type, count in report['count_by_biolink_type'].items():
                    biolink_types[biolink_type] += count

                # Add up prefix information.
                for prefix, count in report['count_by_prefix'].items():
                    prefixes[prefix] += count

                # Every counter is either an int or a dict. If it is a dict, we need to
                # add up all the counts it contains.
                for counter, value in report['counters'].items():
                    if type(value) is int:
                        counters[counter] = counters.get(counter, 0) + value
                    elif type(value) is dict:
                        if counter not in counters:
                            counters[counter] = defaultdict(int)
                        for key, count in value.items():
                            counters[counter][key] += count
                    else:
                        raise ValueError(f"Counter {counter} has unexpected value {value}.")

        # Write the summary report.
        json.dump({
            'report_path': summary_path,
            'biolink_types': biolink_types,
            'prefixes': prefixes,
            'counters': counters,
            'count_lines': count_lines,
        }, summaryfile, sort_keys=True, indent=2)
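
To spot-check a single report without jq, the same structure can be read back with a few lines of Python (the report path below is hypothetical):

    import json

    # Load one per-compendium report and print the five most common CURIE prefixes.
    with open("babel_outputs/reports/content/compendia/Disease.json") as f:
        report = json.load(f)

    for prefix, count in sorted(report["count_by_prefix"].items(), key=lambda kv: -kv[1])[:5]:
        print(f"{prefix}\t{count:,}")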
Snakemake rules file (new file, +105 lines):
import os

from src.reports.compendia_per_file_reports import assert_files_in_directory, \
    generate_content_report_for_compendium, summarize_content_report_for_compendia

# Some paths we will use multiple times in these reports.
compendia_path = config['output_directory'] + '/compendia'
synonyms_path = config['output_directory'] + '/synonyms'
conflations_path = config['output_directory'] + '/conflation'

# Expected compendia files.
compendia_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected synonym files.
synonyms_files = config['anatomy_outputs'] + config['gene_outputs'] + config['protein_outputs'] + \
    config['disease_outputs'] + config['process_outputs'] + \
    config['chemical_outputs'] + config['taxon_outputs'] + config['genefamily_outputs'] + \
    config['drugchemicalconflated_synonym_outputs'] + \
    config['umls_outputs'] + config['macromolecularcomplex_outputs']

# Expected conflation files.
conflation_files = config['geneprotein_outputs'] + config['drugchemical_outputs']
# Make sure we have all the expected Compendia files.
rule check_compendia_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory'] + '/reports/check_compendia_files.done'
    run:
        assert_files_in_directory(compendia_path, compendia_files, output.donefile)

# Make sure we have all the expected Synonyms files.
rule check_synonyms_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory'] + '/reports/check_synonyms_files.done'
    run:
        assert_files_in_directory(synonyms_path, synonyms_files, output.donefile)

# Make sure we have all the expected Conflation files.
rule check_conflation_files:
    input:
        # Don't run this until all the outputs have been generated.
        config['output_directory'] + '/reports/outputs_done'
    output:
        donefile = config['output_directory'] + '/reports/check_conflation_files.done'
    run:
        assert_files_in_directory(conflations_path, conflation_files, output.donefile)

# Generate a report of CURIE prefixes by file.
expected_content_reports = []
for compendium_filename in compendia_files:
    # Remove the extension from compendium_filename using os.path.
    compendium_basename = os.path.splitext(compendium_filename)[0]
    report_filename = f"{config['output_directory']}/reports/content/compendia/{compendium_basename}.json"
    expected_content_reports.append(report_filename)

    rule:
        name: f"generate_content_report_for_compendium_{compendium_basename}"
        input:
            compendium_file = f"{config['output_directory']}/compendia/{compendium_filename}",
        output:
            report_file = report_filename
        run:
            generate_content_report_for_compendium(input.compendium_file, output.report_file)
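
# Note: the loop above uses Snakemake's `name:` directive to give each
# anonymous `rule:` a distinct name, so one report-generating rule is created
# per compendium file and each report can be rebuilt independently.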
rule generate_summary_content_report_for_compendia:
    input:
        expected_content_reports = expected_content_reports,
    output:
        report_path = config['output_directory'] + '/reports/content/compendia_report.json',
    run:
        summarize_content_report_for_compendia(input.expected_content_reports, output.report_path)

# Check that all the reports were built correctly.
rule all_reports:
    input:
        config['output_directory'] + '/reports/content/compendia_report.json',
        config['output_directory'] + '/reports/check_compendia_files.done',
        config['output_directory'] + '/reports/check_synonyms_files.done',
        config['output_directory'] + '/reports/check_conflation_files.done',
    output:
        x = config['output_directory'] + '/reports/reports_done',
    shell:
        "echo 'done' >> {output.x}"
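
Assuming these rules are included from Babel's main Snakefile, the whole reporting stage can then be driven through the `all_reports` target, e.g.:

    snakemake --cores 4 all_reports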