diff --git a/README.md b/README.md index 72adc61..2859232 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,5 @@ ## TCGA Analysis Workflow to rapidly search TCGA data for microbial presence. +======= +# tcga +TCGA analysis diff --git a/python_scripts/cgc_bowtie2_build.py b/python_scripts/cgc_bowtie2_build.py new file mode 100644 index 0000000..fc21496 --- /dev/null +++ b/python_scripts/cgc_bowtie2_build.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova, Jad Kanbar +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +""" +Build Bowtie2 database on all reference genomes in Kraken report. +""" + +import click +import collections + +def load_kraken_mpa_report(kraken_mpa_report_fp, + taxa_levels, + read_per_taxa): + """Absolute abundance of number of reads matching a defined taxa level. + Parameters + ---------- + kraken_mpa_report_fp: str + filepath to output of "kraken mpa report" + taxa_levels: list + list of two elements that includes the taxonomic rank at which + to generate summary and rank below to split by + read_per_taxa: int + integer of number of minimum number of reads to keep per taxa + + Returns + ------- + taxonomies: set + set of taxonomies from kraken_mpa_report_fp + """ + taxonomic_abundances= [] + for report_fp in kraken_mpa_report_fp: + with open(report_fp) as report_fp: + for line in report_fp: + label, taxonomy = line.strip().split('\t') + if taxa_levels[0] in taxonomy: + # keep taxonomy string up to specified level + taxonomy_parse = taxonomy.split(taxa_levels[1])[0] + taxonomy_parse = taxonomy_parse.replace('d__','k__') + taxonomic_abundances.append(taxonomy_parse) + + taxonomies = set([k for k, v in + collections.Counter(taxonomic_abundances).iteritems() + if v > read_per_taxa]) + + return taxonomies + +def create_db_folder(repophlan_genomeid_taxonomy_fp, + genome_tax, + split_on_level, + output_filename): + + """Return sets for sample IDs and taxonomy strings. + Parameters + ---------- + repophlan_genomeid_taxonomy_fp: str + filepath to output of repophlan file genome IDs and associated taxa + genome_tax: set + set of taxonomies from kraken_mpa_report_fp + split_on_level: str + string that determines the level to split the taxonomy in genome_tax + output_filename: str + string with output filename + + Returns + ------- + Writes a text file with genome IDs, directory to genomes, and associated + taxa from repophlan_genomeid_taxonomy_fp containned with the set + genome_tax + """ + with open(output_filename, 'w') as f: + with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f: + for line in repophlan_genomeID_taxonomy_f: + tax_id_repo, directory_repo, taxonomy_repo =\ + line.strip().split('\t') + taxonomy_repo = taxonomy_repo.split(split_on_level)[0] + if taxonomy_repo in genome_tax: + f.writelines(line) + + +@click.command() + +@click.option('--kraken-mpa-report-fp', required=True, multiple=True, + type=click.Path(resolve_path=True, readable=True, exists=True, + file_okay=True), + help='Filepath to Kraken report') +@click.option('--repophlan-genomeID-taxonomy-fp', required=True, + type=click.Path(resolve_path=True, readable=True, exists=False, + file_okay=True), + help='Filepath to RepoPhlAn genome ID and taxonomy list') +@click.option('--taxonomic-rank', type=click.Choice(['genus', 'species', + 'family', 'order', + 'class', 'phylum', + 'domain']), + required=False, default=['genus'], show_default=True, + help="Taxonomic rank at which to generate summary") +@click.option('--read-per-taxa', required=False, + default=10, show_default=True, + help="Minimum number of reads for each taxa") +@click.option('--output-filename', required=False, + default='subset_repophlan_genomeID_taxonomy.good', + show_default=True, + help="Output filename for RepoPhlAn genome ID and taxonomy list") + +def main(kraken_mpa_report_fp, + repophlan_genomeid_taxonomy_fp, + taxonomic_rank, + read_per_taxa, + output_filename): + + taxa_levels = {"domain": "d__", + "phylum": "|p__", + "class": "|c__", + "order": "|o__", + "family": "|f__", + "genus": "|g__", + "species": "|s__"} + + taxa_levels_idx = {"d__": 0, "|p__": 1, "|c__": 2, + "|o__": 3, "|f__": 4, "|g__": 5, + "|s__": 6, "6": "|s__", "5": "|g__", + "4": "|f__", "3": "|o__", "2": "|c__", + "1": "|p__", "0": "d__"} + + taxa_levels_str=taxa_levels[taxonomic_rank] + taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str] + + if taxa_levels_idx_int < 6: + split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)] + else: + split_on_level = '\t' + + taxonomic_set =\ + load_kraken_mpa_report( + kraken_mpa_report_fp=kraken_mpa_report_fp, + taxa_levels=[taxa_levels_str,split_on_level], + read_per_taxa=read_per_taxa) + + create_db_folder( + repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp, + genome_tax=taxonomic_set, + split_on_level=split_on_level, + output_filename=output_filename) + +if __name__ == "__main__": + main() diff --git a/scripts/compute_taxonomy_breakdown.py b/scripts/compute_taxonomy_breakdown.py new file mode 100644 index 0000000..65f2d68 --- /dev/null +++ b/scripts/compute_taxonomy_breakdown.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +# Compute number of unique taxonomies and their counts + +import sys + + +def get_genome_paths(scores_average_fp, scores_repophlan_fp): + """Return taxonomy strings and their counts. + """ + genomes = [] + # Get genome IDs + with open(scores_average_fp) as scores_average_f: + next(scores_average_f) + for line in scores_average_f: + line = line.strip().split('\t') + genome_id = line[0] + if genome_id not in genomes: + genomes.append(genome_id) + else: + raise ValueError("Duplicate genome IDs %s" % genome_id) + # Get taxonomies based on used genome IDs + taxonomies = {} + genomes_without_taxonomy = [] + with open(scores_repophlan_fp) as scores_repophlan_f: + # header + line = scores_repophlan_f.readline() + line = line.strip().split('\t') + tax_idx = line.index('taxonomy') + for line in scores_repophlan_f: + line = line.strip().split('\t') + genome_id = line[0] + # only want tax_ids for genomes passing quality filter + if genome_id in genomes: + taxonomy = line[tax_idx] + if (('k__Bacteria' not in taxonomy) and + ('k__Archaea' not in taxonomy) and + ('k__Viruses' not in taxonomy) and + ('k__Viroids' not in taxonomy)): + genomes_without_taxonomy.append(genome_id) + if taxonomy not in taxonomies: + taxonomies[taxonomy] = 1 + else: + taxonomies[taxonomy] += 1 + return ([(k, taxonomies[k]) for k in sorted(taxonomies, key=taxonomies.get, reverse=True)], + genomes_without_taxonomy) + + +def main(): + """Parse output of RepoPhlAn's repophlan_get_microbes.txt to obtain + unique taxonomies and their counts. + """ + # Output of RepoPhlAn's repophlan_get_microbes.py + repophlan_scores_fp = sys.argv[1] + # Output of Jon's score average script + repophlan_scores_average_fp = sys.argv[2] + + taxonomies, genomes_without_taxonomy = get_genome_paths( + repophlan_scores_average_fp, repophlan_scores_fp) + if len(genomes_without_taxonomy) != 0: + print("Genomes without taxonomy: %s" % len(genomes_without_taxonomy)) + + for tup in taxonomies: + print("%s\t%s" % (tup[0], tup[1])) + + +if __name__ == '__main__': + main() diff --git a/scripts/filter_fasta.py b/scripts/filter_fasta.py new file mode 100644 index 0000000..1b6d74c --- /dev/null +++ b/scripts/filter_fasta.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +import sys + +import skbio +from skbio import Sequence + +def main(): + """Output FASTA file using sequences in Kraken output. + """ + kraken_results_fp = sys.argv[1] + fasta_fp = sys.argv[2] + output_fp = sys.argv[3] + fasta_d = {} + # Load all FASTA seqs into dictionary + for seq in skbio.io.read(fasta_fp, format='fasta'): + label = seq.metadata['id'] + description = seq.metadata['description'] + if label not in fasta_d: + fasta_d[label] = [description, str(seq)] + else: + raise ValueError("Duplicate FASTA labels %s" % label) + # Output sequences to FASTA. + with open(output_fp, 'w') as output_f: + with open(kraken_results_fp) as kraken_results_f: + for line in kraken_results_f: + label = line.strip().split('\t')[1] + if label in fasta_d: + output_f.write(">%s%s\n%s\n" % (label, fasta_d[label][0], + fasta_d[label][1])) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/generate_kraken_db.py b/scripts/generate_kraken_db.py new file mode 100644 index 0000000..491266d --- /dev/null +++ b/scripts/generate_kraken_db.py @@ -0,0 +1,116 @@ +#!/bin/python + +#----------------------------------------------------------------------------- +# Copyright (c) 2016--, Evguenia Kopylova. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +#----------------------------------------------------------------------------- + +# Edit the FASTA label to Kraken format: +# >sequence16|kraken:taxid|32630 Adapter sequence +# CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA + +import sys +import bz2 +from skbio import Sequence +import skbio + +from os import walk +from os.path import join, splitext, basename, isfile + + +def get_genome_paths(scores_average_fp, scores_repophlan_fp): + """Return genome ID, tax_id and .fna.bz2 path + """ + genomes = {} + # Get quality filtered genome ID and filepath to .fna.bz2 + with open(scores_average_fp) as scores_average_f: + # header + line = scores_average_f.readline() + line = line.strip().split('\t') + fna_idx = line.index('fna_lname') + for line in scores_average_f: + line = line.strip().split('\t') + genome_id = line[0] + fna_fp = line[fna_idx] + if genome_id not in genomes: + genomes[genome_id] = [fna_fp] + else: + raise ValueError("Duplicate genome IDs %s" % genome_id) + # Get tax_id + genomes_without_tax_id = [] + with open(scores_repophlan_fp) as scores_repophlan_f: + # header + line = scores_repophlan_f.readline() + line = line.strip().split('\t') + tax_id_idx = line.index('taxid') + for line in scores_repophlan_f: + line = line.strip().split('\t') + genome_id = line[0] + # only want tax_ids for genomes passing quality filter + if genome_id in genomes: + tax_id = line[tax_id_idx] + # tax_id must be an integer, if not check the field + # immediately prior (bug in repophlan parsing) + if not tax_id.isdigit(): + tax_id = line[tax_id_idx-1] + if not tax_id.isdigit(): + genomes_without_tax_id.append(genome_id) + continue + genomes[genome_id].append(int(tax_id)) + return genomes, genomes_without_tax_id + + +def format_labels(genomes, repophlan_scores_filtered_genomes_dp): + """Format FASTA labels in qualified genomes. + """ + kraken_db_str = "|kraken:taxid|" + for genome_id, info in genomes.items(): + genome_fp = info[0] + taxid = info[1] + genome_fp_name = basename(splitext(genome_fp)[0]) + # check FNA file exists for genome + if genome_fp_name != "": + output_fp = join(repophlan_scores_filtered_genomes_dp, + genome_fp_name) + # skip files already modified (e.g. from previous run) + if not isfile(output_fp): + with open(output_fp, 'w') as output_f: + for seq in skbio.io.read( + genome_fp, format='fasta', compression='bz2'): + id_ = seq.metadata['id'] + description = seq.metadata['description'] + new_id_ = "%s%s%s" % (id_, kraken_db_str, taxid) + seq.metadata['id'] = new_id_ + output_f.write( + ">%s%s%s %s\n%s\n" % (id_, kraken_db_str, + taxid, description, + str(seq))) + + +def main(): + """Edit qualified genomes' labels to Kraken format. + """ + # .fna.bz2 genomes folder + all_genomes_bz2_dp = sys.argv[1] + # Output of RepoPhlAn's repophlan_get_microbes.py + repophlan_scores_fp = sys.argv[2] + # Output of Jon's score average script + # https://github.com/tanaes/script_bin/blob/master/filter_repophlan.py + repophlan_scores_average_fp = sys.argv[3] + # Output directory path + repophlan_scores_filtered_genomes_dp = sys.argv[4] + + genomes, genomes_without_tax_id = get_genome_paths( + repophlan_scores_average_fp, repophlan_scores_fp) + if len(genomes_without_tax_id) != 0: + raise ValueError( + "Some genomes do not have a taxid: %s" % genomes_without_tax_id) + + format_labels(genomes, repophlan_scores_filtered_genomes_dp) + + +if __name__ == '__main__': + main() \ No newline at end of file