biocore · jkanbar · Jul 8, 2016 · Jul 8, 2016 · Jul 8, 2016 · Nov 2, 2016
diff --git a/README.md b/README.md
@@ -1,2 +1,5 @@
 ## TCGA Analysis
 Workflow to rapidly search TCGA data for microbial presence.
+=======
+# tcga
+TCGA analysis
diff --git a/python_scripts/cgc_bowtie2_build.py b/python_scripts/cgc_bowtie2_build.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2016--, Evguenia Kopylova, Jad Kanbar
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+"""
+Build Bowtie2 database on all reference genomes in Kraken report.
+"""
+
+import click
+import collections
+
+def load_kraken_mpa_report(kraken_mpa_report_fp,
+                           taxa_levels,
+                           read_per_taxa):
+    """Absolute abundance of number of reads matching a defined taxa level.
+    Parameters
+    ----------
+    kraken_mpa_report_fp: str
+        filepath to output of "kraken mpa report"
+    taxa_levels: list
+        list of two elements that includes the taxonomic rank at which
+        to generate summary and rank below to split by
+    read_per_taxa: int
+        integer of number of minimum number of reads to keep per taxa
+
+    Returns
+    -------
+    taxonomies: set
+        set of taxonomies from kraken_mpa_report_fp
+    """
+    taxonomic_abundances= []
+    for report_fp in kraken_mpa_report_fp:
+        with open(report_fp) as report_fp:
+            for line in report_fp:
+                label, taxonomy = line.strip().split('\t')
+                if taxa_levels[0] in taxonomy:
+                    # keep taxonomy string up to specified level
+                    taxonomy_parse = taxonomy.split(taxa_levels[1])[0]
+                    taxonomy_parse = taxonomy_parse.replace('d__','k__')
+                    taxonomic_abundances.append(taxonomy_parse)
+
+    taxonomies = set([k for k, v in
+                      collections.Counter(taxonomic_abundances).iteritems()
+                      if v > read_per_taxa])
+
+    return taxonomies
+
+def create_db_folder(repophlan_genomeid_taxonomy_fp,
+                     genome_tax,
+                     split_on_level,
+                     output_filename):
+
+    """Return sets for sample IDs and taxonomy strings.
+    Parameters
+    ----------
+    repophlan_genomeid_taxonomy_fp: str
+        filepath to output of repophlan file genome IDs and associated taxa
+    genome_tax: set
+        set of taxonomies from kraken_mpa_report_fp
+    split_on_level: str
+        string that determines the level to split the taxonomy in genome_tax
+    output_filename: str
+        string with output filename
+
+    Returns
+    -------
+    Writes a text file with genome IDs, directory to genomes, and associated
+    taxa from repophlan_genomeid_taxonomy_fp containned with the set
+    genome_tax
+    """
+    with open(output_filename, 'w') as f:
+        with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f:
+            for line in repophlan_genomeID_taxonomy_f:
+                tax_id_repo, directory_repo, taxonomy_repo =\
+                    line.strip().split('\t')
+                taxonomy_repo = taxonomy_repo.split(split_on_level)[0]
+                if taxonomy_repo in genome_tax:
+                    f.writelines(line)
+
+
+@click.command()
+
+@click.option('--kraken-mpa-report-fp', required=True, multiple=True,
+              type=click.Path(resolve_path=True, readable=True, exists=True,
+                              file_okay=True),
+              help='Filepath to Kraken report')
+@click.option('--repophlan-genomeID-taxonomy-fp', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=False,
+                              file_okay=True),
+              help='Filepath to RepoPhlAn genome ID and taxonomy list')
+@click.option('--taxonomic-rank', type=click.Choice(['genus', 'species',
+                                                     'family', 'order',
+                                                     'class', 'phylum',
+                                                     'domain']),
+              required=False, default=['genus'], show_default=True,
+              help="Taxonomic rank at which to generate summary")
+@click.option('--read-per-taxa', required=False,
+              default=10, show_default=True,
+              help="Minimum number of reads for each taxa")
+@click.option('--output-filename', required=False,
+              default='subset_repophlan_genomeID_taxonomy.good',
+              show_default=True,
+              help="Output filename for RepoPhlAn genome ID and taxonomy list")
+
+def main(kraken_mpa_report_fp,
+         repophlan_genomeid_taxonomy_fp,
+         taxonomic_rank,
+         read_per_taxa,
+         output_filename):
+
+    taxa_levels = {"domain": "d__",
+                   "phylum": "|p__",
+                   "class": "|c__",
+                   "order": "|o__",
+                   "family": "|f__",
+                   "genus": "|g__",
+                   "species": "|s__"}
+
+    taxa_levels_idx = {"d__": 0, "|p__": 1, "|c__": 2,
+                       "|o__": 3, "|f__": 4, "|g__": 5,
+                       "|s__": 6, "6": "|s__", "5": "|g__",
+                       "4": "|f__", "3": "|o__", "2": "|c__",
+                       "1": "|p__", "0": "d__"}
+
+    taxa_levels_str=taxa_levels[taxonomic_rank]
+    taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str]
+
+    if taxa_levels_idx_int < 6:
+        split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)]
+    else:
+        split_on_level = '\t'
+
+    taxonomic_set =\
+        load_kraken_mpa_report(
+            kraken_mpa_report_fp=kraken_mpa_report_fp,
+            taxa_levels=[taxa_levels_str,split_on_level],
+            read_per_taxa=read_per_taxa)
+
+    create_db_folder(
+            repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp,
+            genome_tax=taxonomic_set,
+            split_on_level=split_on_level,
+            output_filename=output_filename)
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/compute_taxonomy_breakdown.py b/scripts/compute_taxonomy_breakdown.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2016--, Evguenia Kopylova.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Compute number of unique taxonomies and their counts
+
+import sys
+
+
+def get_genome_paths(scores_average_fp, scores_repophlan_fp):
+    """Return taxonomy strings and their counts.
+    """
+    genomes = []
+    # Get genome IDs
+    with open(scores_average_fp) as scores_average_f:
+        next(scores_average_f)
+        for line in scores_average_f:
+            line = line.strip().split('\t')
+            genome_id = line[0]
+            if genome_id not in genomes:
+                genomes.append(genome_id)
+            else:
+                raise ValueError("Duplicate genome IDs %s" % genome_id)
+    # Get taxonomies based on used genome IDs
+    taxonomies = {}
+    genomes_without_taxonomy = []
+    with open(scores_repophlan_fp) as scores_repophlan_f:
+        # header
+        line = scores_repophlan_f.readline()
+        line = line.strip().split('\t')
+        tax_idx = line.index('taxonomy')
+        for line in scores_repophlan_f:
+            line = line.strip().split('\t')
+            genome_id = line[0]
+            # only want tax_ids for genomes passing quality filter
+            if genome_id in genomes:
+                taxonomy = line[tax_idx]
+                if (('k__Bacteria' not in taxonomy) and
+                        ('k__Archaea' not in taxonomy) and
+                        ('k__Viruses' not in taxonomy) and
+                        ('k__Viroids' not in taxonomy)):
+                    genomes_without_taxonomy.append(genome_id)
+                if taxonomy not in taxonomies:
+                    taxonomies[taxonomy] = 1
+                else:
+                    taxonomies[taxonomy] += 1
+    return ([(k, taxonomies[k]) for k in sorted(taxonomies, key=taxonomies.get, reverse=True)],
+            genomes_without_taxonomy)
+
+
+def main():
+    """Parse output of RepoPhlAn's repophlan_get_microbes.txt to obtain
+       unique taxonomies and their counts.
+    """
+    # Output of RepoPhlAn's repophlan_get_microbes.py
+    repophlan_scores_fp = sys.argv[1]
+    # Output of Jon's score average script
+    repophlan_scores_average_fp = sys.argv[2]
+
+    taxonomies, genomes_without_taxonomy = get_genome_paths(
+        repophlan_scores_average_fp, repophlan_scores_fp)
+    if len(genomes_without_taxonomy) != 0:
+        print("Genomes without taxonomy: %s" % len(genomes_without_taxonomy))
+
+    for tup in taxonomies:
+        print("%s\t%s" % (tup[0], tup[1]))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/filter_fasta.py b/scripts/filter_fasta.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2016--, Evguenia Kopylova.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+import sys
+
+import skbio
+from skbio import Sequence
+
+def main():
+    """Output FASTA file using sequences in Kraken output.
+    """
+    kraken_results_fp = sys.argv[1]
+    fasta_fp = sys.argv[2]
+    output_fp = sys.argv[3]
+    fasta_d = {}
+    # Load all FASTA seqs into dictionary
+    for seq in skbio.io.read(fasta_fp, format='fasta'):
+        label = seq.metadata['id']
+        description = seq.metadata['description']
+        if label not in fasta_d:
+            fasta_d[label] = [description, str(seq)]
+        else:
+            raise ValueError("Duplicate FASTA labels %s" % label)
+    # Output sequences to FASTA.
+    with open(output_fp, 'w') as output_f:
+        with open(kraken_results_fp) as kraken_results_f:
+            for line in kraken_results_f:
+                label = line.strip().split('\t')[1]
+                if label in fasta_d:
+                    output_f.write(">%s%s\n%s\n" % (label, fasta_d[label][0],
+                                                    fasta_d[label][1]))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/generate_kraken_db.py b/scripts/generate_kraken_db.py
@@ -0,0 +1,116 @@
+#!/bin/python
+
+#-----------------------------------------------------------------------------
+# Copyright (c) 2016--, Evguenia Kopylova.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+#-----------------------------------------------------------------------------
+
+# Edit the FASTA label to Kraken format:
+#   >sequence16|kraken:taxid|32630  Adapter sequence
+#   CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
+
+import sys
+import bz2
+from skbio import Sequence
+import skbio
+
+from os import walk
+from os.path import join, splitext, basename, isfile
+
+
+def get_genome_paths(scores_average_fp, scores_repophlan_fp):
+    """Return genome ID, tax_id and .fna.bz2 path
+    """
+    genomes = {}
+    # Get quality filtered genome ID and filepath to .fna.bz2
+    with open(scores_average_fp) as scores_average_f:
+        # header
+        line = scores_average_f.readline()
+        line = line.strip().split('\t')
+        fna_idx = line.index('fna_lname')
+        for line in scores_average_f:
+            line = line.strip().split('\t')
+            genome_id = line[0]
+            fna_fp = line[fna_idx]
+            if genome_id not in genomes:
+                genomes[genome_id] = [fna_fp]
+            else:
+                raise ValueError("Duplicate genome IDs %s" % genome_id)
+    # Get tax_id
+    genomes_without_tax_id = []
+    with open(scores_repophlan_fp) as scores_repophlan_f:
+        # header
+        line = scores_repophlan_f.readline()
+        line = line.strip().split('\t')
+        tax_id_idx = line.index('taxid')
+        for line in scores_repophlan_f:
+            line = line.strip().split('\t')
+            genome_id = line[0]
+            # only want tax_ids for genomes passing quality filter
+            if genome_id in genomes:
+                tax_id = line[tax_id_idx]
+                # tax_id must be an integer, if not check the field
+                # immediately prior (bug in repophlan parsing)
+                if not tax_id.isdigit():
+                    tax_id = line[tax_id_idx-1]
+                    if not tax_id.isdigit():
+                        genomes_without_tax_id.append(genome_id)
+                        continue
+                genomes[genome_id].append(int(tax_id))
+    return genomes, genomes_without_tax_id
+
+
+def format_labels(genomes, repophlan_scores_filtered_genomes_dp):
+    """Format FASTA labels in qualified genomes.
+    """
+    kraken_db_str = "|kraken:taxid|"
+    for genome_id, info in genomes.items():
+        genome_fp = info[0]
+        taxid = info[1]
+        genome_fp_name = basename(splitext(genome_fp)[0])
+        # check FNA file exists for genome
+        if genome_fp_name != "":
+            output_fp = join(repophlan_scores_filtered_genomes_dp,
+                             genome_fp_name)
+            # skip files already modified (e.g. from previous run)
+            if not isfile(output_fp):
+                with open(output_fp, 'w') as output_f:
+                    for seq in skbio.io.read(
+                            genome_fp, format='fasta', compression='bz2'):
+                        id_ = seq.metadata['id']
+                        description = seq.metadata['description']
+                        new_id_ = "%s%s%s" % (id_, kraken_db_str, taxid)
+                        seq.metadata['id'] = new_id_
+                        output_f.write(
+                            ">%s%s%s %s\n%s\n" % (id_, kraken_db_str,
+                                                  taxid, description,
+                                                  str(seq)))
+
+
+def main():
+    """Edit qualified genomes' labels to Kraken format.
+    """
+    # .fna.bz2 genomes folder
+    all_genomes_bz2_dp = sys.argv[1]
+    # Output of RepoPhlAn's repophlan_get_microbes.py
+    repophlan_scores_fp = sys.argv[2]
+    # Output of Jon's score average script
+    # https://github.com/tanaes/script_bin/blob/master/filter_repophlan.py
+    repophlan_scores_average_fp = sys.argv[3]
+    # Output directory path
+    repophlan_scores_filtered_genomes_dp = sys.argv[4]
+
+    genomes, genomes_without_tax_id = get_genome_paths(
+        repophlan_scores_average_fp, repophlan_scores_fp)
+    if len(genomes_without_tax_id) != 0:
+        raise ValueError(
+            "Some genomes do not have a taxid: %s" % genomes_without_tax_id)
+
+    format_labels(genomes, repophlan_scores_filtered_genomes_dp)
+
+
+if __name__ == '__main__':
+    main()