Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bowtie build #12

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
## TCGA Analysis
Workflow to rapidly search TCGA data for microbial presence.
=======
# tcga
TCGA analysis
152 changes: 152 additions & 0 deletions python_scripts/cgc_bowtie2_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/usr/bin/env python

#-----------------------------------------------------------------------------
# Copyright (c) 2016--, Evguenia Kopylova, Jad Kanbar
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

"""
Build Bowtie2 database on all reference genomes in Kraken report.
"""

import click
import collections

def load_kraken_mpa_report(kraken_mpa_report_fp,
taxa_levels,
read_per_taxa):
"""Absolute abundance of number of reads matching a defined taxa level.
Parameters
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

newline please

----------
kraken_mpa_report_fp: str
filepath to output of "kraken mpa report"
taxa_levels: list
list of two elements that includes the taxonomic rank at which
to generate summary and rank below to split by
read_per_taxa: int
integer of number of minimum number of reads to keep per taxa

Returns
-------
taxonomies: set
set of taxonomies from kraken_mpa_report_fp
"""
taxonomic_abundances= []
for report_fp in kraken_mpa_report_fp:
with open(report_fp) as report_fp:
for line in report_fp:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be possible to break this parser out? seems like a logical point for decomposition. i'm not seeing a paired unit test or set of tests either...?

label, taxonomy = line.strip().split('\t')
if taxa_levels[0] in taxonomy:
# keep taxonomy string up to specified level
taxonomy_parse = taxonomy.split(taxa_levels[1])[0]
taxonomy_parse = taxonomy_parse.replace('d__','k__')
taxonomic_abundances.append(taxonomy_parse)

taxonomies = set([k for k, v in
collections.Counter(taxonomic_abundances).iteritems()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a py2 codebase?

if v > read_per_taxa])

return taxonomies

def create_db_folder(repophlan_genomeid_taxonomy_fp,
genome_tax,
split_on_level,
output_filename):

"""Return sets for sample IDs and taxonomy strings.
Parameters
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

newline please

----------
repophlan_genomeid_taxonomy_fp: str
filepath to output of repophlan file genome IDs and associated taxa
genome_tax: set
set of taxonomies from kraken_mpa_report_fp
split_on_level: str
string that determines the level to split the taxonomy in genome_tax
output_filename: str
string with output filename

Returns
-------
Writes a text file with genome IDs, directory to genomes, and associated
taxa from repophlan_genomeid_taxonomy_fp containned with the set
genome_tax
"""
with open(output_filename, 'w') as f:
with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f:
for line in repophlan_genomeID_taxonomy_f:
tax_id_repo, directory_repo, taxonomy_repo =\
line.strip().split('\t')
taxonomy_repo = taxonomy_repo.split(split_on_level)[0]
if taxonomy_repo in genome_tax:
f.writelines(line)


@click.command()

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove the extra newline please

@click.option('--kraken-mpa-report-fp', required=True, multiple=True,
type=click.Path(resolve_path=True, readable=True, exists=True,
file_okay=True),
help='Filepath to Kraken report')
@click.option('--repophlan-genomeID-taxonomy-fp', required=True,
type=click.Path(resolve_path=True, readable=True, exists=False,
file_okay=True),
help='Filepath to RepoPhlAn genome ID and taxonomy list')
@click.option('--taxonomic-rank', type=click.Choice(['genus', 'species',
'family', 'order',
'class', 'phylum',
'domain']),
required=False, default=['genus'], show_default=True,
help="Taxonomic rank at which to generate summary")
@click.option('--read-per-taxa', required=False,
default=10, show_default=True,
help="Minimum number of reads for each taxa")
@click.option('--output-filename', required=False,
default='subset_repophlan_genomeID_taxonomy.good',
show_default=True,
help="Output filename for RepoPhlAn genome ID and taxonomy list")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove the extra newline please

def main(kraken_mpa_report_fp,
repophlan_genomeid_taxonomy_fp,
taxonomic_rank,
read_per_taxa,
output_filename):

taxa_levels = {"domain": "d__",
"phylum": "|p__",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it make sense to split instead of encode the |?

"class": "|c__",
"order": "|o__",
"family": "|f__",
"genus": "|g__",
"species": "|s__"}

taxa_levels_idx = {"d__": 0, "|p__": 1, "|c__": 2,
"|o__": 3, "|f__": 4, "|g__": 5,
"|s__": 6, "6": "|s__", "5": "|g__",
"4": "|f__", "3": "|o__", "2": "|c__",
"1": "|p__", "0": "d__"}

taxa_levels_str=taxa_levels[taxonomic_rank]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this code evaluated for pep8?

taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str]

if taxa_levels_idx_int < 6:
split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)]
else:
split_on_level = '\t'

taxonomic_set =\
load_kraken_mpa_report(
kraken_mpa_report_fp=kraken_mpa_report_fp,
taxa_levels=[taxa_levels_str,split_on_level],
read_per_taxa=read_per_taxa)

create_db_folder(
repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp,
genome_tax=taxonomic_set,
split_on_level=split_on_level,
output_filename=output_filename)

if __name__ == "__main__":
main()
76 changes: 76 additions & 0 deletions scripts/compute_taxonomy_breakdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python

#-----------------------------------------------------------------------------
# Copyright (c) 2016--, Evguenia Kopylova.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

# Compute number of unique taxonomies and their counts

import sys


def get_genome_paths(scores_average_fp, scores_repophlan_fp):
"""Return taxonomy strings and their counts.
"""
genomes = []
# Get genome IDs
with open(scores_average_fp) as scores_average_f:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is a lot of parsing of tab delimited text in this PR. would it make sense to use pandas?

next(scores_average_f)
for line in scores_average_f:
line = line.strip().split('\t')
genome_id = line[0]
if genome_id not in genomes:
genomes.append(genome_id)
else:
raise ValueError("Duplicate genome IDs %s" % genome_id)
# Get taxonomies based on used genome IDs
taxonomies = {}
genomes_without_taxonomy = []
with open(scores_repophlan_fp) as scores_repophlan_f:
# header
line = scores_repophlan_f.readline()
line = line.strip().split('\t')
tax_idx = line.index('taxonomy')
for line in scores_repophlan_f:
line = line.strip().split('\t')
genome_id = line[0]
# only want tax_ids for genomes passing quality filter
if genome_id in genomes:
taxonomy = line[tax_idx]
if (('k__Bacteria' not in taxonomy) and
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it might be cleaner to define an exclude set, and test the length of a set intersection with taxonomy

('k__Archaea' not in taxonomy) and
('k__Viruses' not in taxonomy) and
('k__Viroids' not in taxonomy)):
genomes_without_taxonomy.append(genome_id)
if taxonomy not in taxonomies:
taxonomies[taxonomy] = 1
else:
taxonomies[taxonomy] += 1
return ([(k, taxonomies[k]) for k in sorted(taxonomies, key=taxonomies.get, reverse=True)],
genomes_without_taxonomy)


def main():
"""Parse output of RepoPhlAn's repophlan_get_microbes.txt to obtain
unique taxonomies and their counts.
"""
# Output of RepoPhlAn's repophlan_get_microbes.py
repophlan_scores_fp = sys.argv[1]
# Output of Jon's score average script
repophlan_scores_average_fp = sys.argv[2]

taxonomies, genomes_without_taxonomy = get_genome_paths(
repophlan_scores_average_fp, repophlan_scores_fp)
if len(genomes_without_taxonomy) != 0:
print("Genomes without taxonomy: %s" % len(genomes_without_taxonomy))

for tup in taxonomies:
print("%s\t%s" % (tup[0], tup[1]))


if __name__ == '__main__':
main()
42 changes: 42 additions & 0 deletions scripts/filter_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python

#-----------------------------------------------------------------------------
# Copyright (c) 2016--, Evguenia Kopylova.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

import sys

import skbio
from skbio import Sequence

def main():
"""Output FASTA file using sequences in Kraken output.
"""
kraken_results_fp = sys.argv[1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:(

fasta_fp = sys.argv[2]
output_fp = sys.argv[3]
fasta_d = {}
# Load all FASTA seqs into dictionary
for seq in skbio.io.read(fasta_fp, format='fasta'):
label = seq.metadata['id']
description = seq.metadata['description']
if label not in fasta_d:
fasta_d[label] = [description, str(seq)]
else:
raise ValueError("Duplicate FASTA labels %s" % label)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this an error in practice...?

# Output sequences to FASTA.
with open(output_fp, 'w') as output_f:
with open(kraken_results_fp) as kraken_results_f:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of nesting, you can open multiple files in a single with:

with open(foo, 'w') as x, open(bar, 'r') as y:
    ...

for line in kraken_results_f:
label = line.strip().split('\t')[1]
if label in fasta_d:
output_f.write(">%s%s\n%s\n" % (label, fasta_d[label][0],
fasta_d[label][1]))


if __name__ == '__main__':
main()
116 changes: 116 additions & 0 deletions scripts/generate_kraken_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/python

#-----------------------------------------------------------------------------
# Copyright (c) 2016--, Evguenia Kopylova.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------

# Edit the FASTA label to Kraken format:
# >sequence16|kraken:taxid|32630 Adapter sequence
# CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA

import sys
import bz2
from skbio import Sequence
import skbio

from os import walk
from os.path import join, splitext, basename, isfile


def get_genome_paths(scores_average_fp, scores_repophlan_fp):
"""Return genome ID, tax_id and .fna.bz2 path
"""
genomes = {}
# Get quality filtered genome ID and filepath to .fna.bz2
with open(scores_average_fp) as scores_average_f:
# header
line = scores_average_f.readline()
line = line.strip().split('\t')
fna_idx = line.index('fna_lname')
for line in scores_average_f:
line = line.strip().split('\t')
genome_id = line[0]
fna_fp = line[fna_idx]
if genome_id not in genomes:
genomes[genome_id] = [fna_fp]
else:
raise ValueError("Duplicate genome IDs %s" % genome_id)
# Get tax_id
genomes_without_tax_id = []
with open(scores_repophlan_fp) as scores_repophlan_f:
# header
line = scores_repophlan_f.readline()
line = line.strip().split('\t')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea... i think pandas would be so nice here

tax_id_idx = line.index('taxid')
for line in scores_repophlan_f:
line = line.strip().split('\t')
genome_id = line[0]
# only want tax_ids for genomes passing quality filter
if genome_id in genomes:
tax_id = line[tax_id_idx]
# tax_id must be an integer, if not check the field
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that sucks. has this bug been reported to the repophlan maintainers?

# immediately prior (bug in repophlan parsing)
if not tax_id.isdigit():
tax_id = line[tax_id_idx-1]
if not tax_id.isdigit():
genomes_without_tax_id.append(genome_id)
continue
genomes[genome_id].append(int(tax_id))
return genomes, genomes_without_tax_id


def format_labels(genomes, repophlan_scores_filtered_genomes_dp):
"""Format FASTA labels in qualified genomes.
"""
kraken_db_str = "|kraken:taxid|"
for genome_id, info in genomes.items():
genome_fp = info[0]
taxid = info[1]
genome_fp_name = basename(splitext(genome_fp)[0])
# check FNA file exists for genome
if genome_fp_name != "":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if genome_fp_name:

output_fp = join(repophlan_scores_filtered_genomes_dp,
genome_fp_name)
# skip files already modified (e.g. from previous run)
if not isfile(output_fp):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be os.path.exists?

with open(output_fp, 'w') as output_f:
for seq in skbio.io.read(
genome_fp, format='fasta', compression='bz2'):
id_ = seq.metadata['id']
description = seq.metadata['description']
new_id_ = "%s%s%s" % (id_, kraken_db_str, taxid)
seq.metadata['id'] = new_id_
output_f.write(
">%s%s%s %s\n%s\n" % (id_, kraken_db_str,
taxid, description,
str(seq)))


def main():
"""Edit qualified genomes' labels to Kraken format.
"""
# .fna.bz2 genomes folder
all_genomes_bz2_dp = sys.argv[1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:(

# Output of RepoPhlAn's repophlan_get_microbes.py
repophlan_scores_fp = sys.argv[2]
# Output of Jon's score average script
# https://github.com/tanaes/script_bin/blob/master/filter_repophlan.py
repophlan_scores_average_fp = sys.argv[3]
# Output directory path
repophlan_scores_filtered_genomes_dp = sys.argv[4]

genomes, genomes_without_tax_id = get_genome_paths(
repophlan_scores_average_fp, repophlan_scores_fp)
if len(genomes_without_tax_id) != 0:
raise ValueError(
"Some genomes do not have a taxid: %s" % genomes_without_tax_id)

format_labels(genomes, repophlan_scores_filtered_genomes_dp)


if __name__ == '__main__':
main()