-
Notifications
You must be signed in to change notification settings - Fork 44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
bowtie build #12
base: master
Are you sure you want to change the base?
bowtie build #12
Changes from all commits
b6bc4f5
58858fa
3382f26
04668bc
c09670d
b79a084
6c4628a
87971fc
85ef3a2
4344f93
8599710
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,5 @@ | ||
## TCGA Analysis | ||
Workflow to rapidly search TCGA data for microbial presence. | ||
======= | ||
# tcga | ||
TCGA analysis |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
#!/usr/bin/env python | ||
|
||
#----------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, Evguenia Kopylova, Jad Kanbar | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
#----------------------------------------------------------------------------- | ||
|
||
""" | ||
Build Bowtie2 database on all reference genomes in Kraken report. | ||
""" | ||
|
||
import click | ||
import collections | ||
|
||
def load_kraken_mpa_report(kraken_mpa_report_fp, | ||
taxa_levels, | ||
read_per_taxa): | ||
"""Absolute abundance of number of reads matching a defined taxa level. | ||
Parameters | ||
---------- | ||
kraken_mpa_report_fp: str | ||
filepath to output of "kraken mpa report" | ||
taxa_levels: list | ||
list of two elements that includes the taxonomic rank at which | ||
to generate summary and rank below to split by | ||
read_per_taxa: int | ||
integer of number of minimum number of reads to keep per taxa | ||
|
||
Returns | ||
------- | ||
taxonomies: set | ||
set of taxonomies from kraken_mpa_report_fp | ||
""" | ||
taxonomic_abundances= [] | ||
for report_fp in kraken_mpa_report_fp: | ||
with open(report_fp) as report_fp: | ||
for line in report_fp: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would it be possible to break this parser out? seems like a logical point for decomposition. i'm not seeing a paired unit test or set of tests either...? |
||
label, taxonomy = line.strip().split('\t') | ||
if taxa_levels[0] in taxonomy: | ||
# keep taxonomy string up to specified level | ||
taxonomy_parse = taxonomy.split(taxa_levels[1])[0] | ||
taxonomy_parse = taxonomy_parse.replace('d__','k__') | ||
taxonomic_abundances.append(taxonomy_parse) | ||
|
||
taxonomies = set([k for k, v in | ||
collections.Counter(taxonomic_abundances).iteritems() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a py2 codebase? |
||
if v > read_per_taxa]) | ||
|
||
return taxonomies | ||
|
||
def create_db_folder(repophlan_genomeid_taxonomy_fp, | ||
genome_tax, | ||
split_on_level, | ||
output_filename): | ||
|
||
"""Return sets for sample IDs and taxonomy strings. | ||
Parameters | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. newline please |
||
---------- | ||
repophlan_genomeid_taxonomy_fp: str | ||
filepath to output of repophlan file genome IDs and associated taxa | ||
genome_tax: set | ||
set of taxonomies from kraken_mpa_report_fp | ||
split_on_level: str | ||
string that determines the level to split the taxonomy in genome_tax | ||
output_filename: str | ||
string with output filename | ||
|
||
Returns | ||
------- | ||
Writes a text file with genome IDs, directory to genomes, and associated | ||
taxa from repophlan_genomeid_taxonomy_fp containned with the set | ||
genome_tax | ||
""" | ||
with open(output_filename, 'w') as f: | ||
with open(repophlan_genomeid_taxonomy_fp) as repophlan_genomeID_taxonomy_f: | ||
for line in repophlan_genomeID_taxonomy_f: | ||
tax_id_repo, directory_repo, taxonomy_repo =\ | ||
line.strip().split('\t') | ||
taxonomy_repo = taxonomy_repo.split(split_on_level)[0] | ||
if taxonomy_repo in genome_tax: | ||
f.writelines(line) | ||
|
||
|
||
@click.command() | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove the extra newline please |
||
@click.option('--kraken-mpa-report-fp', required=True, multiple=True, | ||
type=click.Path(resolve_path=True, readable=True, exists=True, | ||
file_okay=True), | ||
help='Filepath to Kraken report') | ||
@click.option('--repophlan-genomeID-taxonomy-fp', required=True, | ||
type=click.Path(resolve_path=True, readable=True, exists=False, | ||
file_okay=True), | ||
help='Filepath to RepoPhlAn genome ID and taxonomy list') | ||
@click.option('--taxonomic-rank', type=click.Choice(['genus', 'species', | ||
'family', 'order', | ||
'class', 'phylum', | ||
'domain']), | ||
required=False, default=['genus'], show_default=True, | ||
help="Taxonomic rank at which to generate summary") | ||
@click.option('--read-per-taxa', required=False, | ||
default=10, show_default=True, | ||
help="Minimum number of reads for each taxa") | ||
@click.option('--output-filename', required=False, | ||
default='subset_repophlan_genomeID_taxonomy.good', | ||
show_default=True, | ||
help="Output filename for RepoPhlAn genome ID and taxonomy list") | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove the extra newline please |
||
def main(kraken_mpa_report_fp, | ||
repophlan_genomeid_taxonomy_fp, | ||
taxonomic_rank, | ||
read_per_taxa, | ||
output_filename): | ||
|
||
taxa_levels = {"domain": "d__", | ||
"phylum": "|p__", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would it make sense to |
||
"class": "|c__", | ||
"order": "|o__", | ||
"family": "|f__", | ||
"genus": "|g__", | ||
"species": "|s__"} | ||
|
||
taxa_levels_idx = {"d__": 0, "|p__": 1, "|c__": 2, | ||
"|o__": 3, "|f__": 4, "|g__": 5, | ||
"|s__": 6, "6": "|s__", "5": "|g__", | ||
"4": "|f__", "3": "|o__", "2": "|c__", | ||
"1": "|p__", "0": "d__"} | ||
|
||
taxa_levels_str=taxa_levels[taxonomic_rank] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this code evaluated for pep8? |
||
taxa_levels_idx_int=taxa_levels_idx[taxa_levels_str] | ||
|
||
if taxa_levels_idx_int < 6: | ||
split_on_level = taxa_levels_idx[str(taxa_levels_idx_int + 1)] | ||
else: | ||
split_on_level = '\t' | ||
|
||
taxonomic_set =\ | ||
load_kraken_mpa_report( | ||
kraken_mpa_report_fp=kraken_mpa_report_fp, | ||
taxa_levels=[taxa_levels_str,split_on_level], | ||
read_per_taxa=read_per_taxa) | ||
|
||
create_db_folder( | ||
repophlan_genomeid_taxonomy_fp=repophlan_genomeid_taxonomy_fp, | ||
genome_tax=taxonomic_set, | ||
split_on_level=split_on_level, | ||
output_filename=output_filename) | ||
|
||
if __name__ == "__main__": | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/usr/bin/env python | ||
|
||
#----------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, Evguenia Kopylova. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
#----------------------------------------------------------------------------- | ||
|
||
# Compute number of unique taxonomies and their counts | ||
|
||
import sys | ||
|
||
|
||
def get_genome_paths(scores_average_fp, scores_repophlan_fp): | ||
"""Return taxonomy strings and their counts. | ||
""" | ||
genomes = [] | ||
# Get genome IDs | ||
with open(scores_average_fp) as scores_average_f: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is a lot of parsing of tab delimited text in this PR. would it make sense to use pandas? |
||
next(scores_average_f) | ||
for line in scores_average_f: | ||
line = line.strip().split('\t') | ||
genome_id = line[0] | ||
if genome_id not in genomes: | ||
genomes.append(genome_id) | ||
else: | ||
raise ValueError("Duplicate genome IDs %s" % genome_id) | ||
# Get taxonomies based on used genome IDs | ||
taxonomies = {} | ||
genomes_without_taxonomy = [] | ||
with open(scores_repophlan_fp) as scores_repophlan_f: | ||
# header | ||
line = scores_repophlan_f.readline() | ||
line = line.strip().split('\t') | ||
tax_idx = line.index('taxonomy') | ||
for line in scores_repophlan_f: | ||
line = line.strip().split('\t') | ||
genome_id = line[0] | ||
# only want tax_ids for genomes passing quality filter | ||
if genome_id in genomes: | ||
taxonomy = line[tax_idx] | ||
if (('k__Bacteria' not in taxonomy) and | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might be cleaner to define an exclude set, and test the length of a set intersection with taxonomy |
||
('k__Archaea' not in taxonomy) and | ||
('k__Viruses' not in taxonomy) and | ||
('k__Viroids' not in taxonomy)): | ||
genomes_without_taxonomy.append(genome_id) | ||
if taxonomy not in taxonomies: | ||
taxonomies[taxonomy] = 1 | ||
else: | ||
taxonomies[taxonomy] += 1 | ||
return ([(k, taxonomies[k]) for k in sorted(taxonomies, key=taxonomies.get, reverse=True)], | ||
genomes_without_taxonomy) | ||
|
||
|
||
def main(): | ||
"""Parse output of RepoPhlAn's repophlan_get_microbes.txt to obtain | ||
unique taxonomies and their counts. | ||
""" | ||
# Output of RepoPhlAn's repophlan_get_microbes.py | ||
repophlan_scores_fp = sys.argv[1] | ||
# Output of Jon's score average script | ||
repophlan_scores_average_fp = sys.argv[2] | ||
|
||
taxonomies, genomes_without_taxonomy = get_genome_paths( | ||
repophlan_scores_average_fp, repophlan_scores_fp) | ||
if len(genomes_without_taxonomy) != 0: | ||
print("Genomes without taxonomy: %s" % len(genomes_without_taxonomy)) | ||
|
||
for tup in taxonomies: | ||
print("%s\t%s" % (tup[0], tup[1])) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/usr/bin/env python | ||
|
||
#----------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, Evguenia Kopylova. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
#----------------------------------------------------------------------------- | ||
|
||
import sys | ||
|
||
import skbio | ||
from skbio import Sequence | ||
|
||
def main(): | ||
"""Output FASTA file using sequences in Kraken output. | ||
""" | ||
kraken_results_fp = sys.argv[1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. :( |
||
fasta_fp = sys.argv[2] | ||
output_fp = sys.argv[3] | ||
fasta_d = {} | ||
# Load all FASTA seqs into dictionary | ||
for seq in skbio.io.read(fasta_fp, format='fasta'): | ||
label = seq.metadata['id'] | ||
description = seq.metadata['description'] | ||
if label not in fasta_d: | ||
fasta_d[label] = [description, str(seq)] | ||
else: | ||
raise ValueError("Duplicate FASTA labels %s" % label) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this an error in practice...? |
||
# Output sequences to FASTA. | ||
with open(output_fp, 'w') as output_f: | ||
with open(kraken_results_fp) as kraken_results_f: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of nesting, you can open multiple files in a single with: with open(foo, 'w') as x, open(bar, 'r') as y:
... |
||
for line in kraken_results_f: | ||
label = line.strip().split('\t')[1] | ||
if label in fasta_d: | ||
output_f.write(">%s%s\n%s\n" % (label, fasta_d[label][0], | ||
fasta_d[label][1])) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/bin/python | ||
|
||
#----------------------------------------------------------------------------- | ||
# Copyright (c) 2016--, Evguenia Kopylova. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
#----------------------------------------------------------------------------- | ||
|
||
# Edit the FASTA label to Kraken format: | ||
# >sequence16|kraken:taxid|32630 Adapter sequence | ||
# CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA | ||
|
||
import sys | ||
import bz2 | ||
from skbio import Sequence | ||
import skbio | ||
|
||
from os import walk | ||
from os.path import join, splitext, basename, isfile | ||
|
||
|
||
def get_genome_paths(scores_average_fp, scores_repophlan_fp): | ||
"""Return genome ID, tax_id and .fna.bz2 path | ||
""" | ||
genomes = {} | ||
# Get quality filtered genome ID and filepath to .fna.bz2 | ||
with open(scores_average_fp) as scores_average_f: | ||
# header | ||
line = scores_average_f.readline() | ||
line = line.strip().split('\t') | ||
fna_idx = line.index('fna_lname') | ||
for line in scores_average_f: | ||
line = line.strip().split('\t') | ||
genome_id = line[0] | ||
fna_fp = line[fna_idx] | ||
if genome_id not in genomes: | ||
genomes[genome_id] = [fna_fp] | ||
else: | ||
raise ValueError("Duplicate genome IDs %s" % genome_id) | ||
# Get tax_id | ||
genomes_without_tax_id = [] | ||
with open(scores_repophlan_fp) as scores_repophlan_f: | ||
# header | ||
line = scores_repophlan_f.readline() | ||
line = line.strip().split('\t') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea... i think pandas would be so nice here |
||
tax_id_idx = line.index('taxid') | ||
for line in scores_repophlan_f: | ||
line = line.strip().split('\t') | ||
genome_id = line[0] | ||
# only want tax_ids for genomes passing quality filter | ||
if genome_id in genomes: | ||
tax_id = line[tax_id_idx] | ||
# tax_id must be an integer, if not check the field | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that sucks. has this bug been reported to the repophlan maintainers? |
||
# immediately prior (bug in repophlan parsing) | ||
if not tax_id.isdigit(): | ||
tax_id = line[tax_id_idx-1] | ||
if not tax_id.isdigit(): | ||
genomes_without_tax_id.append(genome_id) | ||
continue | ||
genomes[genome_id].append(int(tax_id)) | ||
return genomes, genomes_without_tax_id | ||
|
||
|
||
def format_labels(genomes, repophlan_scores_filtered_genomes_dp): | ||
"""Format FASTA labels in qualified genomes. | ||
""" | ||
kraken_db_str = "|kraken:taxid|" | ||
for genome_id, info in genomes.items(): | ||
genome_fp = info[0] | ||
taxid = info[1] | ||
genome_fp_name = basename(splitext(genome_fp)[0]) | ||
# check FNA file exists for genome | ||
if genome_fp_name != "": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
output_fp = join(repophlan_scores_filtered_genomes_dp, | ||
genome_fp_name) | ||
# skip files already modified (e.g. from previous run) | ||
if not isfile(output_fp): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be |
||
with open(output_fp, 'w') as output_f: | ||
for seq in skbio.io.read( | ||
genome_fp, format='fasta', compression='bz2'): | ||
id_ = seq.metadata['id'] | ||
description = seq.metadata['description'] | ||
new_id_ = "%s%s%s" % (id_, kraken_db_str, taxid) | ||
seq.metadata['id'] = new_id_ | ||
output_f.write( | ||
">%s%s%s %s\n%s\n" % (id_, kraken_db_str, | ||
taxid, description, | ||
str(seq))) | ||
|
||
|
||
def main(): | ||
"""Edit qualified genomes' labels to Kraken format. | ||
""" | ||
# .fna.bz2 genomes folder | ||
all_genomes_bz2_dp = sys.argv[1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. :( |
||
# Output of RepoPhlAn's repophlan_get_microbes.py | ||
repophlan_scores_fp = sys.argv[2] | ||
# Output of Jon's score average script | ||
# https://github.com/tanaes/script_bin/blob/master/filter_repophlan.py | ||
repophlan_scores_average_fp = sys.argv[3] | ||
# Output directory path | ||
repophlan_scores_filtered_genomes_dp = sys.argv[4] | ||
|
||
genomes, genomes_without_tax_id = get_genome_paths( | ||
repophlan_scores_average_fp, repophlan_scores_fp) | ||
if len(genomes_without_tax_id) != 0: | ||
raise ValueError( | ||
"Some genomes do not have a taxid: %s" % genomes_without_tax_id) | ||
|
||
format_labels(genomes, repophlan_scores_filtered_genomes_dp) | ||
|
||
|
||
if __name__ == '__main__': | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
newline please