-
Notifications
You must be signed in to change notification settings - Fork 4
Partition reads to coding, noncoding and low complexity #8
Changes from all commits
7a4ae09
f046e3f
f48a1fe
b1f4a6e
2f649eb
cdb73d6
0445295
e301200
63ee111
4c0c10f
370e373
d5a6169
6ffb4fc
fda0ab2
eb75359
f69c41f
5bbf52b
bcb40cc
2693754
c928403
d1efc3e
7563b3f
4816051
c6640fa
bfafe72
380f35d
5d99f5f
c02e97a
91777ba
6a1f80a
2710f7c
86d8ace
54c5124
822f278
f4e493c
7c91683
e9c4e4f
4bb838b
4d44f42
a7c5341
160243a
7443753
fff01cd
11374d5
8f67ebb
92f301d
f7058d9
d902191
36689aa
9675c17
39d1475
ba8ef36
9ead2d4
9d4acf8
8494fe3
282f4f3
92547e7
d67038d
8e4f976
94f338e
19438f6
1129dd5
05199cc
369ee45
dc9f8d6
8365f50
a7a0f9d
cfd827d
08ed466
4e078d4
d8896e6
5bba5b1
d508350
b8b1fba
4ce7d13
7409be6
dddbb9a
708600f
a15056f
4e3a242
f335e5d
f2a6373
92408b8
c61ad1b
dbc3d1f
2c656bf
367017c
3b7366b
05392ea
a31a37f
6a9342b
9be85d2
8674b69
d32a495
b6d4643
4c73111
a4a75ea
cff80d1
1d3e3ef
67a5ec4
0137712
3870f14
492377e
e7f4c1b
0287b45
571a75f
65422c5
4c668ab
bc786fd
81fbf4a
54dbbba
196ba1d
648322f
be37660
b6b093e
729555b
dc6d2fe
cdee037
30121de
dd2a984
372cfeb
01137c3
9ce86a2
fd7ba49
045fed1
a1872a6
e8d8162
5da265c
f075200
266a582
82f9067
7e0faeb
ae2bc70
5fd75b9
92022c6
79b5294
b767a31
5b768fa
0f64aa7
2848d9c
1c537f9
0c7af66
0de6819
c572326
63d828f
fd39adf
44f6b82
28279cf
b5d24e1
7b09d27
990316a
2f9adde
36082f2
99acc1f
dc4fb65
c6975f0
cdf6441
4f22185
89d413e
e1bc110
8581537
4402c09
d3e8660
f4851c9
94f263b
e37e2b5
619c3fd
876e895
7ea9f9a
99fd41b
5757e7d
660e4b8
19fd996
beec65a
66df131
c117a5f
6019659
5f92f12
50dcd15
60f72eb
0780aa8
167729e
2d31a85
abb378e
01867b6
0e1ecce
c405b82
cbe3f4b
db0c7a2
6e291cb
a733f50
2a2c4dd
97369ab
8acacc0
d83a968
cd0d503
101d4fc
fc0f322
2093fdf
26bd46e
6e2bcbd
2a31127
28047fc
7019f67
d55d977
89bc0f7
9322973
bd40b37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -276,3 +276,4 @@ dmypy.json | |
|
||
# Pyre type checker | ||
.pyre/ | ||
*.nodegraph |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,253 @@ | ||
import math | ||
import os | ||
|
||
import click | ||
import khmer | ||
import screed | ||
from sourmash._minhash import hash_murmur | ||
from tqdm import tqdm | ||
|
||
from khtools.compare_kmer_content import kmerize | ||
from khtools.sequence_encodings import encode_peptide, VALID_PEPTIDE_MOLECULES | ||
|
||
# khmer Nodegraph features | ||
DEFAULT_N_TABLES = 4 | ||
DEFAULT_MAX_TABLESIZE = int(1e8) | ||
|
||
# Default k-mer sizes for different alphabets | ||
DEFAULT_PROTEIN_KSIZE = 7 | ||
DEFAULT_DAYHOFF_KSIZE = 11 | ||
DEFAULT_HP_KSIZE = 21 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of hardcoding these values maybe add them to a config file instead, that way it will be easier for a user to track down defaults for debugging. |
||
|
||
|
||
def per_read_false_positive_coding_rate(n_kmers_in_read, n_total_kmers=1e7, | ||
n_hash_functions=DEFAULT_N_TABLES, | ||
tablesize=DEFAULT_MAX_TABLESIZE): | ||
exponent = - n_hash_functions * n_total_kmers / tablesize | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like the variable name |
||
print(f"exponent: {exponent}") | ||
|
||
# Probability that a single k-mer is randomly in the data | ||
# per_kmer_fpr = math.pow(1 - math.exp(exponent), n_hash_functions) | ||
|
||
# Use built-in `exp1m` = exp - 1 | ||
# - (exp - 1) = 1 - exp | ||
per_kmer_fpr = math.pow(- math.expm1(exponent), n_hash_functions) | ||
print(f"per kmer false positive rate: {per_kmer_fpr}") | ||
|
||
# Probability that the number of k-mers in the read are all false positives | ||
per_read_fpr = math.pow(per_kmer_fpr, n_kmers_in_read) | ||
return per_read_fpr | ||
|
||
|
||
def load_nodegraph(*args, **kwargs): | ||
try: | ||
# khmer 2.1.1 | ||
return khmer.load_nodegraph(*args, **kwargs) | ||
except AttributeError: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if this error is encountered it would be a good thing to log to user as warning. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is what @luizirber told me to do to load the Nodegraph in a way that's compatible with both versions of khmer, so I'm not sure exactly what would be warned here. Here's how they do it in sourmash: https://github.com/dib-lab/sourmash/blob/master/sourmash/sbt.py#L62 |
||
# khmer 3+/master branch | ||
return khmer.Nodegraph.load(*args, **kwargs) | ||
|
||
|
||
# Cribbed from https://click.palletsprojects.com/en/7.x/parameters/ | ||
class BasedIntParamType(click.ParamType): | ||
name = "integer" | ||
|
||
def convert(self, value, param, ctx): | ||
try: | ||
if isinstance(value, int): | ||
return value | ||
if 'e' in value: | ||
sigfig, exponent = value.split('e') | ||
sigfig = float(sigfig) | ||
exponent = int(exponent) | ||
return int(sigfig * 10 ** exponent) | ||
return int(value, 10) | ||
except TypeError: | ||
self.fail( | ||
"expected string for int() conversion, got " | ||
f"{value!r} of type {type(value).__name__}", | ||
param, | ||
ctx, | ||
) | ||
except ValueError: | ||
self.fail(f"{value!r} is not a valid integer", param, ctx) | ||
|
||
|
||
BASED_INT = BasedIntParamType() | ||
|
||
|
||
def make_peptide_bloom_filter(peptide_fasta, | ||
peptide_ksize, | ||
molecule, | ||
n_tables=DEFAULT_N_TABLES, | ||
tablesize=DEFAULT_MAX_TABLESIZE): | ||
"""Create a bloom filter out of peptide sequences""" | ||
peptide_bloom_filter = khmer.Nodegraph(peptide_ksize, | ||
tablesize, | ||
n_tables=n_tables) | ||
|
||
with screed.open(peptide_fasta) as records: | ||
for record in tqdm(records): | ||
olgabot marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if '*' in record['sequence']: | ||
continue | ||
sequence = encode_peptide(record['sequence'], molecule) | ||
try: | ||
kmers = kmerize(sequence, peptide_ksize) | ||
for kmer in kmers: | ||
# Convert the k-mer into an integer | ||
hashed = hash_murmur(kmer) | ||
|
||
# .add can take the hashed integer so we can hash the | ||
# peptide kmer and add it directly | ||
peptide_bloom_filter.add(hashed) | ||
except ValueError: | ||
# Sequence length is smaller than k-mer size | ||
continue | ||
return peptide_bloom_filter | ||
|
||
|
||
def make_peptide_set(peptide_fasta, peptide_ksize, molecule): | ||
"""Create a python set out of peptide sequence k-mers | ||
|
||
For comparing to the bloom filter in storage and performance | ||
""" | ||
peptide_set = set([]) | ||
|
||
with screed.open(peptide_fasta) as records: | ||
for record in tqdm(records): | ||
if '*' in record['sequence']: | ||
continue | ||
sequence = encode_peptide(record['sequence'], molecule) | ||
try: | ||
kmers = kmerize(sequence, peptide_ksize) | ||
peptide_set.update(kmers) | ||
except ValueError: | ||
# Sequence length is smaller than k-mer size | ||
continue | ||
return peptide_set | ||
|
||
|
||
def maybe_make_peptide_bloom_filter(peptides, peptide_ksize, molecule, | ||
peptides_are_bloom_filter, | ||
n_tables=DEFAULT_N_TABLES, | ||
tablesize=DEFAULT_MAX_TABLESIZE): | ||
if peptides_are_bloom_filter: | ||
click.echo( | ||
f"Loading existing bloom filter from {peptides} and " | ||
f"making sure the ksizes match", | ||
err=True) | ||
peptide_bloom_filter = load_nodegraph(peptides) | ||
if peptide_ksize is not None: | ||
try: | ||
assert peptide_ksize == peptide_bloom_filter.ksize() | ||
except AssertionError: | ||
raise ValueError(f"Given peptide ksize ({peptide_ksize}) and " | ||
f"ksize found in bloom filter " | ||
f"({peptide_bloom_filter.ksize()}) are not" | ||
f"equal") | ||
else: | ||
peptide_ksize = get_peptide_ksize(molecule, peptide_ksize) | ||
click.echo( | ||
f"Creating peptide bloom filter with file: {peptides}\n" | ||
f"Using ksize: {peptide_ksize} and molecule: {molecule} " | ||
f"...", | ||
err=True) | ||
peptide_bloom_filter = make_peptide_bloom_filter( | ||
peptides, peptide_ksize, molecule=molecule, | ||
n_tables=n_tables, tablesize=tablesize) | ||
return peptide_bloom_filter | ||
|
||
|
||
def maybe_save_peptide_bloom_filter(peptides, peptide_bloom_filter, molecule, | ||
save_peptide_bloom_filter): | ||
if save_peptide_bloom_filter: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is the else condition? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no else, it's just a small function to abstract away the saving of the bloom filter so that the body of the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm confused why you would need this if statement. I think it would be more fluid if you created a bloom filter class and had a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Making the bloom filter takes ~30s which is small compared to the filtering step (minutes-hours) so in case the user doesn't want to save the bloom filter, this |
||
ksize = peptide_bloom_filter.ksize() | ||
|
||
if isinstance(save_peptide_bloom_filter, str): | ||
filename = save_peptide_bloom_filter | ||
peptide_bloom_filter.save(save_peptide_bloom_filter) | ||
else: | ||
suffix = f'.molecule-{molecule}_ksize-{ksize}.bloomfilter.' \ | ||
f'nodegraph' | ||
filename = os.path.splitext(peptides)[0] + suffix | ||
|
||
click.echo(f"Writing peptide bloom filter to {filename}", err=True) | ||
peptide_bloom_filter.save(filename) | ||
click.echo("\tDone!", err=True) | ||
|
||
|
||
@click.command() | ||
@click.argument('peptides') | ||
@click.option('--peptide-ksize', | ||
default=None, type=int, | ||
help="K-mer size of the peptide sequence to use. Defaults for" | ||
" different molecules are, " | ||
f"protein: {DEFAULT_PROTEIN_KSIZE}" | ||
f", dayhoff: {DEFAULT_DAYHOFF_KSIZE}," | ||
f" hydrophobic-polar: {DEFAULT_HP_KSIZE}") | ||
@click.option('--molecule', | ||
default='protein', | ||
help="The type of amino acid encoding to use. Default is " | ||
"'protein', but 'dayhoff' or 'hydrophobic-polar' can be " | ||
"used") | ||
@click.option('--save-as', | ||
default=None, | ||
help='If provided, save peptide bloom filter as this filename. ' | ||
'Otherwise, add ksize and molecule name to input filename.') | ||
@click.option('--tablesize', type=BASED_INT, | ||
default="1e8", | ||
help='Size of the bloom filter table to use') | ||
@click.option('--n-tables', type=int, | ||
default=DEFAULT_N_TABLES, | ||
help='Size of the bloom filter table to use') | ||
def cli(peptides, peptide_ksize=None, molecule='protein', save_as=None, | ||
tablesize=DEFAULT_MAX_TABLESIZE, n_tables=DEFAULT_N_TABLES): | ||
"""Make a peptide bloom filter for your peptides | ||
|
||
\b | ||
Parameters | ||
---------- | ||
reads : str | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is no reads There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that's right. Thanks for catching! |
||
Sequence file of reads to filter | ||
peptides : str | ||
Sequence file of peptides | ||
peptide_ksize : int | ||
Number of characters in amino acid words | ||
long_reads | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. molecule save as are missing args and remove long_reads. I think for this function here it seems like redundant to have docstring since its help strings are right above, but better have 2 docs than none! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for catching that! |
||
verbose | ||
|
||
\b | ||
Returns | ||
------- | ||
|
||
""" | ||
# \b above prevents rewrapping of paragraph | ||
peptide_ksize = get_peptide_ksize(molecule, peptide_ksize) | ||
peptide_bloom_filter = make_peptide_bloom_filter(peptides, peptide_ksize, | ||
molecule, | ||
n_tables=n_tables, | ||
tablesize=tablesize) | ||
click.echo("\tDone!", err=True) | ||
|
||
save_peptide_bloom_filter = save_as if save_as is not None else True | ||
maybe_save_peptide_bloom_filter( | ||
peptides, | ||
peptide_bloom_filter, | ||
molecule, | ||
save_peptide_bloom_filter=save_peptide_bloom_filter) | ||
|
||
|
||
def get_peptide_ksize(molecule, peptide_ksize): | ||
if molecule not in VALID_PEPTIDE_MOLECULES: | ||
raise ValueError(f"{molecule} is not a valid protein encoding! " | ||
f"Only one of 'protein', 'hydrophobic-polar', or" | ||
f" 'dayhoff' can be specified") | ||
|
||
if peptide_ksize is None: | ||
if molecule == 'protein': | ||
peptide_ksize = DEFAULT_PROTEIN_KSIZE | ||
elif molecule == 'dayhoff': | ||
peptide_ksize = DEFAULT_DAYHOFF_KSIZE | ||
elif molecule == 'hydrophobic-polar' or molecule == 'hp': | ||
peptide_ksize = DEFAULT_HP_KSIZE | ||
return peptide_ksize |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why not
khtools extract_coding peptides.fa.gz *.fastq.gz > coding_peptides.fasta
does it save the coding nucleotides to fasta by default. if so can you mention in the comments?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah yes good point. By default, only the coding peptides are output to stdout and the user needs to ask for everything else. Updated the README, let me know if it's still unclear.