-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkt_filter.py
104 lines (88 loc) · 4.64 KB
/
kt_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import sys
import csv
import argparse
# import ktoolu_io as KTIO
from ktoolu_taxonomy import ktTaxonomyTree as ktree
def filterSequences(db, _in_class_stream, keepTaxIDs, allowUnclassified=False, logfile=None, class_fmt="kraken"):
if logfile is not None:
[logfile.write('Filtering sequences...\n'), logfile.flush()]
assert keepTaxIDs or allowUnclassified
nseqs = 0
keepSequences = set()
# need to keep track of all dropped sequences in case we want unclassified but used the --only-classified-output switch
# when running kraken (unclassified sequences are unmarked and need to be distinguished from unwanted sequences.)
dropSequences = set()
n_unclassified_keep, n_classified_keep, n_unclassified_discard, n_classified_discard = 0, 0, 0, 0
with _in_class_stream as fi:
if class_fmt == "tabular":
for nseqs, line in enumerate(fi, start=1):
line = line.strip().split()
takeClassified = line[1].isdigit() and int(line[1]) in keepTaxIDs
takeUnclassified = allowUnclassified and line[1] == "N/A"
if takeUnclassified or takeClassified:
keepSequences.add(line[0].strip())
elif allowUnclassified:
dropSequences.add(line[1].strip())
elif class_fmt == "kraken":
for line in fi:
nseqs += 1
line = line.strip().split()
takeClassified = line[0] == 'C' and int(line[2]) in keepTaxIDs
takeUnclassified = allowUnclassified and line[0] == 'U'
# print(line, takeClassified, takeUnclassified)
if takeUnclassified or takeClassified:
keepSequences.add(line[1].strip())
elif allowUnclassified:
# we want unclassified, but current line was classified and rejected
# myself 2.something years later: there is probably a reason for doing it this way...
dropSequences.add(line[1].strip())
elif class_fmt == "centrifuge":
next(fi)
for nseqs, row in enumerate(csv.reader(fi, delimiter='\t'), start=1):
keep = False
isUnclassified = row[1] == 'unclassified'
if isUnclassified:
if allowUnclassified:
keep = True
n_unclassified_keep += 1
else:
n_unclassified_discard += 1
else:
if int(row[2]) in keepTaxIDs:
keep = True
n_classified_keep += 1
else:
n_classified_discard += 1
if keep:
keepSequences.add(row[0])
elif allowUnclassified:
dropSequences.add(row[0])
"""
takeClassified = row[1] != 'unclassified' and int(row[2]) in keepTaxIDs
takeUnclassified = allowUnclassified and row[1] == 'unclassified'
if takeUnclassified or takeClassified:
keepSequences.add(row[0])
elif allowUnclassified:
dropSequences.add(row[0])
"""
if logfile is not None:
logfile.write('Keeping %i of %i sequences (%.1f).\n' % (len(keepSequences), nseqs, float(len(keepSequences))/nseqs))
logfile.write('Dropping %i of %i sequences (%.1f).\n' % (len(dropSequences), nseqs, float(len(dropSequences))/nseqs))
logfile.write('C:keep={} ({}), C:discard={} ({}), U:keep={} ({}), U:discard={} ({})\n'.format(n_classified_keep, n_classified_keep/nseqs, n_classified_discard, n_classified_discard/nseqs, n_unclassified_keep, n_unclassified_keep/nseqs, n_unclassified_discard, n_unclassified_discard/nseqs))
logfile.flush()
return keepSequences, dropSequences
if __name__ == "__main__":
from ktoolu_taxonomy import compileValidTaxIDs, ktTaxonomyTree
ap = argparse.ArgumentParser()
ap.add_argument("input")
ap.add_argument("taxid", type=int, default=1)
ap.add_argument("in_format", choices=("kraken", "centrifuge", "tabular"), default="kraken")
ap.add_argument("taxdb")
ap.add_argument("--allow-unclassified", action="store_true")
args = ap.parse_args()
keepTaxIDs = compileValidTaxIDs(args.taxdb, wantedTaxIDs=[args.taxid])
with open(args.input) as _in_class_stream:
keep, drop = filterSequences(args.taxdb, _in_class_stream, keepTaxIDs, class_fmt=args.in_format, allowUnclassified=args.allow_unclassified)
for seq in keep:
print(seq)