-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdna.py
63 lines (52 loc) · 1.68 KB
/
dna.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python
import argparse
import mmap
# tqdm is an optional third-party dependency.  ``_tqdm`` records whether the
# import succeeded so the rest of the script can choose between a progress
# bar and plain iteration; a message reports the outcome either way.
try:
    from tqdm import tqdm
except ImportError:
    _tqdm = False
    print("The tqdm progress bar software is not available in this version of Python.")
else:
    _tqdm = True
    print("The tqdm progress bar software is available in this version of Python.")
def get_num_lines(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is memory-mapped read-only and scanned with ``readline``, so a
    final line without a trailing newline still counts as a line.

    Args:
        file_path: Path of the file to count.

    Returns:
        The line count as an ``int``; ``0`` for an empty file.

    Raises:
        OSError: If the file cannot be opened or mapped.
    """
    # Binary read-only access is enough for counting and also works on
    # inputs the user is not allowed to write to.
    with open(file_path, "rb") as fp:
        # mmap refuses to map a zero-length file, so handle it up front.
        fp.seek(0, 2)  # 2 == os.SEEK_END
        if fp.tell() == 0:
            return 0
        with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as buf:
            # readline() returns b"" once the end of the map is reached.
            return sum(1 for _ in iter(buf.readline, b""))
# Rank-name placeholders removed from headers.  Order matters: each entry is
# a suffix of the one before it, so the longest must be removed first.
_RANK_LABELS = (
    "phylum_class_order_family_genus_",
    "class_order_family_genus_",
    "order_family_genus_",
    "family_genus_",
    "genus_",
)

# Characters allowed on a sequence line.
_BASES = frozenset("ACGT")


def _clean_header(line):
    """Return the rewritten form of a ``>`` header line.

    The header is cut down to the text starting at ``"Eukaryota"``, its
    newline is replaced by ``";"`` and the rank-name placeholders are removed.
    """
    start = line.find("Eukaryota")
    # find() returns -1 when the marker is absent; slicing with -1 would keep
    # only the last character, so fall back to the header without its ">".
    text = line[start:] if start != -1 else line[1:]
    text = text.replace("\n", ";")
    for label in _RANK_LABELS:
        text = text.replace(label, "")
    return ">" + text


def _reformat_records(lines):
    """Yield output records for an iterable of input lines.

    Each header line yields one rewritten header.  Each run of consecutive
    sequence lines (non-empty, containing only A/C/G/T) yields one record
    with the lines joined together.  Any other line is dropped and ends the
    current run.
    """
    chunks = []
    for line in lines:
        stripped = line.strip()
        # Subset test rather than equality: a short or low-complexity line
        # such as "AC" or "AAAA" is still part of the sequence.
        if stripped and _BASES.issuperset(stripped):
            chunks.append(stripped)
            continue
        if chunks:
            # Join once per run instead of re-copying the string per line.
            yield "".join(chunks)
            chunks = []
        if line.startswith(">"):
            yield _clean_header(line)
    if chunks:
        yield "".join(chunks)


def main(argv=None):
    """Rewrite the file given by ``--file`` into ``<file>-edited.txt``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--file", required=True, type=str)
    args = parser.parse_args(argv)
    in_path = args.file
    print("file is", in_path)
    with open(in_path) as src, open(in_path + "-edited.txt", "w") as dst:
        # Show a progress bar only when tqdm could be imported.
        if _tqdm:
            lines = tqdm(src, total=get_num_lines(in_path))
        else:
            lines = src
        for record in _reformat_records(lines):
            dst.write(record + "\n")


if __name__ == "__main__":
    main()