-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalc_gc_percent.py
57 lines (44 loc) · 1.99 KB
/
calc_gc_percent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
Simple script that outputs the name, length, GC count, and GC percent of a sequence in a fasta format.
Example:
NODE_360_length_156_cov_88.910255 2,019,315 781,621 38.71%
NODE_361_length_33_cov_92.575760 2,019,380 781,644 38.71%
NODE_362_length_14453_cov_13.237667 2,033,865 787,505 38.72%
NODE_363_length_47_cov_445.914886 2,033,944 787,549 38.72%
"""
import argparse
def get_options():
parser = argparse.ArgumentParser()
parser.add_argument('-fasta', required=True, help='Sequences in Fasta format')
return parser.parse_args()
def get_counts(fasta_input):
gc_count = 0
genome_length = 0
with open(fasta_input, 'r') as myfile:
title = None
for line in myfile:
if line.startswith('>'):
line = line.rstrip()
# save the old record
if title is not None:
contig = {'title': title,
'gc_count': '{:,}'.format(gc_count),
'length': '{:,}'.format(genome_length),
'percent': "{0:.2f}".format(float(gc_count) / genome_length * 100)}
yield contig
# start a new record
title = line[1:]
continue
line = line.rstrip().upper()
gc_count += line.count('G') + line.count('C')
genome_length += len(line)
def main():
options = get_options()
fasta_input = options.fasta
for count, record in enumerate(get_counts(fasta_input)):
print('{title}\t{length}\t{gc_count}\t{percent}%'.format(title=record['title'],
length=record['length'],
gc_count=record['gc_count'],
percent=record['percent']))
if __name__ == '__main__':
main()