get_fastq_sum.py
"""
Merge key statistics from pacbio isoseq fastq transcript headers
"""
__author__ = "Sarah Hazell Pickering"
__date__ = "2018-09-27"
import os
import numpy
import pandas
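# The parsing below assumes Iso-Seq cluster output headers of the form
# (illustrative example, not taken from a real run):
#   >transcript/0 full_length_coverage=2;length=1063;num_subreads=10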

def _getNum(attributes):
    """Extract the integer after '=' from each 'key=value' attribute."""
    nums = []
    for a in attributes:
        nums.append(int(a.split(sep="=")[1]))
    return nums

def _getAverage(df):
    """Return a one-row Series of rounded column means, labelled 'averages'."""
    averages = df.mean(axis=0, numeric_only=True).round()
    formatted = pandas.Series(["averages"] + averages.to_list(), index=df.columns)
    return formatted
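# For the summary DataFrame built below this yields something like
# ("averages", 3.0, 1063.0, 12.0), indexed by the four column names
# (values here are made up for illustration).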

def getIsoseqSummary(fasta, outpath="test.txt"):
    """Tabulate per-transcript stats from Iso-Seq headers and append averages."""
    columns = ["transcript", "full_length_coverage", "length", "num_subreads"]
    collate = []
    with open(fasta, 'r') as f:
        for line in f:
            if line.startswith(">"):
                trans_id, other = line.split(" ", maxsplit=1)
                trans_id = trans_id[1:]  # rm ">"
                fl_coverage, length, num_subreads = _getNum(other.split(sep=";"))
                collate.append([trans_id, fl_coverage, length, num_subreads])
    result = pandas.DataFrame(collate, columns=columns)
    averages = _getAverage(result)
    # DataFrame.append was removed in pandas 2.0; concat is the supported way
    # to attach the averages row.
    result = pandas.concat([result, averages.to_frame().T], ignore_index=True)
    result = result.round()
    result.to_csv(outpath, sep="\t", index=False)
    return averages
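# extractAverages below assumes a summary file in the layout written by
# getIsoseqSummary, roughly (values illustrative):
#   transcript    full_length_coverage    length    num_subreads
#   transcript/0  2                       1063      10
#   ...
#   averages      3.0                     1063.0    12.0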

def extractAverages(summary="hisat_summary.tab"):
    """Pull the averages row (written by getIsoseqSummary) from a summary file."""
    averages = pandas.read_table(summary).iloc[-1, 1:]  # last row, minus the row label
    return averages
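# countFullLength below assumes a cupcake *.collapsed.group.txt file with
# tab-separated lines of the form (illustrative):
#   PB.1.1<TAB>transcript/0,transcript/3
# and an isoseq summary file as written by getIsoseqSummary above.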

def countFullLength(isoseq, cupcake, outpath):  # outpath is currently unused
    """Sum full-length read counts per collapsed cupcake cluster."""
    # convert cupcake summary to cluster:[transcripts] dict
    with open(cupcake) as fin:
        rows = (line.split('\t') for line in fin)
        c = {row[0]: row[1].strip().split(',') for row in rows}
    # convert isoseq summary to transcript:full_length dict
    with open(isoseq) as fin:
        rows = (line.split('\t') for line in fin)
        i = {row[0]: row[1].strip() for row in rows}
    collate = {}
    for cluster in c:
        count = 0
        for trans in c[cluster]:
            # counts may be written as floats (e.g. "2.0"), so drop the decimals
            count += int(i[trans].split('.')[0])
        collate[cluster] = count
    result = pandas.Series(collate)
    print(result)
    return result
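# addCoverageToGFF below assumes a cupcake abundance table whose first two
# columns are the cluster id and its full-length count, plus a GTF whose 9th
# column carries attributes like: transcript_id "PB.1.1"; (illustrative).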

def addCoverageToGFF(summary, gtf, output_file):
    """Append a full_length_coverage attribute to every GTF record."""
    # convert cupcake summary to cluster:full_length_coverage dict
    with open(summary) as coverage:
        rows = (line.split('\t') for line in coverage if not line.startswith('#'))
        cov = {row[0]: row[1].strip() for row in rows}
    collate = []
    with open(gtf) as gff:
        for line in gff:
            cols = line.rstrip().split('\t')
            attrs = cols[8].split(";")
            for at in attrs:
                at = at.strip()
                if at.startswith("transcript_id"):
                    trans_id = at.split()[-1].strip('"')
                    cols[8] = cols[8] + ' full_length_coverage "' + cov[trans_id] + '"'
            collate.append('\t'.join(cols) + '\n')
    with open(output_file, 'w') as out:
        out.writelines(collate)
    return collate[1:6]  # a few sample lines for eyeballing

if __name__ == '__main__':
    data = "new_transcripts/ASC_D07G_P7_D1_hq.fasta"
    getIsoseqSummary(data)
    # extractAverages(summary="test.txt")
    # countFullLength("test.txt", "cupcake/P7_D1/P7_D1_250bp5.collapsed.group.txt", "")
    PRE = "cupcake/ASC_D07G_P7_D1/P7_D1.collapsed"
    print(addCoverageToGFF(PRE + ".abundance.txt", PRE + ".gff", PRE + ".f.gff"))