forked from rotheconrad/00_in-situ_GeneCoverage
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01a_Fastq_rename_sequences.py
105 lines (89 loc) · 3.44 KB
/
01a_Fastq_rename_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
''' Renames sequences in a fastq formatted file.
Magic Blast cuts the read name at the first space character and
reports the name before the space as the query ID.
However, fastq files can be named as:
@D00468:261:HYTMHBCX2:1:1101:9119:31637 1:N:0:CAGAGAGG+ACTGCATA
@D00468:261:HYTMHBCX2:1:1101:9119:31637 2:N:0:CAGAGAGG+ACTGCATA
where the unique identifier can come after the space chacter.
Filtering for best hits and retrieving the nucleotide sequence of a magic
blast match becomes much more involved once the unique identifier is lost.
This script renames the fastq sequences with a simple name:
@Read_0000000000001
@Read_0000000000002
etc...
!!! CAUTION !!! Renames file in place (ie: overwrites the input file) !!!
-------------------------------------------
Author :: Roth Conrad
Email :: [email protected]
GitHub :: https://github.com/rotheconrad
Date Created :: February 7th, 2020
License :: GNU GPLv3
Copyright 2020 Roth Conrad
All rights reserved
-------------------------------------------
'''
import argparse, sys, subprocess
def read_fastq(fq):
#linecount = 0
while fq:
# fastq format line 1x is read name
name = fq.readline().rstrip()
# Check for end of file and break.
if not name: break
# fastq format line 2x is nucleotide sequence
seq = fq.readline().rstrip()
# fastq format line 3x is wierd blank '+'' line
blank = fq.readline().rstrip()
# fastq format line 4x is the quality score line
qal = fq.readline().rstrip()
# Return these four lines each iteration
yield (name, seq, blank, qal)
#linecount += 4
#if linecount % 4 == 0: yield (name, seq, blank, qal)
def main():
# Configure Argument Parser
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
'-i', '--fastq_input_file',
help='Please specify the fastq file to read!',
metavar='',
type=str,
required=True
)
parser.add_argument(
'-p', '--prefix',
help='Please specify the prefix to rename with!',
metavar='',
type=str,
required=True
)
args=vars(parser.parse_args())
infile = args['fastq_input_file']
# define temporary file name.
outfile = args['fastq_input_file'].split('.')[0] + '.temp'
prefix = args['prefix']
# open the file to check fastq format
with open(infile, 'r') as file:
# only need to read first line
line = file.readline()
# Verify fastq format or exit with message
if not line.startswith('@'):
print('\n\nError reading fastq formatted read names!\n\n')
sys.exit('\n\nAre you sure this is a fastq formatted file?\n\n')
# initialize read count for new read names.
readcount = 0
# Read through the input fastq file, change read names, and write new file.
with open(infile, 'r') as f, open (outfile, 'w') as o:
# Iterate through fastq format with read_fastq function
for name, seq, blank, qal in read_fastq(f):
# increment readcount and write file with numbered read names.
readcount += 1
o.write(f'@{prefix}_{readcount:013}\n{seq}\n{blank}\n{qal}\n')
# Move the temporary file to overwrite the orignal file.
_ = subprocess.run(['mv', outfile, infile])
if __name__ == "__main__":
main()