-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompileMLST.py
112 lines (83 loc) · 3.82 KB
/
compileMLST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Compile results of MLST analysis using ARIBA.
This script assumes subdirectory are named after genomes. It takes as input from stdin and
produces output files under the current working directory.
Command line:
find ./* -maxdepth 1 -type d | python compileMLST.py
cat outdir.txt | python compileMLST.py -p mlst
Note that do not use `find . -maxdepth 1 -type d` or `find ./ -maxdepth 1 -type d` for this
script because both commands return the current working directory './' in the first line.
Python 3 is preferred to run this script, though it is compatible to Python 2.
Copyright 2020 Yu Wan <[email protected]>
Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
Publication: 8 Mar 2020
"""
from __future__ import print_function
import os
import sys
from argparse import ArgumentParser
def parse_arguments():
parser = ArgumentParser(description= 'Compile ARIBA outputs of MLST analysis')
parser.add_argument('-p', '--prefix', dest = 'p', type = str, required = False, default = '',\
help = 'Prefix of output files')
return parser.parse_args()
def main():
# Define types of files to be processed
FILES = {'STs' : 'mlst_report.tsv', \
'hits' : 'mlst_report.details.tsv', \
'scores' : 'report.tsv'}
args = parse_arguments()
genomes = get_genome_names(sys.stdin)
for file_type, file_name in FILES.items():
compile_files(file_type, file_name, genomes, args.p)
return
def get_genome_names(paths):
"""
Extracts genome names from paths of directories containing ARIBA's output
"""
genomes = {}
for p in paths:
p = p.rstrip('\n') # Otherwise, every genome name has a newline character.
g = os.path.basename(p) # For example, './genome1' becomes 'genome1'.
if g == '':
print(p + ' is not parsed as it points to the current working directory.')
else:
genomes[g] = p
if len(genomes) == 0:
sys.exit('Error: No genome name is extracted from paths.')
return genomes
def compile_files(report_type, report_name, genomes, prefix):
"""
Compiles files of a given type.
"""
print('Start to compile report type ' + report_type + '.')
file_name = report_type + '.tsv'
if prefix != '':
file_name = prefix + '__' + file_name
compiled_report = open(file_name, 'w') # Output file
report_count = 0 # File counter
for g, p in genomes.items(): # g: genome ID; p: path to a directory
try:
report = open(os.path.join(p, report_name), 'r')
report_count += 1
lines = report.readlines() # Does not need to use read().splitlines() to strip off newline characters.
line_count = len(lines)
if report_count == 1: # Read the header of the first file, assuming other files have the same format
header = lines[0]
if header.startswith('#'):
header = header[1 : len(header)] # Chop away the leading hash character.
compiled_report.write('genome' + '\t' + header)
if line_count > 1: # Skip the header line for the block to be written into the output file
lines = lines[1 : line_count]
for r in lines: # Write into the output file
compiled_report.write(g + '\t' + r)
else:
print('Warning: file ' + p + 'only has one line. Hence this file is skipped.')
report.close()
except:
compiled_report.close()
sys.exit('Error: Input file ' + p + ' or output file ' + file_name + ' is not accessible.')
compiled_report.close()
return
if __name__ == '__main__':
main()