-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathdfast
executable file
·313 lines (240 loc) · 14.4 KB
/
dfast
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#! /usr/bin/env python
# coding: UTF8
import os
import sys
import argparse
import tempfile
from logging import getLogger, DEBUG, INFO, StreamHandler, Formatter
from dfc import dfast_version
from dfc.pipeline import Pipeline
from dfc.utils.config_util import load_config, show_config, set_references, set_database, set_aligner, \
disable_cdd_search, disable_hmm_scan, enable_prodigal, enable_trnascan, disable_rrna_prediction, \
disable_trna_prediction, disable_cds_prediction, disable_crispr_prediction, set_genetic_code, set_gff, \
set_threshold, set_values_from_metadata, enable_mga, enable_genemarks2, enable_rnammer, \
enable_metagenome_mode, disable_func_annotation, enable_amr
from dfc.utils.path_util import set_binaries_path
from dfc.utils.fix_origin import fix_origin
def parse_bool(arg):
if arg in ["t", "true", "TRUE", "True"]:
return True
elif arg in ["f", "false", "FALSE", "False"]:
return False
else:
return None
parser = argparse.ArgumentParser(description='DFAST: DDBJ Fast Annotation and Submission \tTool\n' +
"version {}. ".format(dfast_version),
usage="dfast -g your_genome.fna [options]", epilog=None, # "-- prolog",
# formatter_class=argparse.ArgumentDefaultsHelpFormatter,
add_help=False # , allow_abbrev=False
# argument_default=argparse.SUPPRESS
)
group_basic = parser.add_argument_group("Basic options")
group_basic.add_argument("-g", "--genome", help="Genomic FASTA file for input. Can be gzipped.", metavar="PATH")
group_basic.add_argument("-o", "--out", help="Output directory (default:OUT)", type=str, metavar="PATH")
group_basic.add_argument("-c", "--config", help="Configuration file (default config will be used if not specified)", type=str, metavar="PATH")
group_basic.add_argument("--organism", help="Organism name", metavar="STR", default="")
group_basic.add_argument("--strain", help="Strain name", metavar="STR", default="")
group_genome = parser.add_argument_group('Genome settings')
group_genome.add_argument("--complete", choices=["t", "true", "f", "false"], metavar="BOOL",
help="Treat the query as a complete genome. Not required unless you need INSDC submission files. [t|f(=default)]")
group_genome.add_argument("--use_original_name", choices=["t", "true", "f", "false"],
help="Use original sequence names in a query FASTA file [t|f(=default)]", metavar="BOOL")
group_genome.add_argument("--sort_sequence", choices=["t", "true", "f", "false"],
help="Sort sequences by length [t(=default)|f]", metavar="BOOL")
group_genome.add_argument("--minimum_length", help="Minimum sequence length (default:200)", type=int, metavar="INT")
group_genome.add_argument("--fix_origin", help="Rotate/flip the chromosome so that the dnaA gene comes first. (ONLY FOR A FINISHED GENOME)", action="store_true")
group_genome.add_argument("--offset", help="Offset from the start codon of the dnaA gene. (for --fix_origin option, default=0)", default=0, type=int, metavar="INT")
group_lt = parser.add_argument_group('Locus_tag settings')
group_lt.add_argument("--locus_tag_prefix", help="Locus tag prefix (defaut:LOCUS)", metavar="STR")
group_lt.add_argument("--step", help="Increment step of locus tag (default:10)", metavar="INT", type=int)
group_lt.add_argument("--use_separate_tags", choices=["t", "true", "f", "false"],
help="Use separate tags according to feature types [t(=default)|f]", default=None, metavar="BOOL")
group_workflow = parser.add_argument_group('Workflow options')
group_workflow.add_argument("--threshold", help='Thresholds for default database search (format: "pident,q_cov,s_cov,e_value", default: "0,75,75,1e-6")', metavar="STR")
group_workflow.add_argument("--database", help="Additional reference database to be searched against prior to the default database. (format: db_path[,db_name[,pident,q_cov,s_cov,e_value]])", metavar="PATH")
group_workflow.add_argument("--references", help="Reference file(s) for OrthoSearch. Use semicolons for multiple files, e.g. 'genome1.faa;genome2.gbk'", metavar="PATH")
group_workflow.add_argument("--aligner", help="Aligner to use [ghostx(=default)|blastp|diamond]", choices=["ghostx", "blastp", "diamond"], metavar="STR")
group_genecall = group_workflow.add_mutually_exclusive_group()
group_genecall.add_argument("--use_prodigal", help="Use Prodigal to predict CDS instead of MGA", action="store_true")
group_genecall.add_argument("--use_genemarks2", help="Use GeneMarkS2 to predict CDS instead of MGA. [auto|bact|arch]", choices=["auto", "bact", "arch", "bacteria", "archaea"], metavar="STR")
# group_genecall.add_argument("--use_mga", help="Use MetaGeneAnnotator to predict CDS instead of Prodigal", action="store_true")
group_workflow.add_argument("--use_trnascan", help="Use tRNAscan-SE to predict tRNA instead of Aragorn. [bact|arch]", choices=["bac", "bact", "arc", "arch"], metavar="STR")
group_workflow.add_argument("--use_rnammer", help="Use RNAmmer to predict rRNA instead of Barrnap. [bact|arch]", choices=["bact", "arch"], metavar="STR")
group_workflow.add_argument("--gcode", help="Genetic code [11(=default),4(=Mycoplasma)]", metavar="INT", type=int, default=11)
group_workflow.add_argument("--no_func_anno", help="Disable all functional annotation steps", action="store_true")
group_workflow.add_argument("--no_hmm", help="Disable HMMscan", action="store_true")
group_workflow.add_argument("--no_cdd", help="Disable CDDsearch", action="store_true")
group_workflow.add_argument("--no_cds", help="Disable CDS prediction", action="store_true")
group_workflow.add_argument("--no_rrna", help="Disable rRNA prediction", action="store_true")
group_workflow.add_argument("--no_trna", help="Disable tRNA prediction", action="store_true")
group_workflow.add_argument("--no_crispr", help="Disable CRISPR prediction", action="store_true")
# Preliminary implementation for GFF parsing
group_workflow.add_argument("--metagenome", help="Set options of MGA/Prodigal for metagenome contigs", action="store_true")
group_workflow.add_argument("--amr", help="[Preliminary implementation] Enable AMR/VFG annotation and identification of plasmid-derived contigs", action="store_true")
group_workflow.add_argument("--gff", help="[Preliminary implementation] Read GFF to import structural annotation. Ignores --use_original_name, --sort_sequence, --fix_origin.")
group_source = parser.add_argument_group('Genome source modifiers and metadata [advanced]',
description='These values are only used to create INSDC submission files and do not affect the annotation result. See documents for more detail.')
group_source.add_argument("--seq_names", help="Sequence names for each sequence (for complete genome)", metavar="STR")
group_source.add_argument("--seq_types", help="Sequence types for each sequence (chromosome/plasmid, for complete genome)", metavar="STR")
group_source.add_argument("--seq_topologies", help="Sequence topologies for each sequence (linear/circular, for complete genome)", metavar="STR")
group_source.add_argument("--additional_modifiers", help="Additional modifiers for source features", metavar="STR")
group_source.add_argument("--metadata_file", help="Path to a metadata file (optional for DDBJ submission file)", metavar="PATH")
# group_source.add_argument("--center_name", help="Genome center name (optional for GenBank submission file)", metavar="STR") # deprecated
group_run = parser.add_argument_group('Run options')
group_run.add_argument("--cpu", help="Number of CPUs to use", type=int, metavar="INT")
group_run.add_argument("--use_locustag_as_gene_id", help="Use locustag as gene ID for FASTA and GFF. (Useful when providing DFAST results to other tools such as Roary)", action="store_true")
group_run.add_argument("--dbroot", help="DB root directory (default:APP_ROOT/db", type=str, metavar="PATH")
group_run.add_argument("--force", help="Force overwriting output", action="store_true")
group_run.add_argument("--debug", help="Run in debug mode (Extra logging and retaining temporary files)", action="store_true")
group_run.add_argument("--show_config", help="Show pipeline configuration and exit", action="store_true")
group_run.add_argument('--version', version='DFAST ver. {version}'.format(prog="DFAST", version=dfast_version),
action='version', help="Show program version", default=False)
group_run.add_argument("-h", "--help", help="Show this help message", action="store_true")
args = parser.parse_args()
if len(sys.argv) == 1:
parser.print_help()
exit()
logger = getLogger()
handler = StreamHandler(sys.stdout)
handler.setLevel(DEBUG)
handler.setFormatter(Formatter("%(asctime)s %(message)s", datefmt='%Y/%m/%d %H:%M:%S'))
logger.addHandler(handler)
logger.setLevel(DEBUG)
if args.help:
parser.print_help()
exit()
app_root = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
if app_root == "":
app_root = "."
# loading config file
config_file = args.config or os.path.join(app_root, "dfc/default_config.py")
config = load_config(app_root, config_file, args.dbroot)
if args.debug: # --debug
config.DEBUG = True
handler.setFormatter(Formatter("%(asctime)s %(levelname)8s %(message)s [%(module)s %(lineno)d]", datefmt='%Y/%m/%d %H:%M:%S'))
logger.setLevel(DEBUG)
logger.warning("Debug mode enabled. Logger level is set to DEBUG and temporary files will be kept.")
else:
config.DEBUG = False
logger.setLevel(INFO)
if args.cpu is not None: # --cpu
config.CPU = args.cpu
if args.force: # --force
config.FORCE_OVERWRITE = True
# override config by command options
if args.genome: # --genome
config.GENOME_FASTA = args.genome
if args.out: # --out
config.WORK_DIR = args.out
if args.metadata_file: # --metadata_file
config.DDBJ_SUBMISSION["metadata_file"] = args.metadata_file
set_values_from_metadata(config)
if args.organism: # --organism
config.GENOME_SOURCE_INFORMATION["organism"] = args.organism
if args.strain: # --strain
config.GENOME_SOURCE_INFORMATION["strain"] = args.strain
if args.gff:
set_gff(config, args.gff)
logger.warning("[Preliminary implementation!] GFF parser enabled. Will import features from GFF file. " + \
"fix_origin is set to disabled, use_original_name is set to enabled, minimum_length is set to 0.")
args.fix_origin = False
args.use_original_name = "t"
# args.sort_sequence = "f"
args.minimum_length = 0
complete = parse_bool(args.complete) # --complete
if complete is not None:
config.GENOME_CONFIG["complete"] = complete
use_original_name = parse_bool(args.use_original_name) # --use_original_name
if use_original_name is not None:
config.GENOME_CONFIG["use_original_name"] = use_original_name
sort_sequence = parse_bool(args.sort_sequence) # --sort_sequence
if sort_sequence is not None:
config.GENOME_CONFIG["sort_sequence"] = sort_sequence
if args.minimum_length: # --minimum_length
config.GENOME_CONFIG["minimum_length"] = args.minimum_length
if args.locus_tag_prefix: # --locus_tag_prefix
config.LOCUS_TAG_SETTINGS["locus_tag_prefix"] = args.locus_tag_prefix
if args.step: # --step
config.LOCUS_TAG_SETTINGS["step"] = args.step
use_separate_tags = parse_bool(args.use_separate_tags) # --use_separate_tags
if use_separate_tags is not None:
config.LOCUS_TAG_SETTINGS["use_separate_tags"] = use_separate_tags
if args.references: # --references
set_references(config, args.references)
if args.database: # --databse
set_database(config, args.database)
if args.aligner: # --aligner
set_aligner(config, args.aligner)
if args.threshold:
set_threshold(config, args.threshold)
if args.no_crispr: # --no_crispr
disable_crispr_prediction(config)
if args.gcode != 11:
# As of ver 1.2.11, gcode 4 become available when using MGA
# if not (args.use_prodigal or args.use_genemarks2):
# logger.error("'--gcode 4' cannot be specified when using MGA. Please set '--use_prodigal' or '--use_genemarks2'")
# exit(1)
set_genetic_code(config, args.gcode)
if args.use_prodigal: # --use_prodigal
enable_prodigal(config)
if args.use_genemarks2: # --use_genemarks2
enable_genemarks2(config, args.use_genemarks2)
# if args.use_mga:
# enable_mga(config)
if args.use_trnascan: # --use_trnascan
enable_trnascan(config, args.use_trnascan)
if args.use_rnammer: # --use_trnascan
enable_rnammer(config, args.use_rnammer)
if args.no_func_anno:
disable_func_annotation(config)
if args.metagenome:
enable_metagenome_mode(config)
if args.amr:
enable_amr(config)
if args.no_hmm: # --no_hmm
disable_hmm_scan(config)
if args.no_cdd: # --no_cdd
disable_cdd_search(config)
if args.no_cds: # --no_cds
disable_cds_prediction(config)
if args.no_rrna: # --no_rrna
disable_rrna_prediction(config)
if args.no_trna: # --no_trna
disable_trna_prediction(config)
if args.seq_names: # --seq_names
config.GENOME_SOURCE_INFORMATION["seq_names"] = args.seq_names
if args.seq_types: # --seq_types
config.GENOME_SOURCE_INFORMATION["seq_types"] = args.seq_types
if args.seq_topologies: # --seq_topologies
config.GENOME_SOURCE_INFORMATION["seq_topologies"] = args.seq_topologies
if args.additional_modifiers: # --additional_modifiers
config.GENOME_SOURCE_INFORMATION["additional_modifiers"] = args.additional_modifiers
# deprecated as of 2024.May
# if args.center_name: # --center_name
# config.GENBANK_SUBMISSION["center_name"] = args.center_name
if args.use_locustag_as_gene_id:
config.OUTPUT_RESULT["use_locustag_as_gene_id"] = True
if args.show_config:
show_config(config)
exit()
# pp = pprint.PrettyPrinter(indent=1)
# pp.pprint(config.FUNCTIONAL_ANNOTATION)
# print(dir(config))
# set environmental path
set_binaries_path(app_root)
# rotate the chromosome so that dnaA comes first.
if args.fix_origin:
query_genome_fasta = config.GENOME_FASTA
fd, temporary_fasta_file = tempfile.mkstemp()
fix_origin(query_genome_fasta, temporary_fasta_file, offset=args.offset, logger=logger)
logger.info("Created a temporary file for the origin-fixed genome. ({0})".format((temporary_fasta_file )))
config.GENOME_FASTA = temporary_fasta_file
pipeline = Pipeline(config, logger)
pipeline.execute()
# cleanup unwanted files
if not config.DEBUG:
pipeline.cleanup()
if args.fix_origin:
os.remove(temporary_fasta_file)
if __name__ == '__main__':
pass