-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSECNVs.py
349 lines (311 loc) · 15.2 KB
/
SECNVs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
#!/usr/bin/python
import argparse
import random
import os
import subprocess
import math
import sys
import time
import copy
from numpy.random import choice as choices
from WES_simulator import *
from snp_rate import *
def main():
parser = argparse.ArgumentParser(description='Simulator for WES or WGS data', \
formatter_class=argparse.RawTextHelpFormatter)
group1 = parser.add_argument_group('Mandatory inputs')
group1.add_argument('-G', type=str, dest='genome_file', required=True, \
help='Reference genome FASTA file')
group1.add_argument('-T', type=str, dest='target_region_file', required=True, \
help='Target region file')
group2 = parser.add_argument_group('Arguments for simulating rearranged genomes')
group2.add_argument('-rN', dest='replaceNs', choices=['none','gap','all'], default="none", \
help='Replace sequences of "N"s by ATGC randomly? [none]')
group2.add_argument('-eN', dest='exclude_Ns', choices=['none','gap','all'], default="none", \
help='Exclude sequences of "N"s for CNV simulation? [none]')
group2.add_argument('-n_gap', dest='num_N_for_gaps', type=int, default = 50, \
help='Number of consecutive "N"s to be considered a gap region [50]')
group2.add_argument('-e_cnv', dest='target_cnv_list', type=str, default=None, \
help='A user-defined list of CNVs overlapping with target regions')
group2.add_argument('-e_chr', dest='target_cnv_chr', type=int, default = None, \
help='Number of CNVs overlapping with target regions to be generated on each chromosome')
group2.add_argument('-e_tol', dest='target_cnv_tol', type=int, default = None, \
help='Total number of CNVs overlapping with target regions to be generated across the genome (estimate)')
group2.add_argument('-e_cl', dest='target_cnv_len_file', type=str, default=None, \
help='User supplied file of CNV length for CNVs overlapping with target regions')
group2.add_argument('-o_cnv', dest='out_cnv_list', type=str, default=None, \
help='A user-defined list of CNVs outside of target regions')
group2.add_argument('-o_chr', dest='out_cnv_chr', type=int, default = None, \
help='Number of CNVs outside of target regions to be generated on each chromosome')
group2.add_argument('-o_tol', dest='out_cnv_tol', type=int, default = None, \
help='Total number of CNVs outside of target regions to be generated across the genome (estimate)')
group2.add_argument('-o_cl', dest='out_cnv_len_file', type=str, default=None, \
help='User supplied file of CNV length for CNVs outside of target regions')
group2.add_argument('-ol', dest='overlap_bps', type=int, default = 100, \
help='For each CNV overlapping with target regions, number of minimum overlapping bps [100]')
group2.add_argument('-min_len', dest='cnv_min_length', type=int, default=1000, \
help='Minimum CNV length [1000]')
group2.add_argument('-max_len', dest='cnv_max_length', type=int, default=100000, \
help='Maximum CNV length [100000]')
group2.add_argument('-min_cn', dest='min_copy_number', type=int, default=2, \
help='Minimum copy number for insertions [2]')
group2.add_argument('-max_cn', dest='max_copy_number', type=int, default=10, \
help='Maximum copy number for insertions [10]')
group2.add_argument('-p', dest='proportion_ins', type=float, default=0.5, \
help='Proportion of insertions [0.5]')
group2.add_argument('-f', dest='min_flanking_len', type=int, default=50, \
help='Minimum length between each CNV [50]')
group2.add_argument('-ms', dest='method_s', choices=['random','uniform','gauss'], default="random", \
help='Distribution of CNVs [random]')
group2.add_argument('-ml', dest='method_l', choices=['random','uniform','gauss','beta','user'], default="random", \
help='Distribution of CNV length [random]')
group2.add_argument('-as', dest='as1', type=float, default=None, \
help='Mu for Gaussian CNV distribution [0]')
group2.add_argument('-bs', dest='bs', type=float, default=None, \
help='Sigma for Gaussian CNV distribution [1]')
group2.add_argument('-al', dest='al', type=float, default=None, \
help='Mu (Gaussian distribution) or alpha (Beta distribution) for CNV length distribution [0 for Gaussian distribution, and 0.5 for Beta distribution]')
group2.add_argument('-bl', dest='bl', type=float, default=None, \
help='Sigma (Gaussian distribution) or beta (Beta distribution) for CNV length distribution [1 for Gaussian distribution, and 2.3 for Beta distribution]')
group2.add_argument('-s_r', dest='s_rate', type=float, default=0, \
help='Rate of SNPs in target regions [0]')
group2.add_argument('-s_s', dest='s_slack', type=int, default=0, \
help='Slack region up and down stream of target regions to simulate SNPs [0]')
group2.add_argument('-i_r', dest='i_rate', type=float, default=0, \
help='Rate of indels in target regions [0]')
group2.add_argument('-i_mlen', dest='i_max_len', type=int, default=50, \
help='The Maximum length of indels in target regions [50]. (If a deletion is equal or larger than the length of the target region it is in, the length of the deletion will be changed to (length of the target region it is in) - 1.)')
group3 = parser.add_argument_group('Arguments for simulating short reads (fastq)')
group3.add_argument('-nr', dest='nreads', type=int, default=10000, \
help='Number of reads / read pairs on target regions to be generated for each genome [10000]')
group3.add_argument('-fs', dest='frag_size', type=int, default=200, \
help='Mean fragment size to be generated [200]')
group3.add_argument('-s', dest='stdev', type=int, default=20, \
help='Standard deviation of fragment sizes [20]')
group3.add_argument('-l', dest='read_length', type=int, default=100, \
help='Read length of each short read [100]')
group3.add_argument('-tf', dest='target_region_flank', type=int, default=0, \
help='Length of flanking region up and down stream of target regions to be sequenced (this step take place after -clr). [0]')
group3.add_argument('-pr', dest='paired_end', action='store_true', \
help='Select if paired-end sequencing')
group3.add_argument('-q', dest='quality_score_offset', type=int, default=33, \
help='Quality score offset for short reads simulation [33]')
group3.add_argument('-clr', dest='connect_len_between_regions', type=int, default=None, \
help='Maximum length bwtween target regions to connect the target regions.')
group3.add_argument('-m', dest='model', type=str, \
default=os.path.join(os.path.dirname(os.path.realpath(__file__)) + "/Illumina_HiSeq_2500_p.gzip"), \
help='GemSIM error model file (.gzip, need absolute path) [Illumina_HiSeq_2500_p]')
group4 = parser.add_argument_group('Arguments for general settings')
group4.add_argument('-o', dest='output_dir', type=str, default="simulation_output", \
help='Output directory [simulator_output]')
group4.add_argument('-rn', dest='rearranged_output_name', type=str, default="test", \
help='Prefix of the rearranged outputs (do not include directory name) [test]')
group4.add_argument('-n', dest='num_samples', type=int, default=1, \
help='Number of test samples to be generated [1]')
group4.add_argument('-sc', dest='sim_control', action='store_true', \
help='Simulation for control genome')
group4.add_argument('-ssr', dest='sim_short_reads', action='store_true', \
help='Simulate short reads (fastq) files')
group4.add_argument('-sb', dest='sim_bam', action='store_true', \
help='Simulate BAM files')
group4.add_argument('-picard', dest='path_to_picard', type=str, default=None, \
help='Absolute path to picard')
group4.add_argument('-GATK', dest='path_to_GATK', type=str, default=None, \
help='Absolute path to GATK')
args = parser.parse_args()
if not os.path.exists(args.genome_file):
log_print('Error: The reference genome file does not exist!')
exit(1)
if not os.path.exists(args.target_region_file):
log_print('Error: The target region file does not exist!')
exit(1)
param = {}
param['type'] = 'e'
param['genome_file'] = os.path.join(os.getcwd(), args.genome_file)
if args.target_region_file:
param['target_region_file'] = os.path.join(os.getcwd(), args.target_region_file)
param['replaceN'] = args.replaceNs
param['cnv_min_len'] = args.cnv_min_length
param['cnv_max_len'] = args.cnv_max_length
param['min_cn'] = args.min_copy_number
param['max_cn'] = args.max_copy_number
param['p_ins'] = args.proportion_ins
param['e_cnv_list'] = args.target_cnv_list
param['o_cnv_list'] = args.out_cnv_list
param['out_dir'] = os.path.join(os.getcwd(), args.output_dir)
param['e_cnv_chr'] = args.target_cnv_chr
param['e_cnv_tol'] = args.target_cnv_tol
param['o_cnv_chr'] = args.out_cnv_chr
param['o_cnv_tol'] = args.out_cnv_tol
param['overlap_bp'] = args.overlap_bps
param['tmp_dir'] = os.path.join(param['out_dir'], "tmp")
#param['rearranged_out'] = args.rearranged_output_name
param['nreads'] = args.nreads
param['frag_size'] = args.frag_size
param['stdev'] = args.stdev
param['read_length'] = args.read_length
param['paired_end'] = args.paired_end
param['qual'] = args.quality_score_offset
param['model'] = args.model
#param['sim_control'] = args.sim_control
param['sim_short_reads'] = args.sim_short_reads
param['sim_bam'] = args.sim_bam
param['path_to_picard'] = args.path_to_picard
param['path_to_GATK'] = args.path_to_GATK
param['method_s'] = args.method_s
param['method_l'] = args.method_l
param['e_cnv_len_file'] = args.target_cnv_len_file
param['o_cnv_len_file'] = args.out_cnv_len_file
param['opt'] = args.exclude_Ns
param['gapn'] = args.num_N_for_gaps
param['flank'] = args.min_flanking_len
param['fl'] = args.target_region_flank
param['inter'] = args.connect_len_between_regions
param['s_rate'] = args.s_rate
param['i_rate'] = args.i_rate
param['i_mlen'] = args.i_max_len
param['as'] = args.as1
param['bs'] = args.bs
param['al'] = args.al
param['bl'] = args.bl
param['snp_slack'] = args.s_slack
t = args.num_samples
if t < 1:
log_print("Error: The number of test samples (-n) must be at least 1!")
exit(1)
if (param['p_ins'] < 0) or (param['p_ins'] > 1):
log_print("Error: Insertion rate must be between 0 and 1.")
exit(1)
if (param['s_rate'] < 0) or (param['s_rate'] > 1):
log_print("Error: SNP rate must be between 0 and 1.")
exit(1)
if (param['i_rate'] < 0) or (param['i_rate'] > 1):
log_print("Error: indel rate must be between 0 and 1.")
exit(1)
if (param['i_mlen'] < 0):
log_print("Error: the maximium length of indels must > 0.")
exit(1)
if (param['snp_slack'] < 0):
log_print("Error: the slack region for making SNPS must > 0.")
exit(1)
if param['method_s'] == 'gauss':
if not param['as']:
param['as'] = 0
if not param['bs']:
param['bs'] = 1
if param['method_l'] == 'gauss':
if not param['al']:
param['al'] = 0
if not param['bl']:
param['bl'] = 1
if (param['method_l'] == 'beta'):
if not param['al']:
param['al'] = 2
if not param['bl']:
param['bl'] = 2
if (param['al']) and (param['al'] <= 0):
log_print("Error: alpha must > 0 for beta distribution.")
exit(1)
if (param['bl']) and (param['bl'] <= 0):
log_print("Error: beta must > 0 for beta distribution.")
exit(1)
if (param['method_s'] != 'gauss'):
if param['as'] or param['bs']:
log_print("Warning: parameters mu and sigma for CNV distribution are not used! (Only used in gauss distribution.)")
if (param['method_l'] != 'gauss') and (param['method_l'] != 'beta'):
if param['al'] or param['bl']:
log_print("Warning: parameters mu and sigma / alpha and beta for CNV length distribution are not used! (Only used in gauss and beta distribution.)")
if param['sim_bam']:
if (not param['path_to_picard']) or (not param['path_to_GATK']):
log_print('Error: Must provide path to picard (-picard) and path to GATK (-GATK)!')
exit(1)
if param['sim_short_reads'] and not param['paired_end']:
log_print("Warning: Chose single-end sequencing. Mean fragment size (-fs) and standard deviation of fragment size (-s) will be ignored.")
if param['type'] == 'e':
e_ct = 0
if param['e_cnv_list']:
e_ct += 1
if param['e_cnv_chr']:
e_ct += 1
if param['e_cnv_tol']:
e_ct += 1
if param['e_cnv_len_file']:
e_ct += 1
if e_ct != 1:
log_print('Error: One and only one of -e_cnv, -e_chr, -e_tol and -e_cl must be present!')
exit(1)
o_ct = 0
if param['o_cnv_list']:
o_ct += 1
if param['o_cnv_chr']:
o_ct += 1
if param['o_cnv_tol']:
o_ct += 1
if param['o_cnv_len_file']:
o_ct += 1
if not (o_ct == 0 or o_ct ==1):
log_print('Error: Only one of -o_cnv, -o_chr, -o_tol and -o_cl can be present!')
exit(1)
if param['e_cnv_list']:
log_print('Warning: A list of CNVs overlapping with target regions are provided. -em, -f, -ms, -ml, -ol, -min_cn, -max_cn, -min_len and -max_len will be ignored for CNVs on this list!')
if param['o_cnv_list']:
log_print('Warning: A list of CNVs outside of target regions are provided. -em, -f, -ms, -ml, -ol, -min_cn, -max_cn, -min_len and -max_len will be ignored for CNVs on this list!')
if param['method_l'] == 'user':
log_print('Warning: -min_len and -max_len will be ignored since "-ml user" is chosen!')
if not param['e_cnv_len_file']:
log_print('Error: "-ml user" must be used with -e_cl!')
exit(1)
if o_ct == 1 and not param['o_cnv_len_file']:
log_print('Error: If CNVs outside of target regions are to be generated, "-ml user" must be used with -o_cl!')
exit(1)
else:
if param['e_cnv_len_file']:
log_print('Error: Only "-ml user" could be used with -e_cl!')
exit(1)
if o_ct == 1 and param['o_cnv_len_file']:
log_print('Error: Only "-ml user" could be used with -o_cl!')
exit(1)
if param['sim_bam']:
if not param['sim_short_reads']:
log_print('Error: Must simulate short reads (-ssr) to simulate bam files!')
exit(1)
if os.path.exists(param['tmp_dir']):
subprocess.call(['rm', '-rf', param['tmp_dir']], stderr=None)
#shutil.rmtree(param['tmp_dir'])
os.makedirs(param['tmp_dir'])
else:
os.makedirs(param['tmp_dir'])
print ' ==================== SECNVs ==================== '
sys.stdout.flush()
print ' SECNVs (2019) '
sys.stdout.flush()
print ' Version 2.7.1 (Oct 2019) '
sys.stdout.flush()
print ' Bug report: Yue Xing <[email protected]> '
sys.stdout.flush()
print ' ------------------------------------------------------ '
sys.stdout.flush()
log_print('Reading genome file...')
iin_seqs, iin_chrs, iin_ran_m = read_fasta(param['genome_file'], \
param['replaceN'], param['gapn'], param['out_dir'], \
param['opt'])
if param['type'] == 'e':
log_print('Reading target region file...')
iin_st, iin_ed = read_target(param['target_region_file'], iin_chrs)
if t == 1:
param['rearranged_out'] = args.rearranged_output_name
else:
log_print('Processing the 1st sample and control (if required)...')
param['rearranged_out'] = args.rearranged_output_name + "1"
simulate_WES(param, iin_seqs, iin_chrs, iin_st, iin_ed, args.sim_control, 0, iin_ran_m)
if t > 1:
for i in range(1,t):
mess = 'Processing the ' + str(i+1) + 'th sample...'
log_print(mess)
param['rearranged_out'] = args.rearranged_output_name + str(i+1)
simulate_WES(param, iin_seqs, iin_chrs, iin_st, iin_ed, None, 1, iin_ran_m)
#shutil.rmtree(param['tmp_dir'])
subprocess.call(['rm', '-rf', param['tmp_dir']], stderr=None)
log_print('Done')
if __name__ == '__main__':
main()