#!/usr/bin/python3
# This script is run with a file that should fit entirely in memory, and which
# should not break across a SNP boundary. We don't need to know more than that.
# It takes filtered individual kmers and outputs SNPs.
#
# Performance note: we use about 350 bytes per kmer discovered. In one dataset
# with a median of 3.9 million kmers across five genomes, there were 8.7 million
# total kmers found. Assume the overlap can be worse than that (however, the
# worst case, completely divergent genomes, doesn't give us any data, so how
# important is it to plan for that case?)
#
# A revised estimate is closer to 115 bytes, or even 33 bytes, per kmer. Keep
# testing. It may be worth trying to crunch memory usage; that should be doable.
# Import standard Python libraries for argument parsing and logging.
import argparse
# Permit use of stdout and stderr.
import sys
# Help with debug data:
import logging
# Import tools for parsing kSNP configuration data.
# TODO - JN
# import ksnpConfig
# The override option to this function can be used to parse a provided list
# instead of the program's command-line options, for testing.
def parseCommandline(override=None):
    description = '''Identify SNPs from filtered kmers, and prepare files for input into mummer to
determine SNP location. Will work on partitioned data as long as the partition
kmer is the same and isn't on a locus boundary.'''
    example = '''The term "partitioned" refers to creating files that have only a subset of the
kmers (e.g. those beginning with AA). All files should contain the same subset
for proper operation.
To output the SNPs for the files fsplit0.kmer.part0, fsplit1.kmer.part0, and
fsplit2.kmer.part0:
  %(prog)s SNPs.part0 fsplit0.kmer.part0 fsplit1.kmer.part0 fsplit2.kmer.part0
The outputs will be:
  fsplit0.kmer.part0.SNPs.fasta
  fsplit1.kmer.part0.SNPs.fasta
  fsplit2.kmer.part0.SNPs.fasta
and also (with the overall list of SNPs):
  SNPs.part0
'''
    parser = argparse.ArgumentParser(description=description, epilog=example,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # parser.add_argument('SNPsFile', metavar='SNPsFile', nargs=1,
    #                     help='The filename to use for SNPs discovered in this input set')
    parser.add_argument('filteredGenomes', metavar='filteredGenome', nargs='+',
                        help='Filtered genomes (possibly partitioned) from which to extract SNPs.')
    # Note: the default suffixes include the leading dot so that output names
    # match the examples above (genome name + suffix).
    parser.add_argument('--fasta-suffix', action='store', default='.SNPs.fasta',
                        help='The suffix to use for fasta files to output to mummer for position data')
    parser.add_argument('--snp-suffix', action='store', default='.SNPs',
                        help='The suffix to use for SNP files')
    parser.add_argument('--snps-all', action='store',
                        help='A set of SNPs output by a previous run, for comparing to this run.')
    parser.add_argument('--debug', action='store_true', help='Output diagnostic data to the screen using STDERR')
    parser.add_argument('--info', action='store_true', default=True,
                        help='Output status information on STDERR')
    parser.add_argument('--quiet', action='store_true', help='Silence debug and info messages (warnings only)')
    if override is None:
        return parser.parse_args()
    else:
        return parser.parse_args(override)
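# A minimal usage sketch for the override hook above (hypothetical filenames):
#   opts = parseCommandline(['--debug', 'fsplit0.kmer.part0', 'fsplit1.kmer.part0'])
#   assert opts.filteredGenomes == ['fsplit0.kmer.part0', 'fsplit1.kmer.part0']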
def loadSNPsFromSNPsAll(bucket, snpsAllFile, startGenomeBit=1):
    '''
    This function loads a set of loci from an existing SNPs_all file. The input
    file represents the sum total of the loci that can involve our previous data.
    If we only have one new genome, this will be all of the loci ever. Otherwise,
    this will ensure that all of these loci are represented, including where they
    appear in the new genomes, in addition to all SNPs between new genomes.
    '''
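    # Based on the column indices used below, each tab-separated row is assumed
    # to look roughly like (hypothetical example; only columns 1 (locus),
    # 2 (center), and 4 (genome) are read here):
    #   <count-or-id> \t <locus, center as '.'> \t <center base> \t <position> \t <genome>
    #   e.g. "42\tACGT.ACGT\tC\t10317\tgenomeA"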
    # read buffer (trying this out)
    bufferSize = 32 * 1024
    tab = '\t'
    # Column identifiers
    center = 2
    loci = 1
    genome = 4
    addedSNPsCount = 0
    lociAddedCount = 0
    lastSnpLocus = ''
    maxGenomeBit = startGenomeBit
    # Mapping of genomes to the associated bit in the bit field.
    genomeBits = {}
    # Default / initial value for a given locus.
    noSNPsNoGenomes = {
        'A': 0,
        'C': 0,
        'G': 0,
        'T': 0,
    }
    snps = open(snpsAllFile, 'r')
    nextLines = snps.readlines(bufferSize)
    while nextLines != []:
        for kmer in nextLines:
            # Trim the trailing newline so the last column parses cleanly.
            snpData = kmer.rstrip('\n').split(tab)
            snpCenter = snpData[center]
            snpLocus = snpData[loci]
            snpGenome = snpData[genome]
            thisBit = genomeBits.setdefault(snpGenome, maxGenomeBit)
            if thisBit == maxGenomeBit:
                maxGenomeBit = maxGenomeBit * 2
            # Set the bit corresponding to this central nucleotide for this genome
            bucket[snpLocus][snpCenter] = bucket.setdefault(snpLocus, noSNPsNoGenomes.copy())[snpCenter] | thisBit
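            # (setdefault both fetches the existing per-locus dict and, on
            # first sight of this locus, installs a fresh copy of the
            # all-zeroes default before the genome bit is OR-ed in.)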
            # Overhead:
            if snpLocus != lastSnpLocus:
                lociAddedCount = lociAddedCount + 1
                # logging.debug("Added SNP: %s: %s", lastSnpLocus, bucket[lastSnpLocus])
                lastSnpLocus = snpLocus
            addedSNPsCount = addedSNPsCount + 1
        nextLines = snps.readlines(bufferSize)
    # logging.debug("Added SNP: %s: %s", lastSnpLocus, bucket[lastSnpLocus])
    snps.close()
    logging.info("Added %s SNP kmers to bucket (%s loci), new bucket size: %s loci", addedSNPsCount, lociAddedCount, len(bucket))
    return (bucket, genomeBits)
def fillBucket(genomeList, k):
    # read buffer (trying this out)
    bufferSize = 32 * 1024
    # To go faster, might need to switch to asynchronous file I/O.
    centerpos = int((k-1)/2)
    maxGenomeBit = 1
    # Mapping of genomes to the associated bit in the bit field.
    genomeBits = {}
    thisBucket = {}
    # Default / initial value for a given locus.
    noSNPsNoGenomes = {
        'A': 0,
        'C': 0,
        'G': 0,
        'T': 0,
    }
    for genome in genomeList:
        logging.info("Loading genome: %s", genome)
        genomeBits[genome] = maxGenomeBit
        thisGenome = open(genome, 'r')
        nextLines = thisGenome.readlines(bufferSize)
        while nextLines != []:
            for kmer in nextLines:
                center = kmer[centerpos]
                locus = kmer[:centerpos] + '.' + kmer[centerpos+1:k]
                # Set the bit corresponding to this central nucleotide for this genome
                thisBucket[locus][center] = thisBucket.setdefault(locus, noSNPsNoGenomes.copy())[center] | maxGenomeBit
                # logging.debug("%s: %s", locus, thisBucket[locus])
            nextLines = thisGenome.readlines(bufferSize)
        maxGenomeBit = maxGenomeBit * 2  # Move to a more significant bit.
        thisGenome.close()
    logging.debug("bucket size: %s loci", len(thisBucket))
    return (thisBucket, genomeBits)
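# To illustrate the bucket layout (hypothetical 5-mers, so k=5, centerpos=2):
# after loading genome1 "ACGTT", genome2 "ACGTT", and genome3 "ACCTT", the
# bucket holds one locus, keyed by the kmer with its center masked out:
#   thisBucket == {'AC.TT': {'A': 0, 'C': 0b100, 'G': 0b011, 'T': 0}}
#   genomeBits == {'genome1': 1, 'genome2': 2, 'genome3': 4}
# i.e. genomes 1 and 2 carry G at the center, genome 3 carries C, so this
# locus is a candidate SNP.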
def findConflicts(genomeSets):
    conflictingGenomes = 0
    remainingGenomeSets = genomeSets.copy()
    while remainingGenomeSets:
        (nucleotideA, setA) = remainingGenomeSets.pop()
        for (nucleotideB, setB) in remainingGenomeSets:
            # This should run for every pair of nucleotide integers.
            # It will set any bits in conflictingGenomes that correspond to a genome
            # that has more than one nucleotide associated with this locus.
            conflictingGenomes = conflictingGenomes | (setA & setB)
    return conflictingGenomes
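# Worked example (hypothetical bit fields): with
#   genomeSets = [('A', 0b011), ('C', 0b110)]
# the genome holding bit 0b010 appears under both A and C, so
#   findConflicts(genomeSets) == 0b010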
def deconflict(genomeSets):
    # Figure out which genomes have conflicts in this locus.
    conflicts = findConflicts(genomeSets)
    # Remove all references to conflicted genomes from the genome sets, trim
    # unused center nucleotides.
    return [(nucleotide, genomeSet & ~conflicts) for (nucleotide, genomeSet) in genomeSets if genomeSet & ~conflicts != 0]
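# Continuing the example above: deconflict([('A', 0b011), ('C', 0b110)])
# masks out the conflicted genome bit 0b010, leaving
#   [('A', 0b001), ('C', 0b100)]
# Two center nucleotides still remain, so this locus is still a SNP.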
# def writeSNP(locus, nucleotide, genome, SNPsFile, k):
def writeSNP(locus, nucleotide, genome, centerPos):
    newline = '\n'
    tab = '\t'
    genome['fasta'].write('>' + locus + '_' + nucleotide + newline
                          + locus[:centerPos] + nucleotide + locus[centerPos+1:] + newline)
    genome['snp'].write(tab.join([locus, nucleotide,
                                  locus[:centerPos] + nucleotide + locus[centerPos+1:],
                                  'FILENAME']) + newline)
    # If we also want to create a central list of SNPs now, we can do it here.
    # SNPsFile.write(locus[:centerpos] + nucleotide + locus[centerpos+1:] + newline)
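# For the running example locus 'AC.TT' with nucleotide 'G' (centerPos=2),
# the fasta file gains:
#   >AC.TT_G
#   ACGTT
# and the SNP file gains one tab-separated line:
#   AC.TT   G   ACGTT   FILENAME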
# def dumpBucket(bucket, genomeBits, genomeList, SNPsFile, k):
def dumpBucket(bucket, genomeBits, genomeList, k):
    # We then browse through the values looking for:
    #  - only one center nucleotide at a given locus (not a SNP), and
    #  - the same genome appearing with different center nucleotides at one
    #    locus (a conflict); whatever survives deconfliction with two or more
    #    center nucleotides is a SNP.
    # Identify the index of the center nucleotide (the polymorphic one)
    centerPos = int((k-1)/2)
    # Collect statistics for debugging
    totalSNPs = 0
    writtenKmers = 0
    # create reverse map from genomeBit to genome:
    bitGenomes = [(bitValue, genome) for (genome, bitValue) in genomeBits.items()]
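    # e.g. genomeBits {'genome1': 1, 'genome2': 2} becomes
    #      bitGenomes [(1, 'genome1'), (2, 'genome2')]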
    # Now the thisBucket hash table is fully populated...
    for (locus, nucleotideGenomes) in bucket.items():
        # First, deconflict; remove from consideration genomes that show up for more than
        # one nucleotide for this locus:
        kmerGenomeMap = list(nucleotideGenomes.items())
        # Remove all references to conflicted genomes from the genome sets, trim
        # unused center nucleotides.
        deconflicted = deconflict(kmerGenomeMap)
        if len(deconflicted) > 1:
            # logging.debug("deconflicted left something to play with: %s", deconflicted)
            # Deconfliction removes any nucleotides with no corresponding genomes after
            # the deconfliction is done. The possible outcomes are:
            #   0 nucleotides: all of the hits were from genomes with conflicts.
            #   1 nucleotide: after all conflicts are removed there is no polymorphism
            #     (only one nucleotide represented means no mutation).
            #   2-4 nucleotides: there is a SNP here.
            totalSNPs = totalSNPs + 1  # Count the number of SNPs found.
            # logging.debug("Found SNP: %s", deconflicted)
            # These two loops should be very fast; small lists, bitwise operations, at most
            # 4*len(genomeList) tests and len(genomeList)*2 writes.
            for (nucleotide, genomeSet) in deconflicted:
                for (bitValue, genome) in bitGenomes:
                    if bitValue & genomeSet != 0:
                        # This should be called at most len(genomeList) times.
                        # writeSNP(locus, nucleotide, genome, SNPsFile)
                        writtenKmers = writtenKmers + 1
                        writeSNP(locus, nucleotide, genomeList[genome], centerPos)
    return (totalSNPs, writtenKmers)
def getK(genomeList):
    # Don't do this a _lot_, but it is low cost to do once in a while.
    sampleFile = open(genomeList[0], 'r')
    # Grab the first space delimited "word", which should be the kmer
    k = len(sampleFile.readline().strip().split(" ")[0])
    sampleFile.close()
    return k
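# For example, a filtered-kmer line such as "ACGTACGTA 12" (assuming a
# "kmer count" line format) yields k == 9.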
def findSNPs(genomeList, fastaSuffix, snpSuffix, snpsAll=None):
    k = getK(genomeList)
    logging.debug('Looking for SNPs in %s genomes', len(genomeList))
    (thisBucket, genomeBits) = fillBucket(genomeList, k)
    logging.debug('Found %s loci in the %s genomes', len(thisBucket), len(genomeList))
    if snpsAll is not None:
        logging.info("snpsAll: %s -> loading prior SNPs from that file", snpsAll)
        # Starting genome bit for the genomes in the SNPs_all file (the new
        # genomes already occupy the lower bits):
        nextGenomeBit = max(list(genomeBits.values())) * 2
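        # e.g. with three new genomes occupying bits {1, 2, 4}, the genomes
        # from the SNPs_all file are numbered starting from bit 8.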
        (nextBucket, newGenomeBits) = loadSNPsFromSNPsAll(thisBucket, snpsAll, nextGenomeBit)
        # Swap in our new bucket. (may not be necessary if we modified the one we passed in-place?)
        thisBucket = nextBucket
        # Nothing else to do; dumpBucket will only dump the new (non-SNPs_all related) genomes, and
        # those only if they either intersect with the SNPs_all data after deconflicting, or they
        # internally (within new genomes) create new SNPs.
    outputFiles = {}
    for genome in genomeList:
        outputFiles[genome] = {
            'fasta': open(genome + fastaSuffix, 'w'),
            'snp': open(genome + snpSuffix, 'w'),
        }
        # Avoid the cost of flushing at the end of every line.
        logging.debug("FASTA output file line_buffering: %s", outputFiles[genome]['fasta'].line_buffering)
        logging.debug("SNP output file line_buffering: %s", outputFiles[genome]['snp'].line_buffering)
    # dumpBucket(thisBucket, genomeBits, genomeList, SNPsFile, k)
    (totalSNPs, writtenKmers) = dumpBucket(thisBucket, genomeBits, outputFiles, k)
    logging.info('Found %s SNPs in this partition, wrote %s SNP kmers', totalSNPs, writtenKmers)
    # Clean up, flush data to disk.
    for genomeFiles in outputFiles.values():
        for outputFile in genomeFiles.values():
            outputFile.close()
####################################
####################################
###
###
### Main
###
###
####################################
####################################
if __name__ == "__main__":
    options = parseCommandline()
    if options.debug:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    elif options.quiet:
        # --quiet silences debug and info messages; warnings are still shown.
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARN)
    elif options.info:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
    else:
        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARN)
    logging.debug('Commandline options, as parsed: %s', str(options))
    # Primary function of this script:
    #  - Load kmers into memory,
    #  - Traverse kmers, filtering out conflicts,
    genomeFiles = options.filteredGenomes  # Looks like fsplitX.SNPs.partY.mers AND fsplitX.SNPs.partY.mers.fasta (last for mummer)
    # This output will be piped into mummer to get position information to link us with annotation data.
    findSNPs(genomeFiles, options.fasta_suffix, options.snp_suffix, options.snps_all)