forked from maximilianh/crisporWebsite
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrispor.py
executable file
·5359 lines (4580 loc) · 209 KB
/
crispor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python2.7
# the tefor crispr tool
# can be run as a CGI or from the command line
# OOF scores are WRONG for Cpf1! -> where is the cut site?
# python std library
import subprocess, tempfile, optparse, logging, atexit, glob, shutil
import Cookie, time, sys, cgi, re, random, platform, os
import hashlib, base64, string, logging, operator, urllib, sqlite3, time
import traceback, json, pwd, pickle
from datetime import datetime
from collections import defaultdict, namedtuple
from os.path import join, isfile, basename, dirname, isdir, abspath
from StringIO import StringIO
from itertools import product
# try to load external dependencies
# we're going into great lengths to create a readable error message
needModules = set(["tabix", "twobitreader", "pandas", "matplotlib", "scipy"])
try:
import tabix # if not found, install with 'pip install pytabix'
needModules.remove("tabix")
except:
pass
try:
import twobitreader # if not found, install with 'pip install twobitreader'
needModules.remove("twobitreader")
except:
pass
try:
import pandas # required by doench2016 score. install with 'pip install pandas'
needModules.remove("pandas")
import scipy # required by doench2016 score. install with 'pip install pandas'
needModules.remove("scipy")
import matplotlib # required by doench2016 score. install with 'pip install matplotlib'
needModules.remove("matplotlib")
import numpy # required by doench2016 score. install with 'pip install numpy'
needModules.remove("numpy")
except:
pass
if len(needModules)!=0:
print("Content-type: text/html\n")
print("Python interpreter path: %s<p>" % sys.executable)
print("These python modules were not found: %s<p>" % ",".join(needModules))
print("To install all requirements in one line, run: pip install pytabix pandas twobitreader scipy matplotlib numpy<p>")
sys.exit(0)
# our own eff scoring library
import crisporEffScores
# don't report print as an error
# pylint: disable=E1601
# optional module for Excel export as native .xls files
# install with 'apt-get install python-xlwt' or 'pip install xlwt'
xlwtLoaded = True
try:
import xlwt
except:
sys.stderr.write("crispor.py - warning - the python xlwt module is not available\n")
xlwtLoaded = False
# optional module for mysql support
try:
import MySQLdb
mysqldbLoaded = True
except:
mysqldbLoaded = False
# version of crispor
versionStr = "4.0"
# contact email
contactEmail='[email protected]'
# write debug output to stdout
DEBUG = False
#DEBUG = True
# use bowtie for off-target search?
useBowtie = False
# calculate the efficienc scores?
doEffScoring = True
# system-wide temporary directory
TEMPDIR = os.environ.get("TMPDIR", "/tmp")
#TEMPDIR = "/dev/shm/"
# prefix in html statements before the directories "image/", "style/" and "js/"
HTMLPREFIX = ""
# alternative directory on local disk where image/, style/ and js/ are located
HTMLDIR = "/usr/local/apache/htdocs/crispor/"
# directory of crispor.py
baseDir = dirname(__file__)
# filename of this script, usually crispor.py
myName = basename(__file__)
# the segments.bed files use abbreviated genomic region names
segTypeConv = {"ex":"exon", "in":"intron", "ig":"intergenic"}
# directory for processed batches of offtargets ("cache" of bwa results)
batchDir = join(baseDir,"temp")
# use mysql or sqlite for the jobs?
jobQueueUseMysql = False
# the file where the sqlite job queue is stored
JOBQUEUEDB = join("/tmp/crisporJobs.db")
# alternatively: connection info for mysql
jobQueueMysqlConn = {"socket":None, "host":None, "user": None, "password" : None}
# directory for platform-independent scripts (e.g. Heng Li's perl SAM parser)
scriptDir = join(baseDir, "bin")
# directory for helper binaries (e.g. BWA)
binDir = abspath(join(baseDir, "bin", platform.system()))
# directory for genomes
genomesDir = join(baseDir, "genomes")
DEFAULTORG = 'hg19'
DEFAULTSEQ = 'cttcctttgtccccaatctgggcgcgcgccggcgccccctggcggcctaaggactcggcgcgccggaagtggccagggcgggggcgacctcggctcacagcgcgcccggctattctcgcagctcaccatgGATGATGATATCGCCGCGCTCGTCGTCGACAACGGCTCCGGCATGTGCAAGGCCGGCTTCGCGGGCGACGATGCCCCCCGGGCCGTCTTCCCCTCCATCGTGGGGCGCC'
# used if hg19 is not available
ALTORG = 'sacCer3'
ALTSEQ = 'ATTCTACTTTTCAACAATAATACATAAACatattggcttgtggtagCAACACTATCATGGTATCACTAACGTAAAAGTTCCTCAATATTGCAATTTGCTTGAACGGATGCTATTTCAGAATATTTCGTACTTACACAGGCCATACATTAGAATAATATGTCACATCACTGTCGTAACACTCT'
pamDesc = [ ('NGG','20bp-NGG - Cas9 S. Pyogenes'),
('TTTN','TTTN-23bp - Cpf1 F. novicida'),
('NGA','20bp-NGA - Cas9 S. Pyogenes mutant VQR'),
('NGCG','20bp-NGCG - Cas9 S. Pyogenes mutant VRER'),
('NNAGAA','20bp-NNAGAA - Cas9 S. Thermophilus'),
('NGGNG','20bp-NGGNG - Cas9 S. Thermophilus'),
('NNGRRT','21bp-NNG(A/G)(A/G)T - Cas9 S. Aureus'),
('NNNNGMTT','20bp-NNNNG(A/C)TT - Cas9 N. Meningitidis'),
('NNNNACA','20bp-NNNNACA - Cas9 Campylobacter jejuni'),
]
DEFAULTPAM = 'NGG'
# maximum size of an input sequence
MAXSEQLEN = 1000
# maximum input size when specifying "no genome"
MAXSEQLEN_NOGENOME = 25000
# BWA: allow up to X mismatches
maxMMs=4
# maximum number of occurences in the genome to get flagged as repeats.
# This is used in bwa samse, when converting the same file
# and for warnings in the table output.
MAXOCC = 60000
# the BWA queue size is 2M by default. We derive the queue size from MAXOCC
MFAC = 2000000/MAXOCC
# the length of the guide sequence, set by setupPamInfo
GUIDELEN=None
# the name of the currently processed batch, assigned only once
# in readBatchParams and only for json-type batches
batchName = ""
# are we doing a Cpf1 run?
# this variable changes almost all processing and
# has to be set on program start, as soon as we know
# the PAM we're running on
cpf1Mode=None
# Highly-sensitive mode (not for CLI mode):
# MAXOCC is increased in processSubmission() and in the html UI if only one
# guide seq is run
# Also, the number of allowed mismatches is increased to 5 instead of 4
#HIGH_MAXOCC=600000
#HIGH_maxMMs=5
# minimum off-target score of standard off-targets (those that end with NGG)
# This should probably be based on the CFD score these days
# But for now, I'll let the user do the filtering
MINSCORE = 0.0
# minimum off-target score for alternative PAM off-targets
# There is not a lot of data to support this cutoff, but it seems
# reasonable to have at least some cutoff, as otherwise we would show
# NAG and NGA like NGG and the data shows clearly that the alternative
# PAMs are not recognized as well as the main NGG PAM.
# so for now, I just filter out very degenerative ones. the best solution
# would be to have a special penalty on the CFD score, but CFS does not
# support non-NGG PAMs (is this actually true?)
ALTPAMMINSCORE = 1.0
# for some PAMs, we allow other alternative motifs when searching for offtargets
# MIT and eCrisp do that, they use the motif NGG + NAG, we add one more, based on the
# on the guideSeq results in Tsai et al, Nat Biot 2014
# The NGA -> NGG rule was described by Kleinstiver...Young 2015 "Improved Cas9 Specificity..."
# NNGTRRT rule for S. aureus is in the new protocol "SaCas9 User manual"
# ! the length of the alternate PAM has to be the same as the original PAM!
offtargetPams = {
"NGG" : ["NAG","NGA"],
"NGA" : ["NGG"],
"NNGRRT" : ["NNGRRN"]
}
# global flag to indicate if we're run from command line or as a CGI
commandLineMode = False
# names/order of efficiency scores to show in UI
scoreNames = ["fusi", "crisprScan"]
allScoreNames = ["fusi", "chariRank", "ssc", "doench", "wang", "crisprScan", "oof", "housden", "proxGc"]
# how many digits shall we show for each score? default is 0
scoreDigits = {
"ssc" : 1,
}
# List of AddGene plasmids, their long and short names:
addGenePlasmids = [
("43860", ("MLM3636 (Joung lab)", "MLM3636")),
("49330", ("pAc-sgRNA-Cas9 (Liu lab)", "pAcsgRnaCas9")),
("42230", ("pX330-U6-Chimeric_BB-CBh-hSpCas9 (Zhang lab) + derivatives", "pX330")),
("52961", ("lentiCRISPR v2 (Zhang lab)", "lentiCrispr")),
]
addGenePlasmidsAureus = [
("61591", ("pX601-AAV-CMV::NLS-SaCas9-NLS-3xHA-bGHpA;U6::BsaI-sgRNA (Zhang lab)", "pX601")),
("61592", ("pX600-AAV-CMV::NLS-SaCas9-NLS-3xHA-bGHpA (Zhang lab)", "pX600")),
("61593", ("pX602-AAV-TBG::NLS-SaCas9-NLS-HA-OLLAS-bGHpA;U6::BsaI-sgRNA (Zhang lab)", "pX602")),
("65779", ("VVT1 (Joung lab)", "VVT1"))
]
# list of AddGene primer 5' and 3' extensions, one for each AddGene plasmid
# format: prefixFw, prefixRw, restriction enzyme, link to protocol
addGenePlasmidInfo = {
"43860" : ("ACACC", "AAAAC", "BsmBI", "https://www.addgene.org/static/data/plasmids/43/43860/43860-attachment_T35tt6ebKxov.pdf"),
"49330" : ("TTC", "AAC", "Bsp QI", "http://bio.biologists.org/content/3/1/42#sec-9"),
"42230" : ("CACC", "AAAC", "Bbs1", "https://www.addgene.org/static/data/plasmids/52/52961/52961-attachment_B3xTwla0bkYD.pdf"),
"52961" : ("CACC", "AAAC", "Bbs1", "https://www.addgene.org/static/data/plasmids/52/52961/52961-attachment_B3xTwla0bkYD.pdf"),
"61591" : ("CACC", "AAAC", "BsaI", "https://www.addgene.org/static/data/plasmids/61/61591/61591-attachment_it03kn5x5O6E.pdf"),
"61592" : ("CACC", "AAAC", "BsaI", "https://www.addgene.org/static/data/plasmids/61/61592/61592-attachment_iAbvIKnbqNRO.pdf"),
"61593" : ("CACC", "AAAC", "BsaI", "https://www.addgene.org/static/data/plasmids/61/61592/61592-attachment_iAbvIKnbqNRO.pdf"),
"65779": ("CACC", "AAAC", "BsmBI", "https://www.addgene.org/static/data/plasmids/65/65779/65779-attachment_G8oNyvV6pA78.pdf")
}
# Restriction enzyme supplier codes
rebaseSuppliers = {
"B":"Life Technologies",
"C":"Minotech",
"E":"Agilent",
"I":"SibEnzyme",
"J":"Nippon Gene",
"K":"Takara",
"M":"Roche",
"N":"NEB",
"O":"Toyobo",
"Q":"Molecular Biology Resources",
"R":"Promega",
"S":"Sigma",
"V":"Vivantis",
"X":"EURx",
"Y":"SinaClon BioScience"
}
# labels and descriptions of eff. scores
scoreDescs = {
"doench" : ("Doench '14", "Range: 0-100. Linear regression model trained on 880 guides transfected into human MOLM13/NB4/TF1 cells (three genes) and mouse cells (six genes). Delivery: lentivirus. The Fusi score can be considered an updated version this score, as their training data overlaps a lot. See <a target='_blank' href='http://www.nature.com/nbt/journal/v32/n12/full/nbt.3026.html'>Doench et al.</a>"),
"ssc" : ("Xu", "Range ~ -2 - +2. Aka 'SSC score'. Linear regression model trained on data from >1000 genes in human KBM7/HL60 cells (Wang et al) and mouse (Koike-Yusa et al.). Delivery: lentivirus. Ranges mostly -2 to +2. See <a target='_blank' href='http://genome.cshlp.org/content/early/2015/06/10/gr.191452.115'>Xu et al.</a>"),
"crisprScan" : ["Moreno-Mateos", "Also called 'CrisprScan'. Range: mostly 0-100. Linear regression model, trained on data from 1000 guides on >100 genes, from zebrafish 1-cell stage embryos injected with mRNA. See <a target=_blank href='http://www.nature.com/nmeth/journal/v12/n10/full/nmeth.3543.html'>Moreno-Mateos et al.</a>. Recommended for guides transcribed <i>in-vitro</i> (T7 promoter). Click to sort by this score."],
"wang" : ("Wang", "Range: 0-100. SVM model trained on human cell culture data on guides from >1000 genes. The Xu score can be considered an updated version of this score, as the training data overlaps a lot. Delivery: lentivirus. See <a target='_blank' href='http://http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3972032/'>Wang et al.</a>"),
"chariRank" : ("Chari", "Range: 0-100. Support Vector Machine, converted to rank-percent, trained on data from 1235 guides targeting sequences that were also transfected with a lentivirus into human 293T cells. See <a target='_blank' href='http://www.nature.com/nmeth/journal/v12/n9/abs/nmeth.3473.html'>Chari et al.</a>"),
"fusi" : ("Doench '16", "Previously called the 'Fusi' score. Range: 0-100. Boosted Regression Tree model, trained on data produced by Doench et al (881 guides, MOLM13/NB4/TF1 cells + unpublished additional data). Delivery: lentivirus. See <a target='_blank' href='http://biorxiv.org/content/early/2015/06/26/021568'>Fusi et al. 2015</a> and <a target='_blank' href='http://www.nature.com/nbt/journal/v34/n2/full/nbt.3437.html'>Doench et al. 2016</a>. Recommended for guides expressed in cells (U6 promoter). Click to sort the table by this score."),
"housden" : ("Housden", "Range: ~ 1-10. Weight matrix model trained on data from Drosophila mRNA injections. See <a target='_blank' href='http://stke.sciencemag.org/content/8/393/rs9.long'>Housden et al.</a>"),
"oof" : ("Out-of-Frame", "Range: 0-100. Predicts the percentage of clones that will carry out-of-frame deletions, based on the micro-homology in the sequence flanking the target site. See <a target='_blank' href='http://www.nature.com/nmeth/journal/v11/n7/full/nmeth.3015.html'>Bae et al.</a>. Click the score to show the most likely deletions for this guide.")
}
# the headers for the guide and offtarget output files
guideHeaders = ["guideId", "guideSeq", "mitSpecScore", "cfdSpecScore", "offtargetCount", "guideGenomeMatchGeneLocus"]
offtargetHeaders = ["guideId", "guideSeq", "offtargetSeq", "mismatchPos", "mismatchCount", "mitOfftargetScore", "cfdOfftargetScore", "chrom", "start", "end", "strand", "locusDesc"]
# a file crispor.conf in the directory of the script allows to override any global variable
myDir = dirname(__file__)
confPath =join(myDir, "crispor.conf")
if isfile(confPath):
exec(open(confPath))
cgiParams = None
# ====== END GLOBALS ============
def setupPamInfo(pam):
" modify a few globals based on the current pam "
global GUIDELEN
global cpf1Mode
global addGenePlasmids
if pam=="TTTN":
GUIDELEN = 23
cpf1Mode = True
elif pam=="NNGRRT":
addGenePlasmids = addGenePlasmidsAureus
GUIDELEN = 21
cpf1Mode = False
else:
GUIDELEN = 20
cpf1Mode = False
# ==== CLASSES =====
class JobQueue:
"""
simple job queue, using a db table as a backend
jobs have different types and status. status can be updated while they run
job running times are kept and old job info is kept in a separate table
>>> q = JobQueue()
>>> q.clearJobs()
>>> q.waitCount()
0
>>> q.addJob("search", "abc123", "myParams")
True
only one job per jobId
>>> q.addJob("search", "abc123", "myParams")
False
>>> q.waitCount()
1
>>> q.getStatus("abc123")
u'Waiting'
>>> q.startStep("abc123", "bwa", "Alignment with BWA")
>>> q.getStatus("abc123")
u'Alignment with BWA'
>>> jobType, jobId, paramStr = q.popJob()
>>> q.waitCount()
0
>>> q.jobDone("abc123")
>>> q.waitCount()
0
can't pop from an empty queue
#>>> q.popJob()
#(None, None, None)
#>>> os.system("rm /tmp/tempCrisporTest.db")
#0
"""
_queueDef = (
'CREATE TABLE IF NOT EXISTS %s '
'('
' jobType text,' # either "index" or "search"
' jobId text %s,' # unique identifier
' paramStr text,' # parameters for jobs, like db, options, etc.
' isRunning int DEFAULT 0,' # indicates steps have started, done jobs are moved to doneJobs table
' stepName text,' # currently step, internal step name for timings
' stepLabel text,' # current step, human-readable status of job, for UI
' lastUpdate float,' # time of last update
' stepTimes text,' # comma-sep list of whole msecs, one per step
' startTime text' # date+time when job was put into queue
')')
def __init__(self):
" no inheritance needed here "
if jobQueueUseMysql:
self.openMysql()
else:
self.openSqlite(JOBQUEUEDB)
def openSqlite(self, dbName):
self.dbName = dbName
self.conn = sqlite3.Connection(dbName)
try:
self.conn.execute(self._queueDef % ("queue", "PRIMARY KEY"))
except sqlite3.OperationalError:
errAbort("cannot open the file %s" % JOBQUEUEDB)
self.conn.execute(self._queueDef % ("doneJobs", ""))
self.conn.commit()
self._chmodJobDb()
def _chmodJobDb(self):
# umask is not respected by sqlite, bug http://www.mail-archive.com/[email protected]/msg59080.html
if not jobQueueUseMysql:
try:
os.chmod(JOBQUEUEDB, 0o666)
except OSError:
pass
def openMysql(self):
db = MySQLdb.connect(**jobQueueMysql)
self.conn = db.cursor()
def addJob(self, jobType, jobId, paramStr):
" create a new job, returns False if not successful "
self._chmodJobDb()
sql = 'INSERT INTO queue (jobType, jobId, isRunning, lastUpdate, ' \
'stepTimes, paramStr, stepName, stepLabel, startTime) VALUES (?, ?, ?, ?, ?, ?, ?, ?, datetime("now"))'
now = "%.3f" % time.time()
try:
self.conn.execute(sql, (jobType, jobId, 0, now, "", paramStr, "wait", "Waiting"))
self.conn.commit()
return True
except sqlite3.IntegrityError:
return False
except sqlite3.OperationalError:
errAbort("Cannot open DB file %s. Please contact %s" % (self.dbName, contactEmail))
def getStatus(self, jobId):
" return current job status label or None if job is not in queue"
sql = 'SELECT stepLabel FROM queue WHERE jobId=?'
try:
status = self.conn.execute(sql, (jobId,)).next()[0]
except StopIteration:
status = None
return status
def dump(self):
" for debugging, write the whole queue table to stdout "
sql = 'SELECT * FROM queue'
for row in self.conn.execute(sql):
print "\t".join([str(x) for x in row])
def jobInfo(self, jobId, isDone=False):
" for debugging, return all job info as a tuple "
if isDone:
sql = 'SELECT * FROM doneJobs WHERE jobId=?'
else:
sql = 'SELECT * FROM queue WHERE jobId=?'
try:
row = self.conn.execute(sql, (jobId,)).next()
except StopIteration:
return []
return row
def startStep(self, jobId, newName, newLabel):
" start a new step. Update lastUpdate, status and stepTime "
self.conn.execute('BEGIN IMMEDIATE') # lock db
sql = 'SELECT lastUpdate, stepTimes, stepName FROM queue WHERE jobId=?'
lastTime, timeStr, lastStep = self.conn.execute(sql, (jobId,)).next()
lastTime = float(lastTime)
# append a string in format "stepName:milliSecs" to the timeStr
now = time.time()
timeDiff = "%d" % int((1000.0*(now - lastTime)))
newTimeStr = timeStr+"%s=%s" % (lastStep, timeDiff)+","
sql = 'UPDATE queue SET lastUpdate=?, stepName=?, stepLabel=?, stepTimes=?, isRunning=? WHERE jobId=?'
self.conn.execute(sql, (now, newName, newLabel, newTimeStr, 1, jobId))
tryCount = 0
while tryCount < 10:
try:
self.conn.commit()
break
except sqlite3.OperationalError:
time.sleep(3)
tryCount += 1
if tryCount >= 10:
raise Exception("Database locked for a long time")
def jobDone(self, jobId):
" remove the job from the queue and add it to the queue log"
self.conn.execute('BEGIN IMMEDIATE') # lock db
sql = 'SELECT * FROM queue WHERE jobId=?'
try:
row = self.conn.execute(sql, (jobId,)).next()
except StopIteration:
# return if the job has already been removed
logging.warn("jobDone - jobs %s has been removed already" % jobId)
return
sql = 'DELETE FROM queue WHERE jobId=?'
self.conn.execute(sql, (jobId,))
sql = 'INSERT INTO doneJobs VALUES (?,?,?,?,?,?,?,?,?)'
self.conn.execute(sql, row)
self.conn.commit()
def waitCount(self):
" return number of waiting jobs "
sql = 'SELECT count(*) FROM queue WHERE isRunning=0'
count = self.conn.execute(sql).next()[0]
return count
def popJob(self):
" return (jobType, jobId, params) of first waiting job and set it to running state "
self.conn.execute('BEGIN IMMEDIATE') # lock db
sql = 'SELECT jobType, jobId, paramStr FROM queue WHERE isRunning=0 ORDER BY lastUpdate LIMIT 1'
try:
jobType, jobId, paramStr = self.conn.execute(sql).next()
except StopIteration:
self.conn.commit() # unlock db
return None, None, None
sql = 'UPDATE queue SET isRunning=1 where jobId=?'
self.conn.execute(sql, (jobId,))
self.conn.commit() # unlock db
return jobType, jobId, paramStr
def clearJobs(self):
" clear the job table, removing running jobs, too "
self.conn.execute("DELETE from queue")
#self.conn.execute("DROP TABLE queue")
self.conn.commit()
def close(self):
" "
self.conn.close()
# ====== FUNCTIONS =====
contentLineDone = False
def errAbort(msg):
" print err msg and exit "
if commandLineMode:
raise Exception(msg)
if not contentLineDone:
print "Content-type: text/html\n"
print('<div style="float:left; text-align:left; width: 800px">')
print("<strong>Error:</strong> ")
print(msg+"<p>")
print("If you think this is a bug or you have any other suggestions, please do not hesitate to email %s" % contactEmail)
print('</div>')
sys.exit(0) # cgi must not exit with 1
# allow only dashes, digits, characters, underscores and colons in the CGI parameters
# and +
notOkChars = re.compile(r'[^a-zA-Z0-9-_:+.]')
def checkVal(key, str):
""" remove special characters from input string, to protect against injection attacks """
if len(str) > 10000:
errAbort("input parameter %s is too long" % key)
if notOkChars.search(str):
errAbort("input parameter %s contains an invalid character" % key)
return str
def cgiGetParams():
" get CGI parameters and return as dict "
form = cgi.FieldStorage()
global cgiParams
cgiParams = {}
# parameters are:
#"pamId", "batchId", "pam", "seq", "org", "download", "sortBy", "format", "ajax
for key in form.keys():
val = form.getfirst(key)
if val!=None:
# "seq" is cleaned by cleanSeq later
val = urllib.unquote(val)
if key not in ["seq", "name"]:
checkVal(key, val)
cgiParams[key] = val
if "pam" in cgiParams:
if len(set(cgiParams["pam"])-set("ACTGNMKRY"))!=0:
errAbort("Illegal character in PAM-sequence. Only ACTGMKRY and N allowed.")
return cgiParams
transTab = string.maketrans("-=/+_", "abcde")
def makeTempBase(seq, org, pam, batchName):
"create the base name of temp files using a hash function and some prettyfication "
hasher = hashlib.sha1(seq+org+pam+batchName)
batchId = base64.urlsafe_b64encode(hasher.digest()[0:20]).translate(transTab)[:20]
return batchId
def makeTempFile(prefix, suffix):
" return a temporary file that is deleted upon exit, unless DEBUG is set "
if DEBUG:
fname = join("/tmp", prefix+suffix)
fh = open(fname, "w")
else:
fh = tempfile.NamedTemporaryFile(prefix="primer3In", suffix=".txt")
return fh
def saveSeqOrgPamToCookies(seq, org, pam):
" create a cookie with seq, org and pam and print it"
cookies=Cookie.SimpleCookie()
expires = 365 * 24 * 60 * 60
cookies['lastseq'] = seq
cookies['lastseq']['expires'] = expires
cookies['lastorg'] = org
cookies['lastorg']['expires'] = expires
cookies['lastpam'] = pam
cookies['lastpam']['expires'] = expires
print cookies
def debug(msg):
if commandLineMode:
logging.debug(msg)
elif DEBUG:
print msg
print "<br>"
def matchNuc(pat, nuc):
" returns true if pat (single char) matches nuc (single char) "
if pat in ["A", "C", "T", "G"] and pat==nuc:
return True
elif pat=="S" and nuc in ["G", "C"]:
return True
elif pat=="M" and nuc in ["A", "C"]:
return True
elif pat=="D" and nuc in ["AGT"]:
return True
elif pat=="W" and nuc in ["A", "T"]:
return True
elif pat=="K" and nuc in ["T", "G"]:
return True
elif pat=="R" and nuc in ["A", "G"]:
return True
elif pat=="Y" and nuc in ["C", "T"]:
return True
else:
return False
def gcContent(seq):
" return GC content as a float "
c = 0
for x in seq:
if x in ["G","C"]:
c+= 1
return (float(c)/len(seq))
def findPat(seq, pat):
""" yield positions where pat matches seq, stupid brute force search
"""
seq = seq.upper()
pat = pat.upper()
for i in range(0, len(seq)-len(pat)+1):
#print "new pos", i, seq[i:i+len(pat)],"<br>"
found = True
for x in range(0, len(pat)):
#print "new step", x, "<br>"
if pat[x]=="N":
#print "N","<br>"
continue
seqPos = i+x
if seqPos == len(seq):
found = False
break
if not matchNuc(pat[x], seq[seqPos]):
#if not patMatch(seq[seqPos], pat[x]):
#print i, x, pat[x], seq[seqPos], "no match<br>"
found = False
break
#print "match", i, x, found, "<br>"
if found:
#print "yielding", i, "<br>"
yield i
def rndSeq(seqLen):
" return random seq "
seq = []
alf = "ACTG"
for i in range(0, seqLen):
seq.append(alf[random.randint(0,3)])
return "".join(seq)
def cleanSeq(seq, db):
""" remove fasta header, check seq for illegal chars and return (filtered
seq, user message) special value "random" returns a random sequence.
"""
#print repr(seq)
if seq.startswith("random"):
seq = rndSeq(800)
lines = seq.strip().splitlines()
#print "<br>"
#print "before fasta cleaning", "|".join(lines)
if len(lines)>0 and lines[0].startswith(">"):
line1 = lines.pop(0)
#print "<br>"
#print "after fasta cleaning", "|".join(lines)
#print "<br>"
newSeq = []
nCount = 0
for l in lines:
if len(l)==0:
continue
for c in l:
if c not in "actgACTGNn":
nCount +=1
else:
newSeq.append(c)
seq = "".join(newSeq)
msgs = []
if len(seq)>MAXSEQLEN and db!="noGenome":
msgs.append("<strong>Sorry, this tool cannot handle sequences longer than 1kbp</strong><br>Below you find the results for the first %d bp of your input sequence.<br>" % MAXSEQLEN)
seq = seq[:MAXSEQLEN]
if len(seq)>MAXSEQLEN_NOGENOME and db=="noGenome":
msgs.append("<strong>Sorry, this tool cannot handle sequences longer than %d bp when specifying 'No Genome'.</strong><br>Below you find the results for the first %d bp of your input sequence.<br>" % (MAXSEQLEN_NOGENOME, MAXSEQLEN_NOGENOME))
seq = seq[:MAXSEQLEN_NOGENOME]
if nCount!=0:
msgs.append("Sequence contained %d non-ACTGN letters. They were removed." % nCount)
return seq, "<br>".join(msgs)
def revComp(seq):
" rev-comp a dna sequence with UIPAC characters "
revTbl = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A', 'N' : 'N' , 'M' : 'K', 'K' : 'M',
"R" : "Y" , "Y":"R" , "g":"c", "a":"t", "c":"g","t":"a", "n":"n"}
newSeq = []
for c in reversed(seq):
newSeq.append(revTbl[c])
return "".join(newSeq)
def findPams (seq, pam, strand, startDict, endSet):
""" return two values: dict with pos -> strand of PAM and set of end positions of PAMs
Makes sure to return only values with at least GUIDELEN bp left (if strand "+") or to the
right of the match (if strand "-")
If the PAM is TTTN, then this is inversed: pos-strand matches must have at least GUIDELEN
basepairs to the right, neg-strand matches must have at least GUIDELEN bp on their left
>>> findPams("GGGGGGGGGGGGGGGGGGGGGGG", "NGG", "+", {}, set(), False)
({20: '+'}, set([23]))
>>> findPams("CCAGCCCCCCCCCCCCCCCCCCC", "CCA", "-", {}, set(), False)
({0: '-'}, set([3]))
>>> findPams("TTTNCCCCCCCCCCCCCCCCCTTTN", "TTTN", "+", {}, set(), True)
({0: '+'}, set([3]))
>>> findPams("CCCCCCCCCCCCCCCCCCCCCAAAA", "NAA", "-", {}, set(), True
({}, set([]))
>>> findPams("AAACCCCCCCCCCCCCCCCCCCCC", "NAA", "-", {}, set(), True)
({}, set([]))
>>> findPams("CCCCCCCCCCCCCCCCCCCCCCCCCAA", "NAA", "-", {}, set(), True)
({}, set([]))
"""
assert(cpf1Mode is not None)
if cpf1Mode:
maxPosPlus = len(seq)-(GUIDELEN+len(pam))
minPosMinus = GUIDELEN
else:
# -------------------
# OKOKOKOKOK
minPosPlus = GUIDELEN
# -------------------
# OKOKOKOKOK
maxPosMinus = len(seq)-(GUIDELEN+len(pam))
#print "new search", seq, pam, "<br>"
for start in findPat(seq, pam):
if cpf1Mode:
# need enough flanking seq on one side
#print "found", start,"<br>"
if strand == "+" and start > maxPosPlus:
continue
if strand == "-" and start < minPosMinus:
continue
else:
if strand=="+" and start < minPosPlus:
continue
if strand=="-" and start > maxPosMinus:
continue
#print "match", strand, start, end, "<br>"
startDict[start] = strand
end = start+len(pam)
endSet.add(end)
return startDict, endSet
def rulerString(maxLen):
" return line with positions every 10 chars "
texts = []
for i in range(0, maxLen, 10):
numStr = str(i)
texts.append(numStr)
spacer = "".join([" "]*(10-len(numStr)))
texts.append(spacer)
return "".join(texts)
def varDictToHtml(varDict, seq):
" make a list of one html string per position in the sequence "
if varDict is None:
return None
varHtmls = []
for i in range(0, len(seq)):
if not i in varDict:
varHtmls.append(".")
else:
varHooverLines = []
showStar = False # show a star if change is non-simple SNP
varInfos = varDict[i]
for chrom, pos, refAll, altAll, infoDict in varInfos:
varHooverLines.append("%s → %s<br>" % (refAll, altAll))
if "freq" in infoDict:
varHooverLines.append(" <b>Freq:</b> %s<br>" % infoDict["freq"])
#if "dbg" in infoDict:
#varHooverLines.append("%s<br>" % infoDict["dbg"])
if "varId" in infoDict:
varHooverLines.append(" <b>ID:</b> %s<br>" % infoDict["varId"])
if "ExAC" in varDict["label"]:
endPos = int(pos)+len(refAll)
varHooverLines.append(' <a target=_blank href="http://exac.broadinstitute.org/region/%s-%s-%d">ExAC Browser</a><br>' % (chrom, pos, endPos))
if len(refAll)!=1 or len(altAll)!=1:
showStar = True
if len(varInfos)!=1:
showStar = True
varDesc = "".join(varHooverLines)
if showStar:
dispChar = "*"
else:
dispChar = altAll
varHtmls.append("<u class='tooltipsterInteract' title='%s'>%s</u>" % (varDesc, dispChar))
return varHtmls
def showSeqAndPams(seq, startDict, pam, guideScores, varHtmls, varDbs, varDb, minFreq):
" show the sequence and the PAM sites underneath in a sequence viewer "
lines, maxY = distrOnLines(seq.upper(), startDict, len(pam))
print "<div class='substep'>"
print '<a id="seqStart"></a>'
print "Found %d possible guide sequences in input (%d bp). Click on a PAM %s match to show its guide sequence.<br>" % (len(guideScores), len(seq), pam)
if not cpf1Mode:
print "Shown below are the PAM site and the expected cleavage position located -3bp 5' of the PAM site.<br>"
print '''Colors <span style="color:#32cd32; text-shadow: 1px 1px 1px #bbb">green</span>, <span style="color:#ffff00; text-shadow: 1px 1px 1px #888">yellow</span> and <span style="text-shadow: 1px 1px 1px #f01; color:#aa0014">red</span> indicate high, medium and low specificity of the PAM's guide sequence in the genome.<br>'''
else:
print ""
if varDb is not None:
print("""<form style="display:inline" id="paramForm" action="%s" method="GET">""" % basename(__file__))
print ("Variant database:")
varDbList = [(b,c) for a,b,c,d in varDbs] # only keep fname+label
printDropDown("varDb", varDbList, varDb)
if minFreq==0.0:
minFreq="0.0"
else:
minFreq = str(minFreq)
# pull out the hasAF field for this varDb
varDbHasAF = False
for shortLabel, fname, desc, hasAF in varDbs:
if fname==varDb:
varDbHasAF = hasAF
break
if varDbHasAF:
print(""" Min. frequency: """)
print("""<input type="text" name="minFreq" size="8" value="%s">""" % minFreq)
print("""<input style="height:18px;margin:0px;font-size:10px;line-height:normal" type="submit" name="submit" value="Update">""")
print "</div>"
print '''<div style="text-align: left; overflow-x:scroll; width:100%; background:#DDDDDD; border-style: solid; border-width: 1px">'''
print '<pre style="font-size: 80%; display:inline; line-height: 0.95em; text-align:left">'+rulerString(len(seq))
if varHtmls is not None:
print "".join(varHtmls)
print seq
for y in range(0, maxY+1):
texts = []
lastEnd = 0
for start, end, name, strand, pamId in lines[y]:
spacer = "".join([" "]*((start-lastEnd)))
lastEnd = end
texts.append(spacer)
score = guideScores[pamId]
color = scoreToColor(score)
texts.append('''<a style="text-shadow: 1px 1px 1px #bbb; color: %s" id="list%s" href="#%s">''' % (color, pamId,pamId))
texts.append(name)
texts.append("</a>")
print(u''.join(texts).encode("utf8"))
print("</pre><br>")
print '''</div>'''
if cpf1Mode:
print('<div style="line-height: 1.0; padding-top: 5px; font-size: 15px">Cpf1 has a staggered site: cleavage occurs after the 18th base on the non-targeted strand which has the TTTN PAM motif (indicate by "\\" in the schema above). Cleavage occurs after the 23rd base on the targeted strand which has the AAAN motif (indicated by "/" in the schema above). See <a target=_blank href="http://www.sciencedirect.com/science/article/pii/S0092867415012003">Zetsche et al 2015</a>, in particular <a target=_blank href="http://www.sciencedirect.com/science?_ob=MiamiCaptionURL&_method=retrieve&_eid=1-s2.0-S0092867415012003&_image=1-s2.0-S0092867415012003-gr3.jpg&_cid=272196&_explode=defaultEXP_LIST&_idxType=defaultREF_WORK_INDEX_TYPE&_alpha=defaultALPHA&_ba=&_rdoc=1&_fmt=FULL&_issn=00928674&_pii=S0092867415012003&md5=11771263f3e390e444320cacbcfae323">Fig 3</a>.</div>')
def iterOneDelSeqs(seq):
""" given a seq, create versions with each bp removed. Avoid duplicates
yields (delPos, seq)
>>> list(iterOneDelSeqs("AATGG"))
[(0, 'ATGG'), (2, 'AAGG'), (3, 'AATG')]
"""
doneSeqs = set()
for i in range(0, len(seq)):
delSeq = seq[:i]+seq[i+1:]
if delSeq not in doneSeqs:
yield i, delSeq
doneSeqs.add(delSeq)
def flankSeqIter(seq, startDict, pamLen, doFilterNs):
""" given a seq and dictionary of pos -> strand and the length of the pamSite
yield tuples of (name, pamStart, guideStart, strand, flankSeq, pamSeq)
if doFilterNs is set, will not return any sequences that contain an N character
"""
startList = sorted(startDict.keys())
for pamStart in startList:
strand = startDict[pamStart]
if cpf1Mode: # Cpf1: get the sequence to the right of the PAM
if strand=="+":
guideStart = pamStart+pamLen
flankSeq = seq[guideStart:guideStart+GUIDELEN]
pamSeq = seq[pamStart:pamStart+pamLen]
else: # strand is minus
guideStart = pamStart-GUIDELEN
flankSeq = revComp(seq[guideStart:pamStart])
pamSeq = revComp(seq[pamStart:pamStart+pamLen])
else: # common case: get the sequence on the left side of the PAM
if strand=="+":
guideStart = pamStart-GUIDELEN
flankSeq = seq[guideStart:pamStart]
pamSeq = seq[pamStart:pamStart+pamLen]
else: # strand is minus
guideStart = pamStart+pamLen
flankSeq = revComp(seq[guideStart:guideStart+GUIDELEN])
pamSeq = revComp(seq[pamStart:pamStart+pamLen])
if "N" in flankSeq and doFilterNs:
continue
yield "s%d%s" % (pamStart, strand), pamStart, guideStart, strand, flankSeq, pamSeq
def makeBrowserLink(dbInfo, pos, text, title, cssClasses):
" return link to genome browser (ucsc or ensembl) at pos, with given text "
if dbInfo.server.startswith("Ensembl"):
baseUrl = "www.ensembl.org"
urlLabel = "Ensembl"
if dbInfo.server=="EnsemblPlants":
baseUrl = "plants.ensembl.org"
elif dbInfo.server=="EnsemblMetazoa":
baseUrl = "metazoa.ensembl.org"
elif dbInfo.server=="EnsemblProtists":
baseUrl = "protists.ensembl.org"
org = dbInfo.scientificName.replace(" ", "_")
url = "http://%s/%s/Location/View?r=%s" % (baseUrl, org, pos)
elif dbInfo.server=="ucsc":
urlLabel = "UCSC"
if pos[0].isdigit():
pos = "chr"+pos
url = "http://genome.ucsc.edu/cgi-bin/hgTracks?db=%s&position=%s" % (dbInfo.name, pos)
# some limited support for gbrowse
elif dbInfo.server.startswith("http://"):
urlLabel = "GBrowse"
chrom, start, end, strand = parsePos(pos)
start = start+1
url = "%s/?name=%s:%d..%d" % (dbInfo.server, chrom, start, end)
else:
#return "unknown genome browser server %s, please email [email protected]" % dbInfo.server
urlLabel = None
url = "javascript:void(0)"
classStr = ""
if len(cssClasses)!=0:
classStr = ' class="%s"' % (" ".join(cssClasses))
if title is None:
if urlLabel != None:
title = "Link to %s Genome Browser" % urlLabel
else:
title = "No Genome Browser link available yet for this organism"
return '''<a title="%s"%s target="_blank" href="%s">%s</a>''' % (title, classStr, url, text)
def highlightMismatches(guide, offTarget, pamLen):
" return a string that marks mismatches between guide and offtarget with * "
if cpf1Mode:
offTarget = offTarget[pamLen:]
else:
offTarget = offTarget[:-pamLen]
assert(len(guide)==len(offTarget))
s = []
for x, y in zip(guide, offTarget):
if x==y:
s.append(".")
else:
s.append("*")
return "".join(s)
def makeAlnStr(seq1, seq2, pam, mitScore, cfdScore, posStr):
" given two strings of equal length, return a html-formatted string that highlights the differences "
lines = [ [], [], [] ]
last12MmCount = 0
if cpf1Mode: