NOTE: line breaks and indentation in this patch were reconstructed from a
whitespace-collapsed copy; hunk line counts were re-verified, but exact leading
whitespace is a best guess. If strict application fails, use
`git apply --ignore-whitespace`.

diff --git a/src/shootout/regex_dna.jl b/src/shootout/regex_dna.jl
index 8e519fd..887d674 100644
--- a/src/shootout/regex_dna.jl
+++ b/src/shootout/regex_dna.jl
@@ -5,30 +5,30 @@
 # Fix from David Campbell
 
 const variants = [
-      "agggtaaa|tttaccct",
-      "[cgt]gggtaaa|tttaccc[acg]",
-      "a[act]ggtaaa|tttacc[agt]t",
-      "ag[act]gtaaa|tttac[agt]ct",
-      "agg[act]taaa|ttta[agt]cct",
-      "aggg[acg]aaa|ttt[cgt]ccct",
-      "agggt[cgt]aa|tt[acg]accct",
-      "agggta[cgt]a|t[acg]taccct",
-      "agggtaa[cgt]|[acg]ttaccct"
+      r"agggtaaa|tttaccct",
+      r"[cgt]gggtaaa|tttaccc[acg]",
+      r"a[act]ggtaaa|tttacc[agt]t",
+      r"ag[act]gtaaa|tttac[agt]ct",
+      r"agg[act]taaa|ttta[agt]cct",
+      r"aggg[acg]aaa|ttt[cgt]ccct",
+      r"agggt[cgt]aa|tt[acg]accct",
+      r"agggta[cgt]a|t[acg]taccct",
+      r"agggtaa[cgt]|[acg]ttaccct"
 ]
 
-const subs = [
-    (r"B", "(c|g|t)"),
-    (r"D", "(a|g|t)"),
-    (r"H", "(a|c|t)"),
-    (r"K", "(g|t)"),
-    (r"M", "(a|c)"),
-    (r"N", "(a|c|g|t)"),
-    (r"R", "(a|g)"),
-    (r"S", "(c|g)"),
-    (r"V", "(a|c|g)"),
-    (r"W", "(a|t)"),
-    (r"Y", "(c|t)")
-]
+const subs = (
+    ("B" => "(c|g|t)"),
+    ("D" => "(a|g|t)"),
+    ("H" => "(a|c|t)"),
+    ("K" => "(g|t)"),
+    ("M" => "(a|c)"),
+    ("N" => "(a|c|g|t)"),
+    ("R" => "(a|g)"),
+    ("S" => "(c|g)"),
+    ("V" => "(a|c|g)"),
+    ("W" => "(a|t)"),
+    ("Y" => "(c|t)")
+)
 
 function perf_regex_dna()
     infile = joinpath(SHOOTOUT_DATA_PATH, "regexdna-input.txt")
@@ -38,20 +38,29 @@ function perf_regex_dna()
     seq = replace(seq, r">.*\n|\n" => "")
     l2 = length(seq)
 
+    kk = 0
     for v in variants
         k = 0
-        for m in eachmatch(Regex(v), seq)
+        for m in eachmatch(v, seq)
             k += 1
         end
-#        @printf("%s %d\n", v, k)
+        kk += k
     end
 
-    for (u, v) in subs
-        seq = replace(seq, u => v)
+    try
+        # VERSION > 1.7-dev
+        seq = replace(seq, subs...)
+    catch ex
+        ex isa MethodError || rethrow()
+        # semi-optimized regex
+        r = Regex(join(first.(subs), "|"))
+        repl = Dict(subs)
+        seq = replace(seq, r => (r -> repl[r]))
+        ## multiple passes
+        #for sub in subs
+        #    seq = replace(seq, sub)
+        #end
     end
 
-#    println()
-#    println(l1)
-#    println(l2)
-#    println(length(seq))
+    seq, kk
 end