From 8fb1626118198ec0ac538075eb02d7f5249494b2 Mon Sep 17 00:00:00 2001
From: Jameson Nash
Date: Wed, 14 Apr 2021 16:21:43 -0400
Subject: [PATCH] use appropriate optimizations in regex_dna

The multi-replace method is being added in
https://github.com/JuliaLang/julia/pull/40484
---
Review notes (text between the "---" line and the diff is not part of the
commit message):

* Purpose of the change: store the variant patterns as precompiled Regex
  literals instead of building a Regex per call, and perform the IUPAC code
  substitutions with a single multi-pair replace when that method exists,
  falling back to one pass per pair otherwise.
* The line structure and indentation of this patch were reconstructed from a
  whitespace-collapsed copy. 4-space indentation and the position of blank
  context lines are assumed; verify them against the target file before
  applying.
* k is now bound at function scope before the variants loop. Without that
  binding k is local to the for body, so the final "seq, k" would refer to
  an undefined name (unless k is bound in the part of the function that this
  patch does not show).
* The stray "seq = replace(seq, subs...)" was dropped from the disabled
  "elseif false" branch; if that branch were ever enabled it would either
  throw or make the regex pass that follows it redundant.
* The post-image blob id on the "index" line is the one from the original
  commit and does not describe the amended content.

 src/shootout/regex_dna.jl | 68 ++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/src/shootout/regex_dna.jl b/src/shootout/regex_dna.jl
index 8e519fde..63320493 100644
--- a/src/shootout/regex_dna.jl
+++ b/src/shootout/regex_dna.jl
@@ -5,30 +5,30 @@
 # Fix from David Campbell
 
 const variants = [
-    "agggtaaa|tttaccct",
-    "[cgt]gggtaaa|tttaccc[acg]",
-    "a[act]ggtaaa|tttacc[agt]t",
-    "ag[act]gtaaa|tttac[agt]ct",
-    "agg[act]taaa|ttta[agt]cct",
-    "aggg[acg]aaa|ttt[cgt]ccct",
-    "agggt[cgt]aa|tt[acg]accct",
-    "agggta[cgt]a|t[acg]taccct",
-    "agggtaa[cgt]|[acg]ttaccct"
+    r"agggtaaa|tttaccct",
+    r"[cgt]gggtaaa|tttaccc[acg]",
+    r"a[act]ggtaaa|tttacc[agt]t",
+    r"ag[act]gtaaa|tttac[agt]ct",
+    r"agg[act]taaa|ttta[agt]cct",
+    r"aggg[acg]aaa|ttt[cgt]ccct",
+    r"agggt[cgt]aa|tt[acg]accct",
+    r"agggta[cgt]a|t[acg]taccct",
+    r"agggtaa[cgt]|[acg]ttaccct"
 ]
 
-const subs = [
-    (r"B", "(c|g|t)"),
-    (r"D", "(a|g|t)"),
-    (r"H", "(a|c|t)"),
-    (r"K", "(g|t)"),
-    (r"M", "(a|c)"),
-    (r"N", "(a|c|g|t)"),
-    (r"R", "(a|g)"),
-    (r"S", "(c|g)"),
-    (r"V", "(a|c|g)"),
-    (r"W", "(a|t)"),
-    (r"Y", "(c|t)")
-]
+const subs = (
+    ("B" => "(c|g|t)"),
+    ("D" => "(a|g|t)"),
+    ("H" => "(a|c|t)"),
+    ("K" => "(g|t)"),
+    ("M" => "(a|c)"),
+    ("N" => "(a|c|g|t)"),
+    ("R" => "(a|g)"),
+    ("S" => "(c|g)"),
+    ("V" => "(a|c|g)"),
+    ("W" => "(a|t)"),
+    ("Y" => "(c|t)")
+)
 
 function perf_regex_dna()
     infile = joinpath(SHOOTOUT_DATA_PATH, "regexdna-input.txt")
@@ -40,18 +40,26 @@ function perf_regex_dna()
 
+    k = 0  # bind k at function scope so the final `seq, k` can see the loop's counter
     for v in variants
         k = 0
-        for m in eachmatch(Regex(v), seq)
+        for m in eachmatch(v, seq)
             k += 1
         end
-#        @printf("%s %d\n", v, k)
     end
 
-    for (u, v) in subs
-        seq = replace(seq, u => v)
+    if applicable(replace, seq, subs...)
+        # VERSION > 1.7-dev
+        seq = replace(seq, subs...)
+    elseif false
+        # semi-optimized regex (disabled; kept as a reference alternative)
+        r = Regex(join(first.(subs), "|"))
+        repl = Dict(subs)
+        seq = replace(seq, r => (r -> repl[r]))
+    else
+        # multiple passes
+        for sub in subs
+            seq = replace(seq, sub)
+        end
     end
 
-#    println()
-#    println(l1)
-#    println(l2)
-#    println(length(seq))
+    seq, k
 end