Skip to content

Commit

Permalink
use appropriate optimizations in regex_dna
Browse files Browse the repository at this point in the history
The multi-replace method is being added in JuliaLang/julia#40484
  • Loading branch information
vtjnash committed Apr 14, 2021
1 parent dd25659 commit 94dba74
Showing 1 changed file with 39 additions and 30 deletions.
69 changes: 39 additions & 30 deletions src/shootout/regex_dna.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,30 @@
# Fix from David Campbell

# Patterns whose matches `perf_regex_dna` counts in the input sequence.
# Each entry is an alternation of two 8-position motifs; in every entry but
# the first, one position is widened to a three-letter character class.
# NOTE(review): this listing is a commit diff, so every pattern appears in two
# forms: as a plain string (compiled at the use site with `Regex(v)`) and as
# an `r"..."` Regex literal (passed straight to `eachmatch(v, seq)`).
# Presumably the string entries are the removed side -- confirm against the
# actual file.
const variants = [
"agggtaaa|tttaccct",
"[cgt]gggtaaa|tttaccc[acg]",
"a[act]ggtaaa|tttacc[agt]t",
"ag[act]gtaaa|tttac[agt]ct",
"agg[act]taaa|ttta[agt]cct",
"aggg[acg]aaa|ttt[cgt]ccct",
"agggt[cgt]aa|tt[acg]accct",
"agggta[cgt]a|t[acg]taccct",
"agggtaa[cgt]|[acg]ttaccct"
r"agggtaaa|tttaccct",
r"[cgt]gggtaaa|tttaccc[acg]",
r"a[act]ggtaaa|tttacc[agt]t",
r"ag[act]gtaaa|tttac[agt]ct",
r"agg[act]taaa|ttta[agt]cct",
r"aggg[acg]aaa|ttt[cgt]ccct",
r"agggt[cgt]aa|tt[acg]accct",
r"agggta[cgt]a|t[acg]taccct",
r"agggtaa[cgt]|[acg]ttaccct"
]

# Substitution table as a vector of `(Regex, String)` tuples: each
# single-letter uppercase pattern is replaced by a parenthesised alternation
# of lowercase letters. Consumed one entry at a time by the
# `for (u, v) in subs; seq = replace(seq, u => v); end` loop, i.e. one full
# pass over the sequence per entry.
# NOTE(review): the letters look like IUPAC nucleotide ambiguity codes, and
# this form appears to be the removed (pre-commit) side of the diff -- confirm.
const subs = [
(r"B", "(c|g|t)"),
(r"D", "(a|g|t)"),
(r"H", "(a|c|t)"),
(r"K", "(g|t)"),
(r"M", "(a|c)"),
(r"N", "(a|c|g|t)"),
(r"R", "(a|g)"),
(r"S", "(c|g)"),
(r"V", "(a|c|g)"),
(r"W", "(a|t)"),
(r"Y", "(c|t)")
]
# Substitution table as a tuple of `String => String` pairs: each
# single-letter uppercase key maps to a parenthesised alternation of lowercase
# letters. Being a tuple of pairs, it can be splatted directly into
# `replace(seq, subs...)`; the fallback path in `perf_regex_dna` reuses the
# same pairs to build one combined Regex (`join(first.(subs), "|")`) and a
# lookup `Dict(subs)`.
# The parentheses around each pair are only grouping; every element is a Pair.
# NOTE(review): the keys look like IUPAC nucleotide ambiguity codes -- confirm.
const subs = (
("B" => "(c|g|t)"),
("D" => "(a|g|t)"),
("H" => "(a|c|t)"),
("K" => "(g|t)"),
("M" => "(a|c)"),
("N" => "(a|c|g|t)"),
("R" => "(a|g)"),
("S" => "(c|g)"),
("V" => "(a|c|g)"),
("W" => "(a|t)"),
("Y" => "(c|t)")
)

function perf_regex_dna()
infile = joinpath(SHOOTOUT_DATA_PATH, "regexdna-input.txt")
Expand All @@ -38,20 +38,29 @@ function perf_regex_dna()
seq = replace(seq, r">.*\n|\n" => "")
l2 = length(seq)

kk = 0
for v in variants
k = 0
for m in eachmatch(Regex(v), seq)
for m in eachmatch(v, seq)
k += 1
end
# @printf("%s %d\n", v, k)
kk += k
end

for (u, v) in subs
seq = replace(seq, u => v)
try
# VERSION > 1.7-dev
seq = replace(seq, subs...)
catch ex
ex isa MethodError || rethrow()
# semi-optimized regex
r = Regex(join(first.(subs), "|"))
repl = Dict(subs)
seq = replace(seq, r => (r -> repl[r]))
## multiple passes
#for sub in subs
# seq = replace(seq, sub)
#end
end

# println()
# println(l1)
# println(l2)
# println(length(seq))
seq, kk
end

0 comments on commit 94dba74

Please sign in to comment.