Skip to content

Commit

Permalink
adding benchmark and improved gammackpar for highly unique columns
Browse files Browse the repository at this point in the history
  • Loading branch information
jw2249a committed May 28, 2024
1 parent c33b6ab commit 0ceb626
Show file tree
Hide file tree
Showing 10 changed files with 256 additions and 38 deletions.
Binary file added ._benchmark.csv
Binary file not shown.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"

[extras]
Expand Down
9 changes: 9 additions & 0 deletions benchmark.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"allocs","time_elapsed","N_dfA","N_dfB","sim_num","uniqueobsa_FIRST_NAME","uniqueobsa_FIRST_NAME1","uniqueobsa_MIDDLE_NAME","uniqueobsa_MIDDLE_NAME1","uniqueobsa_STREET_NAME","uniqueobsa_STREET_NAME1","uniqueobsb_FIRST_NAME","uniqueobsb_FIRST_NAME1","uniqueobsb_MIDDLE_NAME","uniqueobsb_MIDDLE_NAME1","uniqueobsb_STREET_NAME","uniqueobsb_STREET_NAME1","time_FIRST_NAME","time_FIRST_NAME1","time_MIDDLE_NAME","time_MIDDLE_NAME1","time_STREET_NAME","time_STREET_NAME1"
4.616895088e9,6.677211046218872,10000.0,100.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,54.0,54.0,49.0,49.0,76.0,76.0,0.008161067962646484,0.2337639331817627,0.012760162353515625,0.10138893127441406,0.04068493843078613,0.24126601219177246
4.949848048e9,7.091861963272095,10000.0,200.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,76.0,76.0,86.0,86.0,158.0,158.0,0.021563053131103516,0.4409921169281006,0.02609109878540039,0.11256885528564453,0.06923508644104004,0.6743419170379639
2.0483332936e10,4.617326021194458,10000.0,500.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,132.0,132.0,173.0,173.0,366.0,366.0,0.04845905303955078,0.7306158542633057,0.1388530731201172,0.697188138961792,1.3162028789520264,0.6631791591644287
4.1047003248e10,14.433804035186768,10000.0,1000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,201.0,201.0,298.0,298.0,710.0,710.0,0.2675011157989502,1.2603449821472168,0.9462311267852783,0.7218730449676514,3.2080440521240234,1.4295718669891357
7.6155148776e10,24.44671106338501,10000.0,2000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,294.0,294.0,508.0,508.0,1260.0,1260.0,0.14083600044250488,2.116300106048584,2.3114588260650635,0.6510841846466064,7.969947814941406,3.070338010787964
1.75658277856e11,47.291542053222656,10000.0,5000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,569.0,569.0,1149.0,1149.0,2569.0,2569.0,1.3489818572998047,6.330427885055542,6.920858144760132,3.821377992630005,11.075815916061401,6.40576696395874
3.60736019104e11,90.80747604370117,10000.0,10000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,900.0,900.0,2178.0,2178.0,4844.0,4844.0,0.6437540054321289,20.855478048324585,6.600561857223511,6.436399936676025,27.533162117004395,15.035200834274292
7.06118755624e11,170.87631511688232,10000.0,20000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,1408.0,1408.0,3541.0,3541.0,7713.0,7713.0,3.5316689014434814,42.065459966659546,10.992216110229492,11.271030902862549,37.30153822898865,26.96807599067688
9 changes: 9 additions & 0 deletions benchmark_tf.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"allocs","time_elapsed","N_dfA","N_dfB","sim_num","uniqueobsa_FIRST_NAME","uniqueobsa_FIRST_NAME1","uniqueobsa_MIDDLE_NAME","uniqueobsa_MIDDLE_NAME1","uniqueobsa_STREET_NAME","uniqueobsa_STREET_NAME1","uniqueobsb_FIRST_NAME","uniqueobsb_FIRST_NAME1","uniqueobsb_MIDDLE_NAME","uniqueobsb_MIDDLE_NAME1","uniqueobsb_STREET_NAME","uniqueobsb_STREET_NAME1","time_FIRST_NAME","time_FIRST_NAME1","time_MIDDLE_NAME","time_MIDDLE_NAME1","time_STREET_NAME","time_STREET_NAME1"
3.280123392e9,13.693497896194458,10000.0,100.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,54.0,54.0,49.0,49.0,76.0,76.0,0.5871648788452148,0.8367969989776611,0.6334218978881836,1.55326509475708,0.7466540336608887,0.5079309940338135
4.947232528e9,7.505352973937988,10000.0,200.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,76.0,76.0,86.0,86.0,158.0,158.0,0.019359111785888672,0.5516459941864014,0.03357195854187012,0.034218788146972656,0.16811299324035645,0.8470368385314941
2.0483903904e10,5.700722932815552,10000.0,500.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,132.0,132.0,173.0,173.0,366.0,366.0,0.2061920166015625,0.9722309112548828,0.2941570281982422,0.45786404609680176,1.4087908267974854,1.0088179111480713
4.1029842568e10,16.80856680870056,10000.0,1000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,201.0,201.0,298.0,298.0,710.0,710.0,0.10366487503051758,2.1447150707244873,1.162208080291748,1.1916310787200928,3.1355249881744385,1.7991409301757812
7.6141552768e10,26.468062162399292,10000.0,2000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,294.0,294.0,508.0,508.0,1260.0,1260.0,0.5798399448394775,4.4384400844573975,1.065039873123169,1.2449331283569336,5.390944004058838,2.651992082595825
1.7557405088e11,56.88152098655701,10000.0,5000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,569.0,569.0,1149.0,1149.0,2569.0,2569.0,1.4824249744415283,11.301899909973145,2.204904079437256,3.3433010578155518,12.95470905303955,9.015083074569702
3.61094355512e11,115.79980397224426,10000.0,10000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,900.0,900.0,2178.0,2178.0,4844.0,4844.0,0.7947039604187012,25.837323904037476,6.860728979110718,7.348973035812378,31.153676986694336,19.69862699508667
7.05929703312e11,211.53230595588684,10000.0,20000.0,1.0,1204.0,1204.0,2247.0,2247.0,6009.0,6009.0,1408.0,1408.0,3541.0,3541.0,7713.0,7713.0,1.6117851734161377,50.610339879989624,12.198137998580933,12.091246843338013,44.39586901664734,32.71300005912781
129 changes: 95 additions & 34 deletions src/benchmark.jl
Original file line number Diff line number Diff line change
@@ -1,73 +1,134 @@
using Base: postoutput
using Pkg
Pkg.develop(path="..")
using FastLink
using DataFrames
using BenchmarkTools
using CSV
import Pkg.Artifacts: @artifact_str
using Profile

outputfile = "../benchmark.csv"

outputfile = "../benchmark.csv"
outputfile_tf = "../benchmark_tf.csv"
include("utils/prettyprinting.jl")

a_fil="../../../rstudio/test_merge/data/test_a.csv"
b_fil="../../../rstudio/test_merge/data/test_b.csv"

sims=10
N1=10_000
N2_N=[100,200,500,1000,2000,5000,10_000,20_000,50_000,100_000,200_000,500_000,750_000,1_000_000]


config = Dict("link_type"=>"link_only",
"idvar"=> ["TV_ID", "TS_ID"],
"comparisons"=> Dict("name" => "total",
"threshold_match" => 0.88,
"variables" => [
Dict("varname" => "FIRST_NAME",
"method" => "jarowinkler",
"tf_adjust" => true),
"method" => "jarowinkler"),
Dict("varname" => "FIRST_NAME1",
"method" => "jarowinkler"),
Dict("varname" => "MIDDLE_NAME",
"method" => "exact",
"tf_adjust" => true),
"method" => "exact"),
Dict("varname" => "MIDDLE_NAME1",
"method" => "exact"),
Dict("varname" => "STREET_NAME",
"method" => "jarowinkler",
"tf_adjust" => true) ]))

"method" => "jarowinkler"),
Dict("varname" => "STREET_NAME1",
"method" => "jarowinkler")]))

config_tf = Dict("link_type"=>"link_only",
"idvar"=> ["TV_ID", "TS_ID"],
"comparisons"=> Dict("name" => "total",
"threshold_match" => 0.88,
"variables" => [
Dict("varname" => "FIRST_NAME",
"method" => "jarowinkler"),
"method" => "jarowinkler",
"tf_adjust" => true),
Dict("varname" => "FIRST_NAME1",
"method" => "jarowinkler",
"tf_adjust" => true),
Dict("varname" => "MIDDLE_NAME",
"method" => "exact"),
"method" => "exact",
"tf_adjust" => true),
Dict("varname" => "MIDDLE_NAME1",
"method" => "exact",
"tf_adjust" => true),
Dict("varname" => "STREET_NAME",
"method" => "jarowinkler")]))
"method" => "jarowinkler",
"tf_adjust" => true),
Dict("varname" => "STREET_NAME1",
"method" => "jarowinkler",
"tf_adjust" => true) ]))

# creating output files for the benchmarks
vars=retrieve(config, "varname")
filheadernames=append!(["allocs", "time_elapsed", "N_dfA", "N_dfB", "sim_num"],
"uniqueobsa_" .* vars,
"uniqueobsb_" .* vars,
"time_" .* vars)

filheader=*((v != last(filheadernames) ? "\"$v\"," : "\"$v\"\n" for v in filheadernames)...)
open(outputfile, "w") do file
write(file, filheader)
end
open(outputfile_tf, "w") do file
write(file, filheader)
end

open(outputfile, "w" do file
write(file,"\"N1\",\"N2\",\"u_FIRST_NAME\",\"u_MIDDLE_NAME\"")

for sim_num in 1:sims
for N2 in N2_N

N2=20_000
N1_N=[10_000,50_000,100_000,500_000,750_000,1_000_000]
println("## $(length(varnames)) vars")
for N1 in N1_N
@info center_in_line("(sim $sim_num, $(pretty_print_number(N1)) x $(pretty_print_number(N2)) )")
dfA=CSV.read(a_fil, DataFrame,
limit=N1,
ignoreemptyrows=true,
ntasks=1,
pool=true,
missingstring=["", "NA", "NaN", "NULL", "Null"])

dfB=CSV.read(b_fil, DataFrame,
limit=N2,
ignoreemptyrows=true,
ntasks=1,
pool=true,
missingstring=["", "NA", "NaN", "NULL", "Null"])

println(center_in_line("( $(pretty_print_number(N1)) x $(pretty_print_number(N2)) )"))
dfA=CSV.read(a_fil, DataFrame,
limit=N1,
ignoreemptyrows=true,
ntasks=1,
pool=true,
missingstring=["", "NA", "NaN", "NULL", "Null"])
dfA.STREET_NAME1 = Vector(dfA.STREET_NAME)
dfB.STREET_NAME1 = Vector(dfB.STREET_NAME)
dfA.FIRST_NAME1 = Vector(dfA.FIRST_NAME)
dfB.FIRST_NAME1 = Vector(dfB.FIRST_NAME)
dfA.MIDDLE_NAME1 = Vector(dfA.MIDDLE_NAME)
dfB.MIDDLE_NAME1 = Vector(dfB.MIDDLE_NAME)
@info "running no tf"
let res_allocs = @allocated fastLink(dfA,dfB,config)

stime = time()
res = fastLink(dfA,dfB,config, true)
telapsed = time() - stime

dfB=CSV.read(b_fil, DataFrame,
limit=N2,
ignoreemptyrows=true,
ntasks=1,
pool=true,
missingstring=["", "NA", "NaN", "NULL", "Null"])

res = @benchmark fastLink($dfA,$dfB,$config)
res=DataFrame(append!(["allocs"=>res_allocs, "time_elapsed"=>telapsed,"N1"=>N1,"N2"=>N2, "sim_num"=>sim_num],
["uniqueobsa_$v"=>length(unique(dfA[:,v])) for v in vars],
["uniqueobsb_$v"=>length(unique(dfB[:,v])) for v in vars],
["time_$(vars[i])"=>res["benchtimes"][i] for i in 1:length(vars)]))
CSV.write(outputfile, res, writeheader = false, append = true)
end
@info "running tf"
# Allocation measurement must use the same tf-adjusted configuration as the
# timed run below; measuring with `config` would record the non-tf run's
# allocations in the tf results file.
let res_allocs = @allocated fastLink(dfA, dfB, config_tf)

    # Wall-clock timing of a second, separate run with per-variable bench times enabled.
    stime = time()
    res = fastLink(dfA, dfB, config_tf, true)
    telapsed = time() - stime

    # One result row: totals, unique-value counts per variable in each frame,
    # then the per-variable timings read from res["benchtimes"].
    res=DataFrame(append!(["allocs"=>res_allocs,"time_elapsed"=>telapsed,"N1"=>N1,"N2"=>N2, "sim_num"=>sim_num],
                          ["uniqueobsa_$v"=>length(unique(dfA[:,v])) for v in vars],
                          ["uniqueobsb_$v"=>length(unique(dfB[:,v])) for v in vars],
                          ["time_$(vars[i])"=>res["benchtimes"][i] for i in 1:length(vars)]))
    # The header was already written when the file was created, so append rows only.
    CSV.write(outputfile_tf, res, writeheader = false, append = true)
end



end
end
2 changes: 1 addition & 1 deletion src/fastlink/fastlink.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benc
# allow missing for comparisons
allowmissing!(dfA)
allowmissing!(dfB)

benchtimes = []
for v in varnames
starttime = time()
Expand Down
2 changes: 1 addition & 1 deletion src/gammas/Gammas.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module Gammas
import PooledArrays: PooledVector, PooledArray
import StringDistances: Jaro, JaroWinkler, Levenshtein, DamerauLevenshtein, compare

import StatsBase: countmap
using ..DiBitMat
import ..nonmatch, ..match1, ..match2, ..missingval

Expand Down
139 changes: 139 additions & 0 deletions src/gammas/gammaCKpar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,26 @@ function score_value(dist::Float64,indices_x::Vector{<:Integer},indices_y::Vecto
return nothing
end

"""
    score_single2(dist, ix, iy, cut_a, cut_b, results)

Record the two-level comparison outcome for the single pair `(ix, iy)`.

Writes `match2` into `results[ix, iy]` when `dist >= cut_a`; otherwise writes
`match1` when `dist >= cut_b`. When neither cutoff is reached the cell is not
written. Always returns `nothing`.
"""
function score_single2(dist::Float64, ix::A, iy::A, cut_a::Float64, cut_b::Float64,
                       results::DiBitMatrix) where {A<:Integer}
    # Upper cutoff takes precedence over the partial-match cutoff.
    if dist >= cut_a
        results[ix, iy] = match2
        return nothing
    end
    dist >= cut_b && (results[ix, iy] = match1)
    return nothing
end

"""
    score_single(dist, ix, iy, cut_a, cut_b, results)

Record a single-cutoff comparison outcome for the pair `(ix, iy)`.

Writes `match2` into `results[ix, iy]` when `dist >= cut_a`; otherwise the cell
is not written. `cut_b` is accepted so the signature matches `score_single2`,
but it is not used. Always returns `nothing`.
"""
function score_single(dist::Float64, ix::A, iy::A, cut_a::Float64, cut_b::Float64,
                      results::DiBitMatrix) where {A<:Integer}
    dist >= cut_a && (results[ix, iy] = match2)
    return nothing
end




"""
String comparison of two columns with partial match.
Expand Down Expand Up @@ -214,3 +234,122 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector,
return nothing
end

# handling highly unique data
"""
    gammaCKpar!(vecA::Vector, vecB::Vector, results::DiBitMatrix;
                distmethod="jw", cut_a=0.92, cut_b=0.88, partial=true, w=0.1)

Compare every element of `vecA` against every element of `vecB` directly
(no grouping by unique value) and record the outcome in `results[ix, iy]`.

# Keywords
- `distmethod`: one of `"jw"`, `"dl"`, `"jaro"`, `"lv"`, `"ro"`, `"osa"`, `"hamming"`.
- `cut_a`, `cut_b`: cutoffs handed to the scoring function; the similarity is
  rounded to 4 digits before it is compared against them.
- `partial`: when `true` both cutoffs are used (`score_single2`), otherwise only
  `cut_a` (`score_single`).
- `w`: prefix scaling factor; only used by `"jw"`.

# Behavior
- A pair where either side is `missing` is written as `missingval`.
- A pair that reaches neither cutoff is not written.
- Rows of `vecA` are processed in parallel; each row's scan over `vecB` is serial.

# Throws
- `ArgumentError` if `distmethod` is not one of the recognised names.

Returns `nothing`.
"""
function gammaCKpar!(vecA::Vector,vecB::Vector,
                     results::DiBitMatrix;
                     distmethod="jw",cut_a=0.92,cut_b=0.88,partial=true,w=0.1)

    # Bind `distance` exactly once: a variable assigned in several branches and
    # then captured by the threaded closure below would be boxed.
    # NOTE(review): the `import StringDistances: ...` line visible in Gammas.jl
    # does not list RatcliffObershelp, OptimalStringAlignment or Hamming --
    # confirm they are brought into scope elsewhere in the module.
    distance = if distmethod == "jw"
        JaroWinkler(p=w)
    elseif distmethod == "dl"
        DamerauLevenshtein()
    elseif distmethod == "jaro"
        # Jaro has no prefix-scaling parameter, so `w` does not apply here.
        Jaro()
    elseif distmethod == "lv"
        Levenshtein()
    elseif distmethod == "ro"
        RatcliffObershelp()
    elseif distmethod == "osa"
        OptimalStringAlignment()
    elseif distmethod == "hamming"
        Hamming()
    else
        throw(ArgumentError("unknown distmethod \"$distmethod\"; expected one of " *
                            "jw, dl, jaro, lv, ro, osa, hamming"))
    end

    # Two-cutoff scoring when partial matches are requested, one cutoff otherwise.
    score_value! = partial ? score_single2 : score_single

    # Parallelise over rows only. Spawning a second threaded loop (and collecting
    # an enumerate of vecB) per row allocates on every row without adding
    # parallelism once the outer loop already occupies the threads.
    # NOTE(review): assumes concurrent writes to cells in different rows of
    # `results` are safe for DiBitMatrix -- confirm against its storage layout.
    Threads.@threads for ix in eachindex(vecA)
        x = vecA[ix]
        if ismissing(x)
            # The whole row is missing regardless of vecB.
            for iy in eachindex(vecB)
                results[ix, iy] = missingval
            end
        else
            for iy in eachindex(vecB)
                y = vecB[iy]
                if ismissing(y)
                    results[ix, iy] = missingval
                else
                    dist = round(compare(x, y, distance), digits=4)
                    score_value!(dist, ix, iy, cut_a, cut_b, results)
                end
            end
        end
    end

    return nothing
end



# handling highly unique data, additionally filling the tf tables for both inputs
"""
    gammaCKpar!(vecA::Vector, vecB::Vector, results::DiBitMatrix,
                tf_table_x::SubArray{Float16}, tf_table_y::SubArray{Float16};
                distmethod="jw", cut_a=0.92, cut_b=0.88, partial=true, w=0.1,
                tf_minimum_u_value=0.001)

Compare every element of `vecA` against every element of `vecB` directly
(no grouping by unique value), record the outcome in `results[ix, iy]`, and
fill the relative-frequency tables for both inputs.

# Arguments
- `tf_table_x[ix]`: set to `max(count(vecA[ix]) / length(vecA), tf_minimum_u_value)`
  for every non-missing `vecA[ix]`; entries for missing values are not written.
- `tf_table_y[iy]`: the same for `vecB`. It is filled once up front, so it is
  populated even when every element of `vecA` is missing.

# Keywords
- `distmethod`: one of `"jw"`, `"dl"`, `"jaro"`, `"lv"`, `"ro"`, `"osa"`, `"hamming"`.
- `cut_a`, `cut_b`: cutoffs handed to the scoring function; the similarity is
  rounded to 4 digits before it is compared against them.
- `partial`: when `true` both cutoffs are used (`score_single2`), otherwise only
  `cut_a` (`score_single`).
- `w`: prefix scaling factor; only used by `"jw"`.
- `tf_minimum_u_value`: lower bound applied to every frequency written.

# Throws
- `ArgumentError` if `distmethod` is not one of the recognised names.

Returns `nothing`.
"""
function gammaCKpar!(vecA::Vector,vecB::Vector,
                     results::DiBitMatrix,
                     tf_table_x::SubArray{Float16},
                     tf_table_y::SubArray{Float16};
                     distmethod="jw",cut_a=0.92,cut_b=0.88,partial=true,w=0.1,
                     tf_minimum_u_value=0.001)
    n_a = length(vecA)
    n_b = length(vecB)
    freqsA = countmap(vecA)
    freqsB = countmap(vecB)

    # Bind `distance` exactly once: a variable assigned in several branches and
    # then captured by the threaded closure below would be boxed.
    # NOTE(review): the `import StringDistances: ...` line visible in Gammas.jl
    # does not list RatcliffObershelp, OptimalStringAlignment or Hamming --
    # confirm they are brought into scope elsewhere in the module.
    distance = if distmethod == "jw"
        JaroWinkler(p=w)
    elseif distmethod == "dl"
        DamerauLevenshtein()
    elseif distmethod == "jaro"
        # Jaro has no prefix-scaling parameter, so `w` does not apply here.
        Jaro()
    elseif distmethod == "lv"
        Levenshtein()
    elseif distmethod == "ro"
        RatcliffObershelp()
    elseif distmethod == "osa"
        OptimalStringAlignment()
    elseif distmethod == "hamming"
        Hamming()
    else
        throw(ArgumentError("unknown distmethod \"$distmethod\"; expected one of " *
                            "jw, dl, jaro, lv, ro, osa, hamming"))
    end

    # Two-cutoff scoring when partial matches are requested, one cutoff otherwise.
    score_value! = partial ? score_single2 : score_single

    # The vecB frequencies do not depend on the row of vecA, so compute them once
    # here instead of rewriting the same slots from every row (and every thread).
    for iy in eachindex(vecB)
        y = vecB[iy]
        ismissing(y) || (tf_table_y[iy] = max(freqsB[y] / n_b, tf_minimum_u_value))
    end

    # Parallelise over rows only. Spawning a second threaded loop (and collecting
    # an enumerate of vecB) per row allocates on every row without adding
    # parallelism once the outer loop already occupies the threads.
    # NOTE(review): assumes concurrent writes to cells in different rows of
    # `results` are safe for DiBitMatrix -- confirm against its storage layout.
    Threads.@threads for ix in eachindex(vecA)
        x = vecA[ix]
        if ismissing(x)
            # The whole row is missing regardless of vecB.
            for iy in eachindex(vecB)
                results[ix, iy] = missingval
            end
        else
            tf_table_x[ix] = max(freqsA[x] / n_a, tf_minimum_u_value)
            for iy in eachindex(vecB)
                y = vecB[iy]
                if ismissing(y)
                    results[ix, iy] = missingval
                else
                    dist = round(compare(x, y, distance), digits=4)
                    score_value!(dist, ix, iy, cut_a, cut_b, results)
                end
            end
        end
    end

    return nothing
end

1 change: 0 additions & 1 deletion src/gammas/gammaKpar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@ function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix)

# Form match matrices based on differing levels of matches
Threads.@threads for (ix, x) in collect(enumerate(vecA))
indices_x = findall(vecA .=== x)
if ismissing(x)
for iy in collect(1:_dims[2])
results[ix,iy] = missingval
Expand Down
2 changes: 1 addition & 1 deletion src/patterns.jl
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ end
function process_comparisons(res::Dict{String, DiBitMatrix},
emlink_configuration::Vector{Vector{Dict{String, Any}}},
_dims::Tuple{Int64,Int64},
parameters::Dict{String, Dict{String, Any}},
parameters::Dict{String, <: Any},
tf_tables::Dict{String, Vector{Vector{Float16}}})
final_name = last(emlink_configuration)[1]["name"]
for emconfig in emlink_configuration
Expand Down

0 comments on commit 0ceb626

Please sign in to comment.