updating 1.10 version to current and adding getMatches

jw2249a · Sep 5, 2024 · 2e32711 · 2e32711
1 parent 0ceb626
commit 2e32711
Show file tree

Hide file tree

Showing 8 changed files with 60 additions and 41 deletions.
diff --git a/._benchmark.csv b/._benchmark.csv
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "FastLink"
 uuid = "11f39cfd-5548-489f-be9a-f4ad0ff6eadc"
 authors = ["Jack R. Williams <[email protected]>"]
-version = "0.0.5"
+version = "0.0.7"
 
 [deps]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

diff --git a/README.md b/README.md
@@ -120,12 +120,23 @@ Each `method` has a number of arguments that can be specified for that matching
 __________________
 ### `fastLink`'s output
 
-A `NamedTuple` with these vars:
+For ease of extracting matches, the `getMatches` function was added. You can call it on the fastLink output as the single argument `getMatches(FastLinkOutput)` or with a specified threshold `getMatches(FastLinkOutput, threshold_match)`.
 
-- `indices` - a vector with indices in `dfA` and `dfB` that are in each pattern group (see `patterns_w` or `patterns_b`)
+The FastLink output is:
+A `Dict{String,Any}` with these vars:
+- `ids`: A vector of vectors of tuple pairs of ids for each match pattern.
+- `idvar`: ID variable from configuration
+- `resultsEM`: The results of the Expectation Maximization algorithm
 
-- `matched_ids` - same as `indices` but using `idvars` from input parameters
+If term frequency is specified:
+- `resultsTF`: term frequencies for each variable with specified term frequency by pattern if relevant for the pattern (if no term frequency is applied then tf_adjusted is false).
 
+If benchmark is specified:
+- `benchtimes`: times for each variable to be matched.
+
+
+
+`resultsEM` (the EM output) contains:
 - `iter_converge` - number of iterations for expectation maximization algorithm to converge. 
 
 - `obs_a` - observations in `dfA`
@@ -136,8 +147,12 @@ A `NamedTuple` with these vars:
 
 - `p_u` - posterior **not** match probability
 
+- `number_of_unique_patterns` - equivalent to number of rows in `patterns_w`
+
+- `number_of_comparisons` - For convenience `nrow(dfA) * nrow(dfB)`
+
 - `patterns_w` - a `DataFrame` of:
-  - `gamma_` - An `Int64` with the gamma values for each variable (similar to `patterns_b`)
+  - `gamma_*` - An `Int64` with the gamma values for each variable (similar to `patterns_b`)
   - `counts` - An `Int64` with counts for each agreement pattern
   - `weights` - An `Int64` with partial match weights for each agreement pattern
   - `p_gamma_jm` - A `Float64` that has the posterior probability that a pair matches for each agreement pattern
@@ -150,9 +165,8 @@ A `NamedTuple` with these vars:
 
 - `pgamma_ku` - A `Vector{Vector{Float64}}` with posterior probabilities for each variable in the EM algorithm. Ordered (2,1,0).
 
-- `tf_adj_table` - A `Vector{DataFrame}` that has a DataFrame for each match pattern and a row in each DataFrame for each comparison appended with the letter of their corresponding dataset.
+- `p_gamma_jm` - A `Float64` that has the posterior probability that a pair matches for each agreement pattern  (see `patterns_w`).
+- `p_gamma_ju` - A `Float64` that has the posterior probability that a pair **does not** match for each agreement pattern (see `patterns_w`).
 
 - `varnames` - A `Vector{String}` of the input variable names
-
-- `zeta_j` - A `Vector{Float64}` with the posterior match probabilities for each agreement pattern. 
 
diff --git a/src/FastLink.jl b/src/FastLink.jl
@@ -29,6 +29,7 @@ const STRING_DISTANCE_METHODS = Dict("jw" => "jw",
                                      "optimalstringalignment" => "osa"
                                      )
 
+include("getMatches.jl")
 include("settings/settings.jl")
 include("DiBitMatrix.jl")
 include("matchPatterns.jl")
@@ -37,6 +38,7 @@ include("term_frequency_adjustment.jl")
 include("emlink.jl")
 include("patterns.jl")
 
+using .getmatches
 using .settings
 using .DiBitMat
 using .matchpatterns
@@ -47,7 +49,9 @@ using .patterns
 
 include("fastlink/fastlink.jl")
 
-export gammaCKpar!, gammaKpar!, gammaCKfuzzy!, gammaNUMCKpar!, DiBitMatrix, namedtuple, fetch_parameters, retrieve, parse_configuration, remove_keys, emlinkMARmov,  STRING_DISTANCE_METHODS, match1, match2, missingval, nonmatch, indices_to_uids, process_comparisons, fastLink
+
+
+export getMatches, gammaCKpar!, gammaKpar!, gammaCKfuzzy!, gammaNUMCKpar!, DiBitMatrix, namedtuple, fetch_parameters, retrieve, parse_configuration, remove_keys, emlinkMARmov,  STRING_DISTANCE_METHODS, match1, match2, missingval, nonmatch, indices_to_uids, process_comparisons, fastLink
 
 #export(fastLink)
 

diff --git a/src/fastlink/fastlink.jl b/src/fastlink/fastlink.jl
@@ -1,6 +1,3 @@
-
-
-
 """
 Probabilistic record matching using FastLink data-matching algorithm.
 Algorithm taken from:
@@ -115,12 +112,14 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
     results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)
 
     if length(results)  == 3
-        return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
-                "resultsEM" => results[2],
-                "resultsTF" => results[3])
+        return Dict("idvar" => config["idvar"],
+                    "ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
+                    "resultsEM" => results[2],
+                    "resultsTF" => results[3])
     else
-        return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
-                "resultsEM" => results[2])
+        return Dict("idvar" => config["idvar"],
+                    "ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
+                    "resultsEM" => results[2])
     end
 end
 
@@ -229,12 +228,14 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benc
     results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)
 
     if length(results)  == 3
-        return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
+        return Dict("idvar" => config["idvar"],
+            "ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
                 "resultsEM" => results[2],
                     "resultsTF" => results[3],
                     "benchtimes" => benchtimes)
     else
-        return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
+        return Dict("idvar" => config["idvar"],
+                    "ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
                     "resultsEM" => results[2],
                      "benchtimes" => benchtimes)
     end

diff --git a/src/getMatches.jl b/src/getMatches.jl
@@ -0,0 +1,17 @@
+module getmatches
+using DataFrames
+"""
+Extract matches from fastLink output. Specify a threshold for matches, or omit it to use the `threshold_match` value stored in the `resultsEM` entry of the fastLink output. Returns a `Vector{DataFrame}` with one DataFrame of ID pairs per agreement pattern, in the same order as the rows of `patterns_w` where `zeta_j >= threshold_match`.
+"""
+function getMatches(FastLinkOutput::Dict{String, Any}, threshold_match::T) where T <: AbstractFloat
+    (FastLinkOutput["resultsEM"]["patterns_w"].zeta_j .>= threshold_match) |>
+        findall |>
+        ids -> FastLinkOutput["ids"][ids] .|>
+        x->DataFrame(x,FastLinkOutput["idvar"])
+end
+function getMatches(FastLinkOutput::Dict{String, Any})
+    getMatches(FastLinkOutput,FastLinkOutput["resultsEM"]["threshold_match"])
+end
+
+export getMatches
+end # module getmatches
diff --git a/src/utils/prettyprinting.jl b/src/utils/prettyprinting.jl
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -62,15 +62,16 @@ config = Dict("idvar" => ["ida", "idb"],
 @testset "Testing FastLink Basic Run" begin
     @info "Executing fastLink()"
     results=fastLink(dfA,dfB,config)
-
-
     @info "Correct # of Matches"
+    matches = getMatches(results)
     p_w = results["resultsEM"]["patterns_w"]
-    inds = p_w.zeta_j .>= 0.85
+    inds = p_w.zeta_j .>= results["resultsEM"]["threshold_match"]
     @test sum(p_w.counts[inds]) == 50
+    @info "Correct grouping of matches"
+    @info p_w.counts[inds] == nrow.(matches)
     @info "Number of patterns == 26"
     @test results["resultsEM"]["number_of_unique_patterns"] == 26
-    @info "Number of counts == (N₁×N₂) "
+    @info "Number of counts == (N₁×N₂)"
     @test sum(p_w.counts) == nrow(dfA) * nrow(dfB)
     @info "Ρ(𝑢) >=.999"
     @test results["resultsEM"]["p_u"] >= .999