Skip to content

Commit

Permalink
updating 1.10 version to current and adding getMatches
Browse files Browse the repository at this point in the history
  • Loading branch information
jw2249a committed Sep 5, 2024
1 parent 0ceb626 commit 2e32711
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 41 deletions.
Binary file removed ._benchmark.csv
Binary file not shown.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "FastLink"
uuid = "11f39cfd-5548-489f-be9a-f4ad0ff6eadc"
authors = ["Jack R. Williams <[email protected]>"]
version = "0.0.5"
version = "0.0.7"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Expand Down
28 changes: 21 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,23 @@ Each `method` has a number of arguments that can be specified for that matching
__________________
### `fastLink`'s output

A `NamedTuple` with these vars:
The `getMatches` function makes it easy to extract matches. Call it with the `fastLink` output as its only argument, `getMatches(FastLinkOutput)`, to use the match threshold from the configuration, or pass an explicit threshold with `getMatches(FastLinkOutput, threshold_match)`.

- `indices` - a vector with indices in `dfA` and `dfB` that are in each pattern group (see `patterns_w` or `patterns_b`)
The FastLink output is:
A `Dict{String,Any}` with these vars:
- `ids`: A vector of vectors of tuple pairs of ids for each match pattern.
- `idvar`: ID variable from configuration
- `resultsEM`: The results of the Expectation Maximization algorithm

- `matched_ids` - same as `indices` but using `idvars` from input parameters
If term frequency is specified:
- `resultsTF`: term frequencies, by pattern, for each variable with term frequency specified, where relevant for the pattern (if no term frequency is applied, then `tf_adjusted` is `false`).

If benchmark is specified:
- `benchtimes`: times for each variable to be matched.



Within `resultsEM` (the EM output), there are:
- `iter_converge` - number of iterations for expectation maximization algorithm to converge.

- `obs_a` - observations in `dfA`
Expand All @@ -136,8 +147,12 @@ A `NamedTuple` with these vars:

- `p_u` - posterior **not** match probability

- `number_of_unique_patterns` - equivalent to number of rows in `patterns_w`

- `number_of_comparisons` - For convenience `nrow(dfA) * nrow(dfB)`

- `patterns_w` - a `DataFrame` of:
- `gamma_` - An `Int64` with the gamma values for each variable (similar to `patterns_b`)
- `gamma_*` - An `Int64` with the gamma values for each variable (similar to `patterns_b`)
- `counts` - An `Int64` with counts for each agreement pattern
- `weights` - An `Int64` with partial match weights for each agreement pattern
- `p_gamma_jm` - A `Float64` that has the posterior probability that a pair matches for each agreement pattern
Expand All @@ -150,9 +165,8 @@ A `NamedTuple` with these vars:

- `pgamma_ku` - A `Vector{Vector{Float64}}` with posterior probabilities for each variable in the EM algorithm. Ordered (2,1,0).

- `tf_adj_table` - A `Vector{DataFrame}` that has a DataFrame for each match pattern and a row in each DataFrame for each comparison appended with the letter of their corresponding dataset.
- `p_gamma_jm` - A `Float64` that has the posterior probability that a pair matches for each agreement pattern (see `patterns_w`).
- `p_gamma_ju` - A `Float64` that has the posterior probability that a pair **does not** match for each agreement pattern (see `patterns_w`).

- `varnames` - A `Vector{String}` of the input variable names

- `zeta_j` - A `Vector{Float64}` with the posterior match probabilities for each agreement pattern.

6 changes: 5 additions & 1 deletion src/FastLink.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const STRING_DISTANCE_METHODS = Dict("jw" => "jw",
"optimalstringalignment" => "osa"
)

include("getMatches.jl")
include("settings/settings.jl")
include("DiBitMatrix.jl")
include("matchPatterns.jl")
Expand All @@ -37,6 +38,7 @@ include("term_frequency_adjustment.jl")
include("emlink.jl")
include("patterns.jl")

using .getmatches
using .settings
using .DiBitMat
using .matchpatterns
Expand All @@ -47,7 +49,9 @@ using .patterns

include("fastlink/fastlink.jl")

export gammaCKpar!, gammaKpar!, gammaCKfuzzy!, gammaNUMCKpar!, DiBitMatrix, namedtuple, fetch_parameters, retrieve, parse_configuration, remove_keys, emlinkMARmov, STRING_DISTANCE_METHODS, match1, match2, missingval, nonmatch, indices_to_uids, process_comparisons, fastLink


export getMatches, gammaCKpar!, gammaKpar!, gammaCKfuzzy!, gammaNUMCKpar!, DiBitMatrix, namedtuple, fetch_parameters, retrieve, parse_configuration, remove_keys, emlinkMARmov, STRING_DISTANCE_METHODS, match1, match2, missingval, nonmatch, indices_to_uids, process_comparisons, fastLink

#export(fastLink)

Expand Down
21 changes: 11 additions & 10 deletions src/fastlink/fastlink.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@



"""
Probabilistic record matching using FastLink data-matching algorithm.
Algorithm taken from:
Expand Down Expand Up @@ -115,12 +112,14 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)

if length(results) == 3
return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2],
"resultsTF" => results[3])
return Dict("idvar" => config["idvar"],
"ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2],
"resultsTF" => results[3])
else
return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2])
return Dict("idvar" => config["idvar"],
"ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2])
end
end

Expand Down Expand Up @@ -229,12 +228,14 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benc
results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)

if length(results) == 3
return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
return Dict("idvar" => config["idvar"],
"ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2],
"resultsTF" => results[3],
"benchtimes" => benchtimes)
else
return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
return Dict("idvar" => config["idvar"],
"ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2],
"benchtimes" => benchtimes)
end
Expand Down
17 changes: 17 additions & 0 deletions src/getMatches.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
module getmatches
using DataFrames

"""
    getMatches(FastLinkOutput, threshold_match)
    getMatches(FastLinkOutput)

Extract the matches from the output of `fastLink`.

Select every agreement pattern whose `zeta_j` value in
`FastLinkOutput["resultsEM"]["patterns_w"]` is at least `threshold_match`, and build one
`DataFrame` per selected pattern from the corresponding entry of `FastLinkOutput["ids"]`,
using `FastLinkOutput["idvar"]` as the column names.

# Arguments
- `FastLinkOutput`: the dictionary returned by `fastLink`; the keys `"ids"`, `"idvar"`
  and `"resultsEM"` are read.
- `threshold_match`: minimum `zeta_j` for a pattern to count as a match. Any `Real` is
  accepted. When omitted, `FastLinkOutput["resultsEM"]["threshold_match"]` is used.

# Returns
A `Vector{DataFrame}` whose entries line up, in order, with the rows of `patterns_w`
that satisfy `zeta_j >= threshold_match`. The vector is empty when no pattern qualifies.
"""
function getMatches(FastLinkOutput::AbstractDict{String}, threshold_match::Real)
    zeta = FastLinkOutput["resultsEM"]["patterns_w"].zeta_j
    # Positions of the agreement patterns that reach the threshold; the same positions
    # index the per-pattern id lists stored under "ids".
    matched = findall(zeta .>= threshold_match)
    idvar = FastLinkOutput["idvar"]
    # Typed comprehension keeps the documented element type even when `matched` is empty.
    return DataFrame[DataFrame(idpairs, idvar) for idpairs in FastLinkOutput["ids"][matched]]
end

function getMatches(FastLinkOutput::AbstractDict{String})
    # Fall back to the threshold stored alongside the EM results.
    return getMatches(FastLinkOutput, FastLinkOutput["resultsEM"]["threshold_match"])
end

export getMatches
end # module getmatches
18 changes: 0 additions & 18 deletions src/utils/prettyprinting.jl

This file was deleted.

9 changes: 5 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,16 @@ config = Dict("idvar" => ["ida", "idb"],
@testset "Testing FastLink Basic Run" begin
@info "Executing fastLink()"
results=fastLink(dfA,dfB,config)


@info "Correct # of Matches"
matches = getMatches(results)
p_w = results["resultsEM"]["patterns_w"]
inds = p_w.zeta_j .>= 0.85
inds = p_w.zeta_j .>= results["resultsEM"]["threshold_match"]
@test sum(p_w.counts[inds]) == 50
@info "Correct grouping of matches"
@info p_w.counts[inds] == nrow.(matches)
@info "Number of patterns == 26"
@test results["resultsEM"]["number_of_unique_patterns"] == 26
@info "Number of counts == (N₁×N₂) "
@info "Number of counts == (N₁×N₂)"
@test sum(p_w.counts) == nrow(dfA) * nrow(dfB)
@info "Ρ(𝑢) >=.999"
@test results["resultsEM"]["p_u"] >= .999
Expand Down

0 comments on commit 2e32711

Please sign in to comment.