-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMinHashSimilarity.cs
152 lines (136 loc) · 4.38 KB
/
MinHashSimilarity.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
using System;
using System.Collections.Generic;
using System.Text;
namespace MinHashComparison
{
public class MinHashSimilarity
{
/// <summary>
/// The internal min hash instance
/// </summary>
private readonly MinHash _minHash;
/// <summary>
/// The threshold in which documents are considered similar
/// </summary>
private readonly double _threshold;
/// <summary>
/// Number of bands for LSH comparison
/// </summary>
private readonly int _bands;
/// <summary>
/// number of rows for LSH comparison
/// </summary>
private readonly int _rows;
/// <summary>
/// Buckets for LSH comparison
/// </summary>
private readonly Dictionary<string, List<int[]>> _buckets;
/// <summary>
/// Default Constructor
/// Works best if threshold is ~90%
/// </summary>
/// <param name="threshold">The threshold in which documents are considered similar</param>
public MinHashSimilarity(double threshold) : this (threshold, 5, 400, 20, 20)
{
// Likelihood of an LSH match between two documents (1-(1-J(A,B)^rows)^bands) | J(A,B) = Jaccard index, rows = 20, bands = 20
// J(A,B) Probability of getting compared
// .7 .016
// .8 .206
// .85 .546
// .861 .642 // sCurve ((1/b)^(1/r))
// .87 .720
// .9 .925
// .95 .999
}
/// <summary>
/// Constructor
/// </summary>
/// <param name="threshold">The threshold in which documents are considered similar</param>
/// <param name="tokensInWord">number of tokens for a word</param>
/// <param name="numHashFunctions">number of min hash functions to compute</param>
/// <param name="bands">number of bands for LSH comparison</param>
/// <param name="rows">number of rows for LSH comparison</param>
public MinHashSimilarity(double threshold, int tokensInWord, int numHashFunctions, int bands, int rows)
{
if (threshold < 0 || threshold > 1)
{
throw new Exception(String.Format("MinHashSimilarity - Illegal threshold: {0}", threshold));
}
if (bands*rows != numHashFunctions)
{
throw new Exception("MinHashSimilarity - bands * rows != numHashFunctions");
}
_threshold = threshold;
_minHash = new MinHash(tokensInWord, numHashFunctions);
_bands = bands;
_rows = rows;
_buckets = new Dictionary<string, List<int[]>>();
}
/// <summary>
/// Clears all history of documents
/// </summary>
public void ClearDocuments()
{
_buckets.Clear();
}
/// <summary>
/// Given a string document, looks whether a similar document was already seen
/// </summary>
/// <param name="doc">The new document to compare to</param>
/// <returns>true if a similar document was already seen</returns>
public bool LookForSimilarDocument(string doc)
{
int[] minHashes = _minHash.ComputeSketch(doc.Split(new char[]{' ', '\t', '\r', '\n'}));
string[] bandHashes = new string[_bands];
HashSet<int[]> comparedSketches = new HashSet<int[]>();
for (int i = 0; i < _bands; i++)
{
bandHashes[i] = ComputeBandHash(minHashes, i);
if (_buckets.ContainsKey(bandHashes[i]))
{
foreach (int[] sketchToCompare in _buckets[bandHashes[i]])
{
if (!comparedSketches.Contains(sketchToCompare))
{
if (_minHash.CompareSketches(minHashes, sketchToCompare) >= _threshold)
{
// Found a similar document
return true;
}
// Avoid comparing two documents twice
comparedSketches.Add(sketchToCompare);
}
}
}
}
// No match found, add document to buckets
for (int i = 0; i < _bands; i++)
{
if (!_buckets.ContainsKey(bandHashes[i]))
{
_buckets.Add(bandHashes[i], new List<int[]>());
}
_buckets[bandHashes[i]].Add(minHashes);
}
return false;
}
/// <summary>
/// Computes a hash for quick bucket match search
/// </summary>
/// <param name="minHashes">The MinHashes for row values</param>
/// <param name="i">The ith band</param>
/// <returns>The computed hash for the ith band</returns>
private string ComputeBandHash(int[] minHashes, int i)
{
StringBuilder bandHashSB = new StringBuilder((_rows + 1) * 10);
for (int j = 0; j < _rows; j++)
{
// adding the rows corresponding to ith band
bandHashSB.Append(minHashes[i * _rows + j].ToString().PadLeft(10, '0'));
}
// adding the number i to distinguish between bands
bandHashSB.Append(i.ToString().PadLeft(10, '0'));
return bandHashSB.ToString();
}
}
}