-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwinnowing.cpp
127 lines (104 loc) · 3.15 KB
/
winnowing.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include "winnowing.h"
std::vector <Code> getCodes(std::vector <std::string> &codeFiles, int k, int w)
{
std::vector <Code> codes;
for (const auto &codeFile: codeFiles)
{
initializeSets();
Code *code = new Code;
code->filePath = codeFile;
qDebug() << codeFile.c_str();
getCodeSkeleton(*code);
qDebug() << "getCodeSkeleton";
if (code->skeleton.size() >= k)
{
getFingerprints(*code, k, w);
qDebug() << "getFingerprints";
setFileName(*code);
codes.push_back(*code);
}
}
return codes;
}
std::vector<Code> getDocs(std::vector<std::string> &docFiles, int k, int w)
{
std::vector <Code> codes;
for (const auto &codeFile: docFiles)
{
Code *code = new Code;
code->filePath = codeFile;
qDebug() << code->filePath.c_str();
code->pureCode = code->skeleton = readFile(code->filePath);
qDebug() << "readFile";
if (code->skeleton.size() >= k)
{
getFingerprints(*code, k, w);
qDebug() << "getFingerprints";
setFileName(*code);
codes.push_back(*code);
}
}
return codes;
}
std::string readFile(const std::string &path)
{
std::string fullCode, word;
std::ifstream inFile(path.substr(1, path.size()-2));
if (!inFile)
{
qDebug() << "Unable to open file";
exit(1);
}
while (inFile >> word)
fullCode += word;
inFile.close();
return fullCode;
}
void getCodeSkeleton(Code &code)
{
std::string fullCode = removeComments(code.filePath);
fullCode = removeSpaces(fullCode);
removeQuotes(fullCode);
code.pureCode = fullCode;
code.skeleton = transformCode(fullCode);
}
void getFingerprints(Code &code, int k, int w)
{
// Applying Karp-Rabin algorithm:
long long factor = getFactor(k);
std::vector <long long> hashKeys = karpRabinHashing(code.skeleton, k, factor);
// Applying Winnowing algorithm:
int min = -1; // index of minimum hash
for (int i = 0; i < hashKeys.size()-w+1; ++i)
{
if (min < i) // the previous minimum hash key is no longer in the window
{
min = std::min_element(hashKeys.begin()+i, hashKeys.begin()+i+w) - hashKeys.begin();
++code.fingerprints[hashKeys[min]];
++code.numOfSelectedFingerPrints;
}
else if (hashKeys[i+w-1] <= hashKeys[min]) // the previous minimum hash key is still in the window
{
min = i+w-1;
++code.fingerprints[hashKeys[min]];
++code.numOfSelectedFingerPrints;
}
}
}
int compareCodes(Code &code1, Code &code2)
{
int tokens_matched = 0;
for (const auto &p1: code1.fingerprints)
{
auto p2 = code2.fingerprints.find(p1.first);
if (p2 != code2.fingerprints.end())
tokens_matched += std::min(p1.second, p2->second);
}
return tokens_matched;
}
void setFileName(Code &code)
{
auto index = code.filePath.find_last_of('/');
if (index != std::string::npos)
code.fileName = code.filePath.substr(index+1, code.filePath.size()-index-2);
}