-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy patheval_wordsim.py
89 lines (83 loc) · 3.26 KB
/
eval_wordsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import sys
import numpy.linalg as linalg
import numpy as np
import scipy.stats as stats
# Read word embedding file
wordvecFile = sys.argv[1]
# print_writer_filename = sys.argv[2]
wordVecDict = {}
file = open(wordvecFile, 'r', encoding='utf-8')
# fprint = open(print_writer_filename, 'a', encoding='utf-8')
num = 0
for line in file:
num += 1
if num % 500 == 0:
print("Reading the %d-th word" % num)
items = line.strip().split()
items = [item.strip('[],') for item in items]
word = items[0]
vec = list(map(float, items[1:]))
if linalg.norm(vec) != 0:
# Important!
if word not in wordVecDict:
wordVecDict[word] = vec / linalg.norm(vec)
else:
wordVecDict[word] = (vec / linalg.norm(vec) + wordVecDict[word]) / 2
file.close()
print('Word embeddings reading completes and total number of words is:', num)
# fprint.write('Word embeddings reading completes and total number of words is:' + str(num) + '\n')
# test on wordsim240 and wordsim297
wordSimType = ['240', '297']
for x in wordSimType:
file = open('wordsim/filtered_wordsim' + x + '.txt', 'r', encoding='utf-8')
testPairNum = 0
skipPairNum = 0
wordSimStd = []
wordSimPre = []
for line in file.readlines():
word1, word2, valStr = line.strip().split()
if (word1 in wordVecDict) and (word2 in wordVecDict):
testPairNum += 1
wordSimStd.append(float(valStr))
wordVec1 = wordVecDict[word1]
wordVec2 = wordVecDict[word2]
cosSim = np.dot(wordVec1, wordVec2) / np.linalg.norm(wordVec1) / np.linalg.norm(wordVec2)
wordSimPre.append(cosSim)
else:
skipPairNum += 1
print('Skip:', word1, word2)
# fprint.write('Skip: ' + word1 + ' ' + word2 + ' \n')
# corrCoef = np.corrcoef(wordSimStd, wordSimPre)[0, 1]
SpearCoef = stats.spearmanr(wordSimStd, wordSimPre).correlation
print("WordSim-" + x + " Score:" + str(SpearCoef))
print('TestPair:', testPairNum, 'SkipPair:', skipPairNum)
# fprint.write("WordSim-" + x + " Score:" + str(SpearCoef) + '\n')
# fprint.write('TestPair:' + str(testPairNum) + 'SkipPair:' + str(skipPairNum) + '\n')
file.close()
# test on COS960
file = open('wordsim/COS960.txt', 'r', encoding='utf-8')
testPairNum = 0
skipPairNum = 0
wordSimStd = []
wordSimPre = []
for line in file.readlines():
word1, word2, valStr = line.strip().split()
if (word1 in wordVecDict) and (word2 in wordVecDict):
testPairNum += 1
wordSimStd.append(float(valStr))
wordVec1 = wordVecDict[word1]
wordVec2 = wordVecDict[word2]
cosSim = np.dot(wordVec1, wordVec2) / np.linalg.norm(wordVec1) / np.linalg.norm(wordVec2)
wordSimPre.append(cosSim)
else:
skipPairNum += 1
print('Skip:', word1, word2)
# fprint.write('Skip: ' + word1 + ' ' + word2 + ' \n')
# corrCoef = np.corrcoef(wordSimStd, wordSimPre)[0, 1]
SpearCoef = stats.spearmanr(wordSimStd, wordSimPre).correlation
print("WordSim960 Score:" + str(SpearCoef))
print('TestPair:', testPairNum, 'SkipPair:', skipPairNum)
# fprint.write("WordSim960 Score:" + str(SpearCoef) + '\n')
# fprint.write('TestPair:' + str(testPairNum) + 'SkipPair:' + str(skipPairNum) + '\n')
file.close()
# fprint.close()