-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathoie_extraction.py
64 lines (49 loc) · 2.01 KB
/
oie_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# OIE Extraction and vector sum for use in SEE baseline method
# The reference SEE paper used ReVerb to extract OIE http://reverb.cs.washington.edu/
import subprocess
import gensim
import numpy as np
# When True, each headline is first reduced to a ReVerb relation via
# extract_relations(); when False, the raw headline text is embedded as-is.
extract_rel = False
# Root directory holding the per-stock input files (asset_rated_news/<TICKER>.csv)
# and receiving the per-stock output files (asset_oie_splited/<TICKER>.csv).
data_folder = "/home/janderson/dev/mestrado/data"
# Location of the word-vector file, loaded in binary word2vec format.
word_vector_path = "/data/GoogleNews-vectors-negative300.bin"
def extract_relations(s):
    """Extract an OpenIE relation from a sentence using ReVerb (as in the paper).

    The sentence is written to the standard input of a ReVerb JVM and the
    tool's standard output is parsed. The text never passes through a shell,
    so quotes or shell metacharacters in a headline cannot break or alter
    the command.

    Args:
        s: The sentence (headline) to analyse, as text or UTF-8 bytes.

    Returns:
        The last three tab-separated fields of ReVerb's output joined by
        single spaces. If ReVerb prints nothing, the result is an empty string.
        NOTE(review): these fields are presumably the normalized
        argument1 / relation / argument2 columns -- confirm against the
        ReVerb output format.

    Raises:
        OSError: If the ``java`` executable cannot be started.
    """
    # Put your ReVerb jar in the working directory: it is looked up under the
    # relative name "reverb-latest.jar".
    payload = s if isinstance(s, bytes) else s.encode("utf-8")
    proc = subprocess.Popen(
        ["java", "-Xmx512m", "-jar", "reverb-latest.jar"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() sends the sentence, closes stdin so ReVerb sees EOF, and
    # waits for the process to exit (no lingering child process).
    out, _ = proc.communicate(payload + b"\n")
    oargs = out.decode("utf-8").strip().split('\t')
    return " ".join(oargs[-3:])
# Load the pre-trained word vectors once; every headline is embedded with them.
model = gensim.models.KeyedVectors.load_word2vec_format(
    word_vector_path, binary=True)
# gensim >= 4 exposes the vocabulary as `key_to_index` (accessing `vocab`
# raises AttributeError there); older releases only provide `vocab`.
_vocabulary = model.key_to_index if hasattr(model, "key_to_index") else model.vocab
print('Loaded %s word vectors.' % len(_vocabulary))
# Tickers to process; each has its own tab-separated input and output file.
stocks = ["AAPL", "AMZN", "CSCO", "F", "GOOGL", "IBM", "MSFT", "NFLX", "ORCL", "VZ"]
for stock in stocks:
    count = 0
    out_path = "{0}/asset_oie_splited/{1}.csv".format(data_folder, stock)
    in_path = "{0}/asset_rated_news/{1}.csv".format(data_folder, stock)
    # Context managers guarantee both files are closed, even if a line fails.
    # The output is opened in append mode, so re-running adds to earlier results.
    with open(out_path, "a+") as fOut, open(in_path, "r") as fIn:
        for line in fIn:
            fields = line.strip().split("\t")
            # The headline is the last tab-separated column of the input row.
            headline = fields[-1]
            if extract_rel:
                try:
                    extracted = extract_relations(headline)
                except Exception as exc:
                    # Best effort: a headline ReVerb cannot process is skipped
                    # (not written, not counted) instead of aborting the run.
                    print("Skipping headline, relation extraction failed: {0}".format(exc))
                    continue
            else:
                extracted = headline
            try:
                M = []
                for w in extracted.split():
                    try:
                        M.append(model[w])
                    except KeyError:
                        # Out-of-vocabulary words contribute nothing to the sum.
                        print("[{0}] not found".format(w))
                # NOTE(review): np.sum without an axis collapses every component
                # of every word vector into ONE scalar. The file header speaks of
                # a "vector sum" -- confirm whether axis=0 was intended.
                matrix_sum = np.sum(np.array(M))
            except Exception:
                # Best effort: fall back to a neutral value if embedding fails.
                matrix_sum = 0.0
            # NOTE(review): only the first input column is carried over to the
            # output row -- confirm that dropping the other columns is intended.
            fOut.write(fields[0] + "\t" + str(matrix_sum) + "\n")
            count += 1
    print(str(count) + " headlines analysed for " + stock)