# BOWApproach.py
#
# Builds bag-of-words context features for a list of English target words from
# a monolingual corpus and from a German-English aligned corpus, standardizes
# the combined feature matrix, projects it with kernel PCA, clusters the
# projection with k-means and plots the result.
from collections import Counter
import sys
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction import DictVectorizer
import sklearn.cluster.k_means_
from sklearn.cluster.k_means_ import KMeans, MiniBatchKMeans
from sklearn.cluster import SpectralClustering, DBSCAN
from sklearn. decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import numpy as np
from nltk.corpus import stopwords
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import csv
import pandas as pd
from pandas.plotting import scatter_matrix
def readAligedCorpus(words, path, stop_words=None):
    """Count German context words for English target words in an aligned corpus.

    The file at *path* is parsed as XML with a ``body`` element containing
    ``tu`` (translation unit) elements. Each ``tu`` holds ``tuv`` elements
    tagged with ``xml:lang``; the text of the ``de`` and ``en`` variants is
    read from their ``seg`` children (if a variant has several ``seg``
    children, the last one wins).

    For every translation unit whose English segment contains a target word,
    the whole aligned German segment is added as a bag of words to that
    word's counter. German tokens are kept only if they are purely
    alphabetic; they are lowercased *before* the stop-word test so that
    capitalised stop words (e.g. at the start of a sentence) are removed too.

    Args:
        words: Target words, expected lowercase because the English segment
            is lowercased before matching.
        path: Path of the aligned XML corpus.
        stop_words: Optional iterable of lowercase German stop words. When
            omitted, NLTK's German stop-word list is used.

    Returns:
        A list of ``Counter`` objects, one per entry of *words* and in the
        same order, mapping German tokens to their counts.
    """
    if stop_words is None:
        stop_words = stopwords.words('german')
    stop_words = set(stop_words)

    lang_key = '{http://www.w3.org/XML/1998/namespace}lang'
    rval = [Counter() for _ in words]

    body = ET.parse(path).getroot().find('body')
    for tu in body.findall('tu'):
        de_tokens = []
        en_words = []
        for tuv in tu.findall('tuv'):
            lang = tuv.attrib.get(lang_key)
            for seg in tuv.findall('seg'):
                # An empty <seg/> has text None; treat it as an empty sentence.
                text = seg.text or ''
                if lang == 'de':
                    de_tokens = text.split()
                elif lang == 'en':
                    en_words = text.lower().split()

        if not en_words:
            continue

        # Normalise the German sentence once per unit so that every matching
        # target word receives exactly the same bag of words.
        lowered = (token.lower() for token in de_tokens if token.isalpha())
        de_bow = [token for token in lowered if token not in stop_words]

        en_vocab = set(en_words)
        for counter, word in zip(rval, words):
            if word in en_vocab:
                # Whole aligned sentence as BOW.
                counter.update(de_bow)
    return rval
def readFile(words, path, stop_words=None):
    """Count context-window words around target words in a plain-text corpus.

    Each line of the UTF-8 file at *path* is split on whitespace, reduced to
    purely alphabetic tokens, lowercased, and stripped of stop words. For
    every target word present in the resulting token list, a window of up to
    five tokens (two to the left, the word itself, two to the right) around
    its first occurrence in the line is added to that word's counter. The
    window is taken over the filtered tokens and is clipped at the line
    boundaries.

    Args:
        words: Target words, expected to be lowercase single tokens because
            the line tokens are lowercased before matching.
        path: Path of the text corpus, one sentence per line.
        stop_words: Optional iterable of lowercase English stop words. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        A list of ``Counter`` objects, one per entry of *words* and in the
        same order, mapping context tokens to their counts.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    rval = [Counter() for _ in words]
    with open(path, 'r', encoding='utf8') as f:
        # Stream the file instead of loading every line into memory.
        for line in f:
            # Normalise once per line so every target word is matched against
            # the same token list.
            lowered = (token.lower() for token in line.split() if token.isalpha())
            tokens = [token for token in lowered if token not in stop_words]

            for counter, word in zip(rval, words):
                if word not in tokens:
                    continue
                idx = tokens.index(word)
                # BOW of 5 (2 on the left | 2 on the right). Clamp the start:
                # a negative slice start would wrap around to the line's end.
                counter.update(tokens[max(0, idx - 2):idx + 3])
    return rval
# Target words whose context vectors are clustered below. The same list is used
# for the monolingual and the aligned corpus so that row i of both feature
# matrices describes the same word.
TARGET_WORDS = [
    'apple', 'banana', 'oranges', 'watermelons', 'strawberries', 'grape', 'peach',
    'cherry', 'pear', 'plum', 'melon', 'lemon', 'coconut', 'lime',
    'office', 'home', 'building', 'house', 'apartment', 'city', 'town', 'village',
]
# Other word groups kept for experiments:
#   'shoes', 'shirt', 'pants', 'jacket', 'sweatshirt', 'socks'
#   'car', 'plane', 'bicycle', 'motorcycle', 'scooter', 'bus', 'train'
#   'new york', 'los angeles', 'chicago', 'houston', 'philadelphia', 'san antonio',
#   'san diego', 'dallas', 'san jose', 'austin', 'seattle'
#   'wind', 'sun', 'water', 'fire'
#   'chair', 'table', 'bed', 'closet', 'commode'
#   'sister', 'brother', 'father', 'mother'
#   'nose', 'eyes', 'mouth', 'face', 'hair'

# --- Feature extraction: one Counter per target word from each corpus ---------
corpus = readFile(
    TARGET_WORDS,
    'resources/corpora/OpenSubtitles/small/combined2',
)
corpus_biling = readAligedCorpus(
    TARGET_WORDS,
    'resources/corpora/OpenSubtitles/very_small_parallel/vsmallaa',
)

# --- Vectorization: turn the Counters into dense count matrices ---------------
vectorizer = DictVectorizer()
vectorizer_biling = DictVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
X_biling = vectorizer_biling.fit_transform(corpus_biling).toarray()

# Monolingual and bilingual features side by side, one row per target word.
X_combined = np.concatenate((X, X_biling), axis=1)

# --- Standardization ----------------------------------------------------------
# Only the combined matrix is scaled; the per-corpus variants are disabled:
#   sc = StandardScaler();        X_std = sc.fit_transform(X)
#   sc_biling = StandardScaler(); X_biling_std = sc_biling.fit_transform(X_biling)
sc_combined = StandardScaler()
X_combined_std = sc_combined.fit_transform(X_combined)

# --- Dimensionality reduction -------------------------------------------------
# Alternatives that were tried: PCA(n_components=2), SparsePCA(), TruncatedSVD(),
# IncrementalPCA(). Per-corpus projections (X_pca, X_biling_pca) are disabled.
pca = KernelPCA(kernel='rbf')
X_combined_pca = pca.fit_transform(X_combined_std)

# --- Clustering ---------------------------------------------------------------
# The same two-cluster k-means was also run on X_pca / X_biling_pca alone.
kmeans = KMeans(n_clusters=2, init='random')
kmeans.fit(X_combined_pca)
f = kmeans.predict(X_combined_pca)

print(f)
cluster_ids = list(f)
print('contains number one x times: ', cluster_ids.count(1))
print('contains number zero x times: ', cluster_ids.count(0))
#plot function from my warmup assignment
def plot(f):
    """Display the rows of *f* as a scatter plot chosen by column count.

    *f* is converted with ``np.array`` and must be two-dimensional:

    * 2 columns: a plain 2-D scatter plot.
    * 3 columns: a 3-D scatter plot with red circular markers and labelled axes.
    * any other column count: a pandas scatter matrix with KDE plots on the
      diagonal.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    points = np.array(f)
    n_columns = points.shape[1]

    if n_columns == 2:
        plt.scatter(points[:, 0], points[:, 1])
    elif n_columns == 3:
        axes = plt.figure().add_subplot(111, projection='3d')
        # Convert every coordinate to float explicitly, row by row.
        coords = [(float(row[0]), float(row[1]), float(row[2])) for row in f]
        xs = [c[0] for c in coords]
        ys = [c[1] for c in coords]
        zs = [c[2] for c in coords]
        axes.scatter(xs, ys, zs, c='r', marker='o')
        axes.set_xlabel('X Label')
        axes.set_ylabel('Y Label')
        axes.set_zlabel('Z Label')
    else:
        # Wrap the float matrix in a DataFrame so pandas can draw all
        # pairwise scatter plots at once.
        frame = pd.DataFrame(np.array(f, dtype=float))
        scatter_matrix(frame, alpha=0.2, figsize=(6, 6), diagonal='kde')

    plt.show()
# Disabled: plots of the monolingual-only and bilingual-only projections.
#plot(X_pca)
#plot(X_biling_pca)
# Visualize the kernel-PCA projection of the combined feature matrix.
plot(X_combined_pca)