-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpusWordDetector.py
155 lines (125 loc) · 4.9 KB
/
corpusWordDetector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os
import csv
import stanfordnlp
import nltk
from re import *
from nltk.parse import CoreNLPParser
from stanfordnlp.server import CoreNLPClient
# Parser backed by a locally running CoreNLP server; the server must be
# started separately and listening on port 9001 before this script runs.
parser = CoreNLPParser('http://localhost:9001')
# Chinese stanfordnlp pipeline: tokenization, multi-word tokens, POS tagging.
nlp = stanfordnlp.Pipeline(lang='zh',processors='tokenize,mwt,pos')
# Machine-specific corpus locations (JCLC = Jinan Chinese Learner Corpus,
# presumably -- TODO confirm).
path = '/Users/trekkatkins/Downloads/JCLCv2'
path2 = '/Users/trekkatkins/Downloads/JCLCv2/index'
#path = '/Users/trekkatkins/Desktop/ChineseCharacterCountTest'
all_files = os.listdir(path)
# Maps corpus .txt filename -> "totalWords, andFrequency" summary string.
dictOfTxts = {}
# UTF-8 byte sequences for the two characters of interest:
# 和 ("and") and 在 ("at/in").
andUTF = b'\xe5\x92\x8c'
locationUTF = b'\xe5\x9c\xa8'
def getPOS(doc):
    """Flatten a parsed document into a list of (text, POS-tag) tuples.

    Walks every sentence of *doc* in order and emits one
    ``(word.text, word.pos)`` pair per token.
    """
    return [(token.text, token.pos)
            for sentence in doc.sentences
            for token in sentence.words]
# Countries whose learners are grouped together for this count.
westernCountries = {"USA", "UK", "Germany", "Costa Rica", "Peru", "France"}

# Pass 1: for each corpus .txt file, record its total character count and
# the number of occurrences of 和 ("and") in dictOfTxts as the string
# "totalWords, andFrequency".
for root, dirs, files in os.walk(path):
    for file in files:
        # Guard clause also prevents the index rows from being checked
        # against a stale `text` left over from a previous file.
        if not file.endswith('.txt'):
            continue
        with open(os.path.join(root, file), 'r') as f:
            text = f.read()
        # NOTE(review): the index is re-read for every file and its rows are
        # never matched to this particular file -- presumably the index should
        # be joined on filename; preserving the original logic here.
        with open('/Users/trekkatkins/Downloads/JCLCv2/index.csv') as index_csv:
            csv_reader = csv.DictReader(index_csv)
            if any(row["country"] in westernCountries for row in csv_reader):
                # Iterating a str yields single characters, so "words" are
                # really CJK characters; 和 is one character, so len()/count()
                # reproduce the original per-character loop exactly.
                totalWords = len(text)
                andFrequency = text.count(andUTF.decode('utf8'))
                dictOfTxts[file] = "{}, {}".format(totalWords, andFrequency)

# Dump the per-file summaries; the with-block closes the file (the explicit
# close() calls in the original were redundant).
with open('test.csv', 'w') as summary_csv:
    for name in dictOfTxts.keys():
        summary_csv.write("%s,%s\n" % (name, dictOfTxts[name]))
#check more broadly and see patterns that will negate adverbial phrases
#with verb-object phrases and other constructions
#regular expressions allow for negation - e.g. disallow a VV immediately before an NN.
#beginning of week 9 and end of week 9 to test with Chinese students
#email Chinese professor -
#what arrangements to make with students
#language mentors - help set up, Hu Zuyi (more time for students to try)
#Audrey in the language center - to work with multiple computers
#check if I can run the software on the language center computers
#pyc - compiled files save, copy and run the files on those computers
#alternatively use PASTA
# Chunk grammar for nltk.RegexpParser over (word, POS-tag) sequences using
# Chinese Penn-Treebank-style tags: the NP rules cover noun phrases
# (optionally linked through a <DEC> particle), CC_CLAUSE matches two NPs
# joined by a coordinating conjunction (e.g. 和 tagged CC), and IN_CLAUSE
# matches two NPs joined by a preposition (e.g. 和 tagged IN).
grammar = r"""
NP:
{<NN.*><DEC?><NN.*>}
{<NN.*><DEC?><PRP>}
{<CD><NNB><NN.*>}
{<JJ><DEC?><NN.*>}
{<JJ><DEC?><PRP>}
{<VV><DEC?><NN.*>}
{^<VV><DEC?><NN.*>}
{<DT><DEC?><NN.*>}
{<IN><NN><NN.*>}
{<IN><PRP><NN.*>}
{<PRP><DEC?><NN.*>}
{<PRP><DEC?><PRP>}
{<DT><NN.*>}
{<DT><PRP>}
{<NN.*>}
{<PRP>}
CC_CLAUSE:
{<NP><CC><NP>}
IN_CLAUSE:
{<NP><IN><NP>}
"""
txtFiles = list(dictOfTxts.keys())
#print(txtFiles)
resultList = []      # full chunk parse (one Tree) per processed file
subtreeList = []     # CC_CLAUSE / IN_CLAUSE subtrees whose connective is 和
andIsCC = False      # becomes True once 和 is seen tagged CC (conjunction)
andIsIN = False      # becomes True once 和 is seen tagged IN (preposition)
andText = andUTF.decode('utf8')      # 和 as a str, for direct comparison
cp = nltk.RegexpParser(grammar)      # grammar is loop-invariant: build once

# Parse only the first half of the recorded files; (n + 1) // 2 rounds up,
# matching the original `while i < len(txtFiles)/2` float comparison.
for fname in txtFiles[:(len(txtFiles) + 1) // 2]:
    # with-block fixes the original's leaked file handle (open(...).read()
    # with no close()).
    with open('/Users/trekkatkins/Downloads/JCLCv2/' + fname, 'r') as corpus_file:
        corpusDoc = nlp(corpus_file.read())
    pos = getPOS(corpusDoc)
    result = cp.parse(pos)
    for subtree in result.subtrees(filter=lambda t: t.label() == 'CC_CLAUSE'):
        # subtree[1] is the connective's (word, tag) pair; str equality is
        # equivalent to the original UTF-8-encoded bytes comparison.
        if subtree[1][0] == andText and subtree[1][1] == 'CC':
            andIsCC = True
            subtreeList.append(subtree)
    for subtree in result.subtrees(filter=lambda t: t.label() == 'IN_CLAUSE'):
        if subtree[1][0] == andText and subtree[1][1] == 'IN':
            andIsIN = True
            subtreeList.append(subtree)
    resultList.append(result)
# Persist matched 和-clauses and the full parses. Both files are opened (and
# truncated) unconditionally, as in the original; content is written only if
# 和 was ever seen as a conjunction or preposition. The with-blocks close the
# handles -- the original's trailing f.close() calls were redundant, and the
# second one closed the wrong variable (f instead of k).
with open('andClauses.txt', 'w') as clause_file:
    if andIsIN or andIsCC:
        for parse in subtreeList:
            clause_file.write("%s\n" % (parse))
with open('andSentences.txt', 'w') as sentence_file:
    if andIsCC or andIsIN:
        for result in resultList:
            sentence_file.write("%s\n" % (result))
#do a few manually, randomly select a few files, split and select and few sentences
#circumvent user interface, work out patterns if they come up with certain files
#then writes out result of whether it is correct or not
#see how many errors there are in the corpus text, go by hand and see if system actually catches the
#ones made by hand
#use a different text - go through and see the precision/recall evaluation on the texts
#calculate the error rate - required for presentation and paper
#look at and identify the texts you are using until running the experiment, don't change code by the time of evaluation
#
#print(b'\xe5\x92\x8c'.decode('utf8'))
#