#!/usr/bin/env python
#
# Author: Nil Geisweiller
"""Script to convert the TechTC-300 Test Collection found at
http://techtc.cs.technion.ac.il/techtc300/techtc300.html into a CSV
feature vector.
Some preprocessing:
1) All punctuation characters are removed from tokens
2) Everything is converted in lower case
3) Stop words are ignored ### this step is ignore for now ###
4) dash characters between words are removed (brain-washed => brainwashed)
5) numbers and mixed alphanumaric strings (e.g. Win2K) are ignored
6) Stemming is performed using PyStemmer
7) Words appearing in less than 3 documents are removed
"""
import sys
import os
import Stemmer
from collections import Counter
from optparse import OptionParser

# 3) Stop words are removed (note: this step is currently disabled in
# preprocessWord below)
# Google's list of stop words
stop_words = set(["i", "a", "about", "an", "are", "as", "at", "be", "by", "com", "de", "en", "for", "from", "how", "in", "is", "it", "la", "of", "on", "or", "that", "the", "this", "to", "was", "what", "when", "where", "who", "will", "with", "und", "www"])
stemmer = Stemmer.Stemmer("english")

def check_file(fileTitle, fileName):
    if not os.path.exists(fileName):
        sys.stderr.write("error: " + fileTitle + " " + fileName + " not found")
        sys.stderr.write(os.linesep)
        sys.exit(1)

def preprocessWord(word):
    # 1) All punctuation characters are removed from tokens
    word = word.strip(",.?:;!\"")
    # 2) Everything is converted to lower case
    word = word.lower()
    # # 3) Stop words are ignored
    # if word in stop_words:
    #     return ""
    # 4) dash characters between words are removed (brain-washed => brainwashed)
    word = word.replace("-", "")
    # 5) numbers and mixed alphanumeric strings (e.g. Win2K) are ignored
    if not word.isalpha():
        return ""
    # 6) Stemming is performed using PyStemmer
    word = stemmer.stemWord(word)
    return word
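
# A sketch of the expected behavior (the exact stems depend on the
# PyStemmer English stemmer in use):
#
#   preprocessWord("Brain-washed,")  =>  "brainwash"
#   preprocessWord("Win2K")          =>  ""  (mixed alphanumeric)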

# collect and preprocess (filter and stem) all words of the line
def preprocessLine(line):
    words = set()
    for w in line.split():
        for subw in w.split("'"):  # split on ' (e.g. "you're")
            word = preprocessWord(subw)
            if word:
                words.add(word)
    return words
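
# For instance (assuming the stems above), tokens are split on
# whitespace and apostrophes before being preprocessed:
#
#   preprocessLine("You're brain-washed!")  =>  {"you", "re", "brainwash"}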

# read a dmoz_doc and return the set of words that it contains,
# filtering and stemming the words on the way
def DocWords(doc):
    words = set()
    for l in doc:
        words |= preprocessLine(l)
    return words

# Return a list of word sets; each element of the list corresponds to
# a document and holds the set of words that appear in it
def FListWords(File):
    lines = File.readlines()
    start_doc = False
    res = []  # list of word sets
    for l in lines:
        if "<dmoz_doc>" in l:
            start_doc = True
            doc = []
        elif "</dmoz_doc>" in l:
            start_doc = False
            res.append(DocWords(doc))
        if start_doc:
            doc.append(l)
    return res
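
# The parsing above assumes the TechTC input wraps each document in
# <dmoz_doc> ... </dmoz_doc> tags appearing on their own lines, e.g.:
#
#   <dmoz_doc>
#   Some page text ...
#   </dmoz_doc>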

# gather all the words from the list of word sets. Words must appear
# in at least 3 documents to be included in the result
def gatherWords(listWS):
    mw = []  # list of all words including duplicates
    for ws in listWS:
        mw += list(ws)
    cw = Counter(mw)  # multiset of words (using Counter)
    # 7) Words appearing in less than 3 documents are removed
    words = set([w for w in cw.keys() if cw[w] > 2])
    return words
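
# A minimal worked example of the document-frequency filter: given
# [{"cat", "dog"}, {"cat"}, {"cat", "fish"}], Counter yields cat:3,
# dog:1, fish:1, so gatherWords returns {"cat"}.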

# take the positive and negative document files and write the CSV
# table to outputFileName (or stdout if no output file is given)
def convertF2CSV(posFile, negFile, targetVar, outputFileName):
    # list of all positive docs with all present words
    plws = FListWords(posFile)
    # list of all negative docs with all present words
    nlws = FListWords(negFile)
    # list of all words
    words = gatherWords(plws + nlws)
    print(outputFileName, "has", len(words), "words")
    # write the header of the CSV, i.e. the list of words, with
    # targetVar as the last column
    outputFile = open(outputFileName, "w") if outputFileName else sys.stdout
    for w in words:
        outputFile.write(w + ",")
    outputFile.write(targetVar)
    outputFile.write(os.linesep)
    # write alternating positive and negative rows (so that truncating
    # the data will remain unbiased)
    for i in range(max(len(plws), len(nlws))):
        if i < len(plws):
            for w in words:
                outputFile.write(("1" if w in plws[i] else "0") + ",")
            outputFile.write("1")  # because it is positive
            outputFile.write(os.linesep)
        if i < len(nlws):
            for w in words:
                outputFile.write(("1" if w in nlws[i] else "0") + ",")
            outputFile.write("0")  # because it is negative
            outputFile.write(os.linesep)
    if outputFile is not sys.stdout:
        outputFile.close()
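
# Sketch of the resulting layout for words {w1, w2} and target
# variable "target" (actual column order follows set iteration order):
#
#   w1,w2,target
#   1,0,1    <- first positive document
#   0,1,0    <- first negative document
#   ...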

# take the file names of the positive and negative files and write
# the CSV table
def convertFN2CSV(posFileName, negFileName, targetVar, outputFileName):
    check_file("Positive XML file", posFileName)
    check_file("Negative XML file", negFileName)
    posFile = open(posFileName)
    negFile = open(negFileName)
    convertF2CSV(posFile, negFile, targetVar, outputFileName)

def main():
    usage = "usage: %prog POSITIVE_FILE NEGATIVE_FILE TARGET_VAR_NAME [-o OUTPUT_FILE]"
    parser = OptionParser(usage)
    parser.add_option("-o", "--output-file",
                      dest="outputFile",
                      help="File where to output the result. If not specified the result is printed on stdout.")
    (options, args) = parser.parse_args()
    if len(args) != 3:
        parser.error("incorrect number of arguments. Use --help to get more information")
    posFN = args[0]
    negFN = args[1]
    targetVar = args[2]
    # # for debugging
    # print("posFN = " + posFN)
    # print("negFN = " + negFN)
    # print("target = " + targetVar)
    # print("outputFN = " + options.outputFile)
    convertFN2CSV(posFN, negFN, targetVar, options.outputFile)

if __name__ == '__main__':
    main()