newselautil.py
# Utils for newsela articles
# Author: S. Anderson
# Modified: A. Fedchin
import io
import re
import string
import csv
import nltk.data
import classpaths as path
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
STOPWORDS = stopwords.words('english')
STOPWORDS.append("`s") # TODO: Should this really be appended?
STOPWORDS.append("n`t")
# also add capitalized variants so sentence-initial stopwords are caught
for i in range(len(STOPWORDS) - 2):
    STOPWORDS.append(STOPWORDS[i][0].capitalize() + STOPWORDS[i][1:])
HDR = ['title', 'filename', 'grade_level', 'language', 'version', 'slug']
Tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
Wordtokenizer = TreebankWordTokenizer()
Lemmatizer = WordNetLemmatizer()
htmltag_rm = re.compile(r'(<!--.*?-->|<[^>]*>)')
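
# Illustrative sketch (not from the original file): the objects above are the
# module-wide NLP resources. `htmltag_rm` strips HTML comments and tags, e.g.
#   htmltag_rm.sub('', 'A <b>bold</b> word <!-- note -->')  ->  'A bold word '
# while `Tokenizer` and `Wordtokenizer` split raw text into sentences and
# tokens respectively, e.g.
#   Tokenizer.tokenize('First sentence. Second one.')
#       -> ['First sentence.', 'Second one.']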


def loadMetafile():
    """Return list of dictionaries, one for each English Newsela file."""
    numArticles = 0
    enArticles = 0
    articles = []  # all articles
    with open(path.METAFILE, 'r') as meta:
        reader = csv.DictReader(meta, delimiter=',')
        for row in reader:
            numArticles += 1
            if row['language'] == 'en':
                enArticles += 1
                articles.append(row)
    return articles
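
# Usage sketch (assumed, not part of the original module): each returned row
# is a dict keyed by the columns listed in HDR, so the English articles can
# be grouped by slug roughly like this:
#
#   from collections import defaultdict
#   by_slug = defaultdict(list)
#   for art in loadMetafile():
#       by_slug[art['slug']].append(art)
#   # each value now holds all simplification versions of one article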


def cleanSentence(s):
    """Clean one string."""
    # if s[0] == '#': return ''  # skip lines that are section titles
    return htmltag_rm.sub('', s)  # remove html tags


def cleanSentences(slist):
    """Strip material from list of sentences."""
    sentlist = []
    for s in slist:
        s1 = cleanSentence(s)
        if s1 == '' or s1.isspace():
            continue  # skip empty or whitespace-only sentences
        sentlist.append(s1)
    return sentlist


# Some articles begin with the name of the city in which the event discussed
# took place. These "location tags" should not affect alignment and should
# be removed. Below are some capitalized words with which a line may begin
# but which are not location tags and should be kept.
CAPITALIZED_WORDS = ['A', 'FBI', 'FDA', 'NASA', 'DNA', 'TV', 'UFO']
OTHER_HEADERS = '(SEATTLE - |NEW YORK - |WASHINGTON - |BEIJING - |CHICAGO - |' \
                'GORDONVILLE , Pa. - |LOS ANGELES - |BAGHDAD - |' \
                'SAN JOSE , Calif. - |AMSTERDAM - |BAMAKO , Mali - |' \
                'DAKAR , Senegal - |PARIS - |PARIS , France - |' \
                'WASHINGTON , D.C. - |RIYADH - |SAN FRANCISCO , Calif. - |' \
                'CAIRO - |CHICAGO , Ill. - )'
# location headers that are difficult to detect with the generic pattern below


def modify_the_header(line):
    """
    Delete the not-very-useful headers such as "SEATTLE, Wash. -- "
    :param line: the line to modify
    :return: the modified line
    """
    line = re.sub('### PRO : ', '', line)  # strip the "### PRO : " section marker
    line = re.sub('<.*> ', '', line)  # remove anything between the first '<' and the last '> '
    if line.split(' ')[0] not in CAPITALIZED_WORDS:
        line = re.sub(r'^[A-Z][A-Z\-.]* .*?-- ', '', line)  # e.g. "SEATTLE, Wash. -- "
        line = re.sub('^' + OTHER_HEADERS, '', line)
    # firstWord == "!\n": # a header with an image
    # TODO
    return line
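
# Illustrative sketch (not in the original file): the generic pattern above
# removes all-caps datelines followed by "-- ", while OTHER_HEADERS catches
# the explicitly listed special cases. Assuming the tokenized spacing shown:
#
#   modify_the_header('WASHINGTON , D.C. -- Lawmakers met on Tuesday .')
#       -> 'Lawmakers met on Tuesday .'
#   modify_the_header('FBI agents said the case was closed .')
#       -> unchanged, because "FBI" is in CAPITALIZED_WORDS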


def getTokParagraphs(article, separateBySemicolon=True, MODIFY_HEADER=True):
    """
    Return list of paragraphs. Each paragraph is a list of strings, each of
    which is an already tokenized sentence. File suffix should be .tok
    :param article: one metafile row (a dict with at least a 'filename' key)
    :param separateBySemicolon: if True, the parts of one sentence separated by
        a semicolon are treated as separate sentences
    :param MODIFY_HEADER: whether the program should 'clean' the headers
    :return: the list of paragraphs
    """
    SUFFIX = ".tok"
    PARPREFIX = "@PGPH "  # delimits paragraphs in FILE.tok
    pars = []
    slist = []
    with io.open(path.BASEDIR + '/articles/' + article['filename'] + SUFFIX,
                 mode='r', encoding='utf-8') as fd:
        lines = fd.readlines()
    if MODIFY_HEADER:
        lines[1] = modify_the_header(lines[1])
    for i in range(len(lines)):
        if separateBySemicolon:
            phrases = lines[i].split(";")
            for phrase in phrases:
                if phrase[0:len(PARPREFIX)] == PARPREFIX:  # new paragraph
                    cleaned = cleanSentences(slist)
                    if len(cleaned) > 0:
                        pars.append(cleaned)
                    slist = []
                else:
                    slist.append(phrase.rstrip('\n'))
        else:
            # without considering ";" to be a delimiter
            if lines[i][0:len(PARPREFIX)] == PARPREFIX:
                cleaned = cleanSentences(slist)
                if len(cleaned) > 0:
                    pars.append(cleaned)
                slist = []
            else:
                slist.append(lines[i].rstrip('\n'))
    cleaned = cleanSentences(slist)  # flush the last paragraph
    if len(cleaned) > 0:
        pars.append(cleaned)
    return pars
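
# Sketch of the expected input (an assumption based on the parsing above, not
# taken from the original documentation): a FILE.tok holds one tokenized
# sentence per line, with a line starting with "@PGPH " opening each
# paragraph, e.g.
#
#   @PGPH 1
#   The storm hit the coast on Monday .
#   Thousands of people lost power ; crews worked overnight .
#   @PGPH 2
#   Officials said repairs could take a week .
#
# With separateBySemicolon=True the second sentence above is returned as two
# strings. A typical call would be:
#
#   pars = getTokParagraphs(loadMetafile()[0])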


PENNPOS = ['N', 'V', 'J', 'R']
WNETPOS = [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]


def convertPOS(pos):
    """Convert pos to wordnet pos"""
    for i in range(len(PENNPOS)):
        if pos[0] in PENNPOS[i]:
            return WNETPOS[i]
    return None
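
# Illustrative mapping (consistent with the lists above): Penn Treebank tags
# are grouped by their first letter, so e.g.
#   convertPOS('NNS')  -> wordnet.NOUN
#   convertPOS('VBD')  -> wordnet.VERB
#   convertPOS('JJR')  -> wordnet.ADJ
#   convertPOS('DT')   -> None  (no wordnet counterpart; lemmatize() then
#                                falls back to the lemmatizer's noun default)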


def lemmatize(s):
    """Return list of lemmas for string s, a sentence."""
    tokens = Wordtokenizer.tokenize(s)
    # tokens = [x for x in tokens if x.lower() == x]
    # remove any string with uppercase char (eg, proper names)
    cleantokens = []
    for w in tokens:
        try:
            w.encode('ascii')  # keep only tokens that encode as plain ASCII
            if w not in string.punctuation:
                cleantokens.append(w)
        except UnicodeEncodeError:
            # print "Not ascii:", repr(w)
            pass
    w_tagged = nltk.pos_tag(cleantokens)
    lemmas = []
    for word, pos in w_tagged:
        wordnetPOS = convertPOS(pos)
        if wordnetPOS is None:
            lemmas.append(Lemmatizer.lemmatize(word))
        else:
            lemmas.append(Lemmatizer.lemmatize(word, pos=wordnetPOS))
    # i = 0
    # while i < len(lemmas):
    #     lemmas[i] = lemmas[i].lower()
    #     i += 1
    return lemmas
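

# Minimal end-to-end sketch (assumed usage, not part of the original module):
# run the file directly to lemmatize the first paragraph of the first English
# article listed in the metafile. Requires the NLTK data used above (punkt,
# stopwords, wordnet, and the default POS tagger) to be installed.
if __name__ == '__main__':
    articles = loadMetafile()
    if articles:
        paragraphs = getTokParagraphs(articles[0])
        if paragraphs:
            for sent in paragraphs[0]:
                print(lemmatize(sent))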