-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpre_process.py
executable file
·65 lines (49 loc) · 1.56 KB
/
pre_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import nltk
import string
# English stopword set used by the token filters below.
stopwords = set(nltk.corpus.stopwords.words('english'))
# nltk.word_tokenize keeps hyphenated words ("cross-language") and
# decimal numbers as single tokens, which is the desired behaviour here.
# do not split up hyphenated word "cross-language"
# do not split up decimal number
punctuations = set(string.punctuation)
# Porter Stemmer — NOTE(review): `stemmer` is not referenced anywhere in
# this file's visible code; presumably used by another module — confirm.
stemmer = nltk.stem.PorterStemmer()
# WordNet lemmatizer used by lemmatize() below.
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def lemmatize(word):
    """Lemmatize *word*, trying the verb form first, then the noun form.

    Returns the verb lemma when it changes the word; otherwise falls back
    to the noun lemma (which may simply be the word itself).
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    return as_verb if as_verb != word else lemmatizer.lemmatize(word, 'n')
def process_sen(sentence, remove_stopwords):
    """Tokenize *sentence*, dropping punctuation and, optionally, stopwords.

    Args:
        sentence: raw sentence string; tokenized with nltk.word_tokenize.
        remove_stopwords: when truthy, English stopwords are filtered out.

    Returns:
        A list of the surviving token strings, in order.
    """
    words = []
    for word in nltk.word_tokenize(sentence):
        if word in punctuations:
            continue
        # BUG FIX: the original tested `not remove_stopwords`, so stopwords
        # were filtered only when the caller asked to KEEP them. The flag
        # now behaves as its name implies.
        if remove_stopwords and word in stopwords:
            continue
        words.append(word)
    return words
def sen_tokenize(wiki):
    """Tokenize each sentence string in *wiki* into a list of word tokens.

    Returns a list of token lists, one per input sentence.
    """
    return [nltk.word_tokenize(raw_sentence) for raw_sentence in wiki]
def preprocess(wiki, remove_stopwords):
    """Apply process_sen to every sentence in *wiki*.

    Returns a list of processed token lists, one per input sentence,
    preserving input order.
    """
    return [process_sen(raw_sentence, remove_stopwords) for raw_sentence in wiki]
def process_question(sentence, remove_stopwords):
    """Tokenize and normalize a question string.

    Each kept token is lowercased and lemmatized via lemmatize();
    punctuation tokens are always dropped.

    Args:
        sentence: raw question string; tokenized with nltk.word_tokenize.
        remove_stopwords: when truthy, English stopwords are filtered out.

    Returns:
        A list of normalized (lowercased, lemmatized) token strings.
    """
    words = []
    for word in nltk.word_tokenize(sentence):
        if word in punctuations:
            continue
        # BUG FIX: the original tested `not remove_stopwords`, filtering
        # stopwords exactly when the caller asked to keep them.
        if remove_stopwords and word in stopwords:
            continue
        # CONSISTENCY FIX: the original's keep-everything branch appended
        # the raw token, skipping the lowercase+lemmatize normalization
        # that the other branch applied. Normalize uniformly.
        words.append(lemmatize(word.lower()))
    return words