-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
57 lines (44 loc) · 1.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""Demo of a basic NLP pipeline with NLTK: sentence segmentation, word
tokenization, stop-word removal, stemming, lemmatization, POS tagging,
and named-entity recognition, printed stage by stage."""
from pprint import pprint
import nltk

# Download only the specific resources this script needs. nltk.download(pkg)
# is a no-op when the package is already installed, so this effectively runs
# once — the original parameterless nltk.download() opened the interactive
# downloader every run.
for _resource in ('punkt', 'stopwords', 'wordnet',
                  'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(_resource, quiet=True)

text = """Did you know that The Rock and Dwayne Johnson are the same person?
I certainly didn't.
Also, you've never seen Childish Gambino and Donald Glover at the same place, right?
It makes me wonder..."""

# Sentence segmentation.
segments = nltk.tokenize.sent_tokenize(text)
print('\nSEGMENTAÇÃO')
pprint(segments)

# Shared NLP helpers, built once outside the loop.
stop_words = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Per-sentence results; index i of each list corresponds to segments[i].
tokens = []
filtered_tokens = []
stemmed_tokens = []
lemmatized_tokens = []
tagged_tokens = []
entities = []

for segment in segments:
    segment_tokens = nltk.word_tokenize(segment)
    tokens.append(segment_tokens)
    # Compare case-insensitively: the stop-word list is all lowercase, so the
    # original exact-match test let capitalized stop words ("Did", "I",
    # "Also") slip through the filter.
    filtered_tokens.append(
        [tok for tok in segment_tokens if tok.lower() not in stop_words])
    stemmed_tokens.append([stemmer.stem(tok) for tok in segment_tokens])
    # pos="v" lemmatizes every token as if it were a verb.
    lemmatized_tokens.append(
        [lemmatizer.lemmatize(tok, pos="v") for tok in segment_tokens])
    tagged = nltk.pos_tag(segment_tokens)
    tagged_tokens.append(tagged)
    entities.append(nltk.chunk.ne_chunk(tagged))

print('\nTOKENIZAÇÃO')
pprint(tokens)
print('\nREMOÇÃO DE STOP WORDS')
pprint(filtered_tokens)
print('\nSTEMMING')
pprint(stemmed_tokens)
print('\nLEMATIZAÇÃO')
pprint(lemmatized_tokens)
print('\nPOS TAGGING')
pprint(tagged_tokens)
print('\nNAMED ENTITIES RECOGNITION')
pprint(entities)