-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathBagOfWords.py
65 lines (34 loc) · 915 Bytes
/
BagOfWords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# coding: utf-8
# In[1]:
#import library
from sklearn.feature_extraction.text import CountVectorizer
# In[2]:
# Create the CountVectorizer instance (default settings) that is fitted on
# the sample documents below.
vectoriser = CountVectorizer()
# In[3]:
# Sample documents; the second one repeats 'very' and 'fun' three times each
# so that repeated words are visible in the printed result.
document1 = 'Hi How are you'
document2 = 'today is a very very very pleasant day and we can have some fun fun fun'
document3 = 'This was an amazing experience'
# In[4]:
# Collect the documents into one list so they are processed together.
listofdocuments = [document1, document2, document3]
# In[5]:
# Fit the vectoriser on the documents. bag_of_words temporarily holds
# whatever fit() returns; it is rebound to the transformed documents in
# cell 7.
bag_of_words = vectoriser.fit(listofdocuments)
# In[6]:
# Notebook-style inspection: a bare expression is displayed by a notebook
# cell but produces no output when this file is run as a plain script.
bag_of_words
# In[7]:
# Transform the same documents with the fitted vectoriser.
bag_of_words = vectoriser.transform(listofdocuments)
# In[8]:
# Show the transformed documents. print(...) with a single argument is used
# so the line is valid on Python 3 and prints the same text on Python 2.
print(bag_of_words)
# In[9]:
# Look up 'very' and 'fun' in the fitted vocabulary_ via its get() method.
print(vectoriser.vocabulary_.get('very'))
print(vectoriser.vocabulary_.get('fun'))
# In[10]:
# Notebook-style inspection of the result's type (no output in a script).
type(bag_of_words)