-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAssignment2.py
26 lines (22 loc) · 909 Bytes
/
Assignment2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import codecs
import string
import re
from nltk import word_tokenize
f = codecs.open("smallcleaned.txt", encoding="UTF-8")
contents = f.read()
#the initial cleaning process seems to miss formatting between braces
#this code will remove everything between two sets of braces
contents = re.sub(r'\{\{(.*?)\}\}', "", contents)
#catch the left over links that have no closing braces
contents = re.sub(r'\{\{(.*)', "", contents)
#remove the quotes that are left over, the filter
contents = re.sub(r'\'+', "", contents)
#remove the filenames of images but retain the title text they are called from
contents = re.sub(r'(.*)\|', "", contents)
tokens = word_tokenize(contents)
small = codecs.open('smalltokens.txt', 'w', 'utf8')
#punctuation is tokenized but they are not tokens, remove them
tokens = filter(lambda word: word not in ',-.:;()%', tokens)
for ele in tokens:
small.write(ele+'\n')
small.close()