cleaning.py

import re
import string

import nltk
import pandas as pd
import wordninja
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# First-run downloads (uncomment if the corpora are not installed yet):
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')  # needed only when clean() is called with lemma_=True

# 1. Load the dataset into dataframe df
df = pd.read_csv('data/haha_2019_test.csv')

stop = set(stopwords.words('spanish'))
exclude = set(string.punctuation)

# Clean emojis: drop any character outside the Basic Multilingual Plane
def clean_emoji(sen):
    sen = ''.join(c for c in sen if c <= '\uFFFF')
    # Normalize non-breaking spaces to plain spaces ('\u00a0' is an assumption;
    # the original replacement characters appear to have been lost in encoding)
    return sen.replace('\u00a0', ' ')
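
# Illustrative example (hypothetical input): most emoji live above U+FFFF,
# so clean_emoji("hola 😂") returns "hola " with the emoji dropped.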

# Further cleaning
def clean(sen, remove_stopwords=True, contraction=True, pun=True, lemma_=False):
    # re.sub(pattern, repl, string, count=0, flags=0)
    #   pattern: the regular-expression pattern to match
    #   repl:    the replacement (either a string or a function)
    #   string:  the input string to process
    #   count:   maximum number of substitutions (default 0 = replace all)
    #   flags:   optional regex flags
    # Remove templated links of the form {{...}}
    sen = re.sub(r'\{\{(.*?)\}\}', "", sen)
    # Catch the leftover links that have no closing braces
    sen = re.sub(r'\{\{(.*)', "", sen)
    # Remove the quotes that are left over by the filter
    sen = re.sub(r'\'+', "", sen)
    # Remove the filenames of images but retain the title text they are called from
    sen = re.sub(r'(.*)\|', "", sen)
    # Strip punctuation from the ends of the sentence and of every word
    sen = sen.strip(""" '!:?-_().,'"[]{};*""")
    sen = ' '.join([w.strip(""" '!:?-_().,'"[]{};*""") for w in re.split(' ', sen)])
    # Replace numeric tokens (signs, decimals, separators included) with a placeholder
    sen = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", " NUMBER ", sen)
    # Split long concatenated words
    tokens = []  # collected tokens (a name that does not shadow the string module)
    for x in sen.split():
        if len(x) > 6:
            for i in wordninja.split(x):  # split run-together words
                if len(i) > 2:
                    tokens.append(i)
        else:
            tokens.append(x)
    sen = " ".join(tokens)
    # Optionally expand contractions; the mapping itself is commented out,
    # so this branch currently rebuilds the sentence unchanged
    if contraction:
        new_text = []
        for word in sen.split():  # split on whitespace by default
            # if word in contractions:
            #     new_text.append(contractions[word])
            # else:
            new_text.append(word)
        sen = " ".join(new_text)
sen = re.sub(r"[^A-Za-z0-9:(),\'\`]", " ", sen)
sen = re.sub(r"\b\d+\b", "", sen) #remove numbers
sen = re.sub('\s+', ' ', sen) #matches any whitespace characte
sen = re.sub(r'(?:^| )\w(?:$| )', ' ', sen).strip() #removing single character
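    # Illustrative example (hypothetical input): a trailing "canción" becomes
    # "canci n" under the ASCII filter, and the single-character pass then
    # drops the stranded "n", leaving "canci".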
    # Optionally remove stop words (the nltk list is lowercase, so capitalized
    # tokens pass through; lowercasing only happens at the return)
    if remove_stopwords:
        sen = " ".join([i for i in sen.split() if i not in stop])
    # Optionally remove punctuation
    if pun:
        sen = ''.join(ch for ch in sen if ch not in exclude)
    # Optionally lemmatize (requires the nltk 'wordnet' corpus)
    if lemma_:
        sen = " ".join(WordNetLemmatizer().lemmatize(word) for word in sen.split())
    return sen.strip().lower()  # convert to lowercase
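
# Illustrative usage (hypothetical example; the exact output depends on the
# wordninja model and the stopword list):
#   clean("Visita {{http://example.com}} 123 veces!", remove_stopwords=False)
#   -> "visita number veces"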

# Clean the dataset
clean_data = []
for index, row in df['text'].items():  # .items() yields (index, value) pairs; iteritems() is gone in pandas 2.x
    row = clean_emoji(str(row))
    row = clean(row, remove_stopwords=False)
    # print(row)
    clean_data.append(row)
# Inspect the cleaned texts to ensure they have been cleaned well
for i in range(len(df['text'])):
    print("Clean Review #", i + 1)
    print(clean_data[i])
    print()
df['text'] = clean_data
df.to_csv("data/cleaned_data_test_haha.csv", index=False, encoding='utf-8')
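
# Optional sanity check (a minimal sketch, assuming the paths above): reload
# the written file and confirm no rows were lost on the round trip.
checked = pd.read_csv("data/cleaned_data_test_haha.csv")
assert len(checked) == len(df), "row count changed after cleaning"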