cleaning.py

import pandas as pd
import re
import nltk
import sys
# nltk.download('stopwords')
# import nltk
# nltk.download('punkt')

#1. Load the dataset into dataframe df
df = pd.read_csv('data/haha_2019_test.csv')

import csv
import string
import wordninja
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('spanish'))
exclude = set(string.punctuation) 

#clean emojis
def clean_emoji(sen): 
    sen = ''.join(c for c in sen if c <= '\uFFFF')
    return sen.replace("  ", " ")

#further cleaning
def clean(sen,remove_stopwords = True, contraction = True, pun= True,lemma_= False):
#     re.sub(pattern, repl, string, count=0, flags=0)

# pattern：表示正则表达式中的模式字符串；

# repl：被替换的字符串（既可以是字符串，也可以是函数）；

# string：要被处理的，要被替换的字符串；

# count：匹配的次数, 默认是全部替换

# flags：具体用处不详


    sen = re.sub(r'\{\{(.*?)\}\}', "", sen)
    #catch the left over links that have no closing braces
    sen = re.sub(r'\{\{(.*)', "", sen)
    #remove the quotes that are left over, the filter 
    sen = re.sub(r'\'+', "", sen)
    #remove the filenames of images but retain the title text they are called from
    sen = re.sub(r'(.*)\|', "",sen)
    sen = sen.strip(""" '!:?-_().,'"[]{};*""")


    sen = ' '.join([w.strip(""" '!:?-_().,'"[]{};*""") for w in re.split(' ', sen)])

    sen = re.sub("[-+]?[.\d]*[\d]+[:,.\d]*", " NUMBER ", sen)
   
    # spliting words
    string = []
    for x in sen.split():
        if len(x)>6:
            for i in wordninja.split(x):#分词
                if len(i)>2:
                    string.append(i)
        else:
            string.append(x)
    sen = " ".join(string)
    
    contraction  
    new_text = []
    for word in sen.split():#切片 默认空格

        # if word in contractions:
        #     new_text.append(contractions[word])
        # else:

        new_text.append(word)
    sen = " ".join(new_text)

    sen = re.sub(r"[^A-Za-z0-9:(),\'\`]", " ", sen)
    sen = re.sub(r"\b\d+\b", "", sen)  #remove numbers 
    sen = re.sub('\s+',  ' ', sen) #matches any whitespace characte
    sen = re.sub(r'(?:^| )\w(?:$| )', ' ', sen).strip() #removing single character
   
     # Optionally, remove stop words
    if remove_stopwords:
        sen = " ".join([i for i in sen.split() if i not in stop])
       
    # Optionally emove puncuations 
    if pun:
        sen = ''.join(ch for ch in sen if ch not in exclude)
    
    # Optionally lemmatiztion  
    if lemma_:
        normalized = " ".join(WordNetLemmatizer().lemmatize(word) for word in sen.split())        
        
    return sen.strip().lower()#转成小写

# Cleaning the dataset 
clean_data = []
for index, row in df['text'].iteritems():#（）返回迭代器（index:row）
    row = clean_emoji(str(row))
    row = clean(row, remove_stopwords=False)
    #print(row)
    clean_data.append(row)

# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(len(df['text'])):
    print("Clean Review #",i+1)
    print(clean_data[i])
    print()
df['text']=clean_data
df.to_csv("data/cleaned_data_test_haha.csv", index=False , encoding='utf-8')