preprocessing.py
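"""Preprocess several Chinese sentence-pair corpora into one tab-separated file.

Each kept example is written as '<sentence one>\t<sentence two>\t<label>', with
sentences tokenized by jieba and stopwords removed. The column positions and
delimiters for the individual corpora are the ones assumed in cleanup_data below.
"""
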
import argparse
from pathlib import Path
from typing import List
import jieba


def get_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description='preprocess the dataset')
    parser.add_argument('--data-path', help='dataset to be processed', type=str, default='./data')
    return parser.parse_args()


def get_stopwords(path: str) -> List[str]:
    """Read the stopword list (one word per line) from the given path."""
    with open(path, 'r', encoding='utf-8') as stopwords_file:
        lines = stopwords_file.readlines()
    return [line.strip() for line in lines]
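# stopwords.txt is expected to hold one stopword per line, e.g. (illustrative):
#   的
#   了
#   吗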


def parse_sentence_to_char(sentence: str, stopwords: List[str]) -> List[str]:
    """Split a sentence into characters, dropping stopwords and spaces."""
    characters = []
    for char in sentence:
        if char not in stopwords and char != ' ':
            characters.append(char)
    return characters
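# parse_sentence_to_char is a character-level alternative; cleanup_data below
# only uses the word-level variant, so this function is currently unused.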


def parse_sentence_to_word(sentence: str, stopwords: List[str]) -> List[str]:
    """Tokenize a sentence into words with jieba, dropping stopwords and spaces."""
    words = []
    for word in jieba.cut(sentence):
        if word not in stopwords and word != ' ':
            words.append(word)
    return words
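# Illustrative example (the actual segmentation depends on jieba's dictionary and
# on the words added or removed in cleanup_data):
#   parse_sentence_to_word('如何更改花呗手机号码', stopwords=[])
#   might return something like ['如何', '更改', '花呗', '手机', '号码'].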


def cleanup_corpus(corpus_lines, split_symbol, start_index, number_of_columns, stopwords):
    """Clean up the given corpus into tab-separated sentence-pair lines."""
    sentences = []
    for corpus_line in corpus_lines:
        raw_data = corpus_line.split(split_symbol)
        # Skip malformed rows before indexing into them.
        if len(raw_data) != number_of_columns:
            continue
        sentence_one_tokens = parse_sentence_to_word(sentence=raw_data[start_index], stopwords=stopwords)
        sentence_two_tokens = parse_sentence_to_word(sentence=raw_data[start_index + 1], stopwords=stopwords)
        # Drop pairs where either side is empty after stopword removal.
        if len(sentence_one_tokens) == 0 or len(sentence_two_tokens) == 0:
            continue
        sentence_one = ' '.join(sentence_one_tokens)
        sentence_two = ' '.join(sentence_two_tokens)
        label = raw_data[start_index + 2]
        processed_sentence = '\t'.join([sentence_one, sentence_two, label]).strip()
        sentences.append(processed_sentence)
    return sentences
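# Each kept line has the form (tokens are space-joined inside each sentence):
#   '<sentence one tokens>\t<sentence two tokens>\t<label>'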


def cleanup_data(data_path: str):
    """Clean up the raw corpora under data_path into the desired format."""
    stopwords = get_stopwords(path=f'{data_path}/stopwords.txt')
    # Tune jieba's dictionary: add domain words, remove entries that are not real words.
    with open(f'{data_path}/dictionary', 'r', encoding='utf-8') as dictionary, \
            open(f'{data_path}/not_word', 'r', encoding='utf-8') as not_word:
        dictionary_lines = dictionary.readlines()
        not_word_lines = not_word.readlines()
    for dictionary_line in dictionary_lines:
        jieba.add_word(dictionary_line.strip())
    for not_word_line in not_word_lines:
        jieba.del_word(not_word_line.strip())
    with open(f'{data_path}/ant_train', 'r', encoding='utf-8') as ant_train, \
            open(f'{data_path}/ant_train_add', 'r', encoding='utf-8') as ant_train_add, \
            open(f'{data_path}/epidemic_dev.csv', 'r', encoding='utf-8') as epidemic_dev, \
            open(f'{data_path}/epidemic_train.csv', 'r', encoding='utf-8') as epidemic_train, \
            open(f'{data_path}/icqmc_train.txt', 'r', encoding='utf-8') as icqmc_train, \
            open(f'{data_path}/icqmc_dev.txt', 'r', encoding='utf-8') as icqmc_dev, \
            open(f'{data_path}/icqmc_test.txt', 'r', encoding='utf-8') as icqmc_test, \
            open(f'{data_path}/simtrain_to05sts.txt', encoding='utf-8') as simtrain:
        # NOTE: simtrain_to05sts.txt is opened here but never read below.
        ant_train_lines = ant_train.readlines() + ant_train_add.readlines()
        epidemic_lines = epidemic_dev.readlines()[1:] + epidemic_train.readlines()[1:]
        icqmc_lines = icqmc_train.readlines()[1:] + icqmc_dev.readlines()[1:] + icqmc_test.readlines()[1:]
    sentences = []
    sentences += cleanup_corpus(epidemic_lines, ',', 2, 5, stopwords)
    sentences += cleanup_corpus(ant_train_lines, '\t', 1, 4, stopwords)
    sentences += cleanup_corpus(icqmc_lines, '\t', 0, 3, stopwords)
    # 'a+' appends, so repeated runs accumulate lines in ./preprocessed/data.
    with open('./preprocessed/data', 'a+', encoding='utf-8') as output_file:
        for sentence in sentences:
            output_file.write(f'{sentence}\n')


if __name__ == '__main__':
    args = get_args()
    # Make sure the output directory exists before cleanup_data appends to it.
    Path('preprocessed').mkdir(parents=True, exist_ok=True)
    cleanup_data(data_path=args.data_path)
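# Example invocation (assuming the corpora opened in cleanup_data all live under ./data):
#   python preprocessing.py --data-path ./data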