-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpartition_yelp_corpus.py
48 lines (41 loc) · 1.3 KB
/
partition_yelp_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
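"""
Script to split the Yelp corpus into train and test files.

Usage: python partition_yelp_corpus.py <input_file> <train_output> <test_output>

The input is treated as one example per line. A random subset of the lines
(test_split_ratio = 0.15, fixed seed 42) is written to the test file and the
remaining lines to the train file; the original line order is kept in both.
"""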
import random
import sys
if __name__ == '__main__':
    # Split a line-oriented corpus into a train file and a test file.
    #
    # Command-line arguments:
    #   sys.argv[1]  path of the input file (one example per line, UTF-8)
    #   sys.argv[2]  path of the train output file (overwritten)
    #   sys.argv[3]  path of the test output file (overwritten)
    #
    # The input is read twice (once to count lines, once to copy them) so the
    # corpus itself is never held in memory; only the line indexes are.
    if len(sys.argv) < 4:
        sys.exit('Usage: {} <input_file> <train_output> <test_output>'.format(
            sys.argv[0]))

    seed = 42
    random.seed(seed)  # fixed seed so the same split is produced on every run
    test_split_ratio = 0.15
    shuffle = True

    file_input_path = sys.argv[1]
    train_output_path = sys.argv[2]
    test_output_path = sys.argv[3]

    # First pass: count the lines to know how many indexes to draw from.
    with open(file_input_path, 'r', encoding='utf8') as f:
        nb_lines = sum(1 for _ in f)

    data_indexes = list(range(nb_lines))
    if shuffle:
        # Without shuffling the test set would simply be the tail of the file.
        random.shuffle(data_indexes)

    # The first `split_idx` shuffled indexes go to train, the rest to test.
    split_idx = int((1 - test_split_ratio) * nb_lines)
    test_indexes = set(data_indexes[split_idx:])
    # Indexes are unique, so the set sizes follow directly from the split
    # point; there is no need to materialize the (much larger) train set.
    nb_train = split_idx
    nb_test = nb_lines - split_idx

    print('Nb examples')
    print(nb_lines)
    print('Nb examples to train | test')
    print(nb_train, nb_test)

    # Second pass: route every line to its output file. The `with` statement
    # guarantees all three files are closed even if a read or write fails.
    with open(train_output_path, 'w', encoding='utf8') as f_train, \
            open(test_output_path, 'w', encoding='utf8') as f_test, \
            open(file_input_path, 'r', encoding='utf8') as f:
        for idx, line in enumerate(f):
            # '\r' keeps the progress counter on a single terminal line.
            print('{}/{}'.format(idx, nb_lines), end='\r')
            if idx in test_indexes:
                f_test.write(line)
            else:
                f_train.write(line)