data_util.py
import os
import csv
import json

import numpy as np
import tflearn
from six.moves import cPickle

MAX_LENGTH = 1014

class Alphabet:
    '''Character vocabulary used for character-level quantization of text.'''

    characters = '''abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n'''

    def __init__(self):
        # Map byte value -> alphabet index (1-based); bytes outside the alphabet map to 0.
        self.lookup = np.zeros(256, dtype=np.int32)
        for i, c in enumerate(self.characters):
            self.lookup[ord(c)] = i + 1
        # Map alphabet index -> byte value; index 0 decodes to a space.
        self.reverse_lookup = np.zeros(self.vocab_size + 1, dtype=np.uint8)
        for i, c in enumerate(self.characters):
            self.reverse_lookup[i + 1] = ord(c)
        self.reverse_lookup[0] = ord(' ')

    def transform(self, s):
        '''Return the index of each character in ``s``.

        Characters in the alphabet are indexed from 1; characters outside the
        alphabet get index 0. Upper case is folded to lower case.

        Args:
            s (str)
        Returns:
            np.ndarray (dtype np.int32) of the same length as ``s``
        '''
        # Encode to bytes so np.frombuffer also works under Python 3; characters
        # that do not fit in a single byte are replaced with '?'.
        arr = np.frombuffer(s.lower().encode('latin-1', 'replace'), dtype=np.uint8)
        return np.take(self.lookup, arr)

    def reverse(self, char_indices):
        '''Decode indices back to bytes, stripping leading/trailing whitespace.'''
        return np.take(self.reverse_lookup, char_indices).tobytes().strip()

    def reverse_raw(self, char_indices):
        '''Decode indices back to bytes without stripping.'''
        return np.take(self.reverse_lookup, char_indices).tobytes()

    @property
    def vocab_size(self):
        return len(self.characters)

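
# Illustrative round-trip through the alphabet ('Hello!' is an arbitrary example
# string, shown doctest-style):
#   >>> alphabet = Alphabet()
#   >>> indices = alphabet.transform('Hello!')
#   >>> alphabet.reverse(indices)
#   b'hello!'
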
def onehot(arr, dims):
    '''One-hot encode an integer index array into ``dims`` dimensions.'''
    return np.take(np.identity(dims), arr, axis=0)


def clean_str(s):
    # Unescape literal '\n', '\\' and '\"' sequences and turn HTML line breaks into newlines.
    return s.replace('\\n\\n', '\n').replace('\\n', '\n').replace('\\\\', '\\').replace('\\"', '"').replace('<br /><br />', '\n')


def labels_onehot(labels):
    '''One-hot encode a list of labels; class ids follow the sorted label names.'''
    class_names = list(sorted(set(labels)))
    num_classes = len(class_names)
    class_id = dict(zip(class_names, range(num_classes)))
    y = onehot(np.array([class_id[label] for label in labels]), num_classes)
    return y

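
# Illustrative example (hypothetical 'pos'/'neg' labels; any label values work):
#   >>> labels_onehot(['pos', 'neg', 'pos'])
#   array([[0., 1.],
#          [1., 0.],
#          [0., 1.]])
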
def load_dataset_csv(filepath, target_column=0):
    '''Load a CSV (or JSON) dataset and return padded character indices ``x``
    and one-hot labels ``y``.'''
    if filepath.find('aclImdb') != -1:
        target_column = 1
    alphabet = Alphabet()
    if not os.path.exists(filepath):
        # Fall back to a JSON file with 'texts' and 'labels' entries.
        filepath = filepath.replace('csv', 'json')
        with open(filepath, 'r') as f:
            dataset = json.load(f)
        texts = dataset['texts']
        labels = dataset['labels']
    else:
        texts, labels = tflearn.data_utils.load_csv(filepath, target_column=target_column, has_header=False)
        # load_csv yields each row as a list of columns; join them into one string.
        texts[:] = [clean_str(' '.join(text)) for text in texts]
    x = tflearn.data_utils.pad_sequences([alphabet.transform(text) for text in texts], maxlen=MAX_LENGTH)
    y = labels_onehot(labels)
    print('Data from {} loaded.'.format(filepath))
    return x, y

def load_dataset_csv_raw(filepath, target_column=0):
    '''Load a CSV (or JSON) dataset and return the raw texts and labels.'''
    if filepath.find('aclImdb') != -1:
        target_column = 1
    if not os.path.exists(filepath):
        # Fall back to a JSON file with 'texts' and 'labels' entries.
        filepath = filepath.replace('csv', 'json')
        with open(filepath, 'r') as f:
            dataset = json.load(f)
        texts = dataset['texts']
        labels = dataset['labels']
    else:
        texts, labels = tflearn.data_utils.load_csv(filepath, target_column=target_column, has_header=False)
        texts[:] = [clean_str(' '.join(text)) for text in texts]
    return texts, labels
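

if __name__ == '__main__':
    # Minimal usage sketch. 'data/ag_news.csv' is a hypothetical path used only for
    # illustration; point it at any CSV of (label, text...) rows, or at a JSON file
    # with 'texts'/'labels' keys, as expected by load_dataset_csv above.
    x, y = load_dataset_csv('data/ag_news.csv')
    print('x shape: {}, y shape: {}'.format(x.shape, y.shape))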