benchmark.py
"""
Daniel Roudnitsky
"""
from tokenizers import TokenizeAndVectorize
from sklearn.linear_model import LogisticRegression
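
# NOTE: tokenizers.TokenizeAndVectorize is a local module (not shown here). From the calls
# below it is assumed to expose no_tokenizer(), jieba(), and
# sentence_piece_model(model_prefix, vocab_size, model_type) methods, each returning a
# (tfidf_matrix, labels) pair.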


def train_and_evaluate_classifier(feature_matrix, labels, train_test_split=.9):
    """
    Split feature_matrix and labels into a training set and a testing set, train a logistic
    regression model on the training set, and evaluate it on the testing set.

    :param feature_matrix: Document-feature matrix (e.g. tf-idf), one row per example
    :param labels: Labels aligned with the rows of feature_matrix
    :param train_test_split: Fraction of examples to use for training; the rest are held out for testing
    :return: Accuracy on the testing set
    """
    split = int(train_test_split * len(labels))  # index at which to split
    training_X, training_y = feature_matrix[:split], labels[:split]
    testing_X, testing_y = feature_matrix[split:], labels[split:]
    lr_model = LogisticRegression(C=1e5).fit(training_X, training_y)  # C=1e5 -> very weak regularization
    return lr_model.score(testing_X, testing_y)


def evaluate_tokenizer(tokenizer, *args):
    """
    Get the average sentiment-analysis accuracy of a tokenizer over 5 runs.

    :param tokenizer: Tokenization method that returns a tf-idf matrix and labels
    :param args: Arguments passed through to the tokenization method
    :return: Average accuracy over the 5 runs using regularized logistic regression
    """
    accuracy = []
    for _ in range(5):
        X, y = tokenizer(*args)
        accuracy.append(train_and_evaluate_classifier(X, y))
    return sum(accuracy) / 5
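
# ChnSentiCorp_htl_ba_6000 appears to be the ChnSentiCorp hotel-review sentiment corpus
# (roughly 6000 reviews); the path below is specific to the author's machine.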
tokenizers = TokenizeAndVectorize('/home/droudy/Desktop/ChnSuperCorp/ChnSentiCorp_htl_ba_6000')
results = []
# raw documents
score = evaluate_tokenizer(tokenizers.no_tokenizer)
results.append("Raw documents: {}".format(score))
# jieba tokenizer, default params
score = evaluate_tokenizer(tokenizers.jieba)
results.append("Jieba: {}".format(score))
# all spm tokenizers with different vocab_sizes
spm_models = ['unigram', 'bpe', 'char', 'word']
vocab_sizes = [2000, 4000, 8000, 16000]
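# For each (model_type, vocab_size) pair, sentence_piece_model presumably trains a SentencePiece
# model (model prefix 'm') on the corpus and vectorizes with the resulting vocabulary; some
# combinations can fail to train, which is caught below.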
for model_type in spm_models:
    for vocab_size in vocab_sizes:
        try:
            score = evaluate_tokenizer(tokenizers.sentence_piece_model, 'm', vocab_size, model_type)
            results.append("{} vocab_size={} : {}".format(model_type, vocab_size, score))
        except ValueError:
            # SentencePiece training can fail for some model_type/vocab_size combinations
            results.append("{} vocab_size={} : ABORTED (CORE DUMPED)".format(model_type, vocab_size))

print(results)