Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prado #8

Merged
merged 4 commits into from
Feb 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bunruija/binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ def load_data(self, data_path):
with open(data_path) as f:
reader = csv.reader(f)
for row in reader:
if len(row) < 2:
continue
if len(row[0]) == 0 or len(row[1]) == 0:
continue
labels.append(row[0])
Expand Down
4 changes: 2 additions & 2 deletions bunruija/classifiers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import logging
from logging import getLogger
from pathlib import Path
import pickle

Expand Down Expand Up @@ -31,7 +31,7 @@
BUNRUIJA_REGISTRY['voting'] = VotingClassifier


logger = logging.getLogger(__name__)
logger = getLogger(__name__)


class ClassifierBuilder:
Expand Down
35 changes: 27 additions & 8 deletions bunruija/classifiers/classifier.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import logging
from logging import getLogger
import time

import numpy as np
Expand All @@ -12,7 +12,7 @@
from bunruija.feature_extraction.sequence import SequenceVectorizer


logger = logging.getLogger(__name__)
logger = getLogger(__name__)


class BaseClassifier(BaseEstimator, ClassifierMixin):
Expand Down Expand Up @@ -63,11 +63,13 @@ def __init__(self, **kwargs):
self.batch_size = kwargs.get('batch_size', 20)

self.optimizer_type = kwargs.get('optimizer', 'adam')
self.labels = set()

def init_layer(self, data):
pass

def convert_data(self, X, y=None):
logger.info('Loading data')
if len(X) == 2 and isinstance(X[1], list):
indices = X[0]
raw_words = X[1]
Expand All @@ -87,6 +89,7 @@ def convert_data(self, X, y=None):

if y is not None:
data_i['label'] = y[i]
self.labels.add(y[i])

if has_raw_words:
data_i['raw_words'] = raw_words[start: end]
Expand All @@ -103,10 +106,14 @@ def fit(self, X, y):

self.to(self.device)
self.train()
log_interval = 100

logger.info(f'{self}')
step = 0
loss_accum = 0
n_samples_accum = 0
for epoch in range(self.max_epochs):
loss_epoch = 0.
# loss_epoch = 0.

for batch in torch.utils.data.DataLoader(
data,
Expand All @@ -122,15 +129,27 @@ def fit(self, X, y):
logits = self(batch)
loss = F.nll_loss(
torch.log_softmax(logits, dim=1),
batch['labels']
batch['labels'],
reduction='sum',
)
loss_epoch += loss.item()
loss.backward()
# loss_epoch += loss.item()
loss_accum += loss.item()
n_samples_accum += len(batch['labels'])
(loss / len(batch['labels'])).backward()
optimizer.step()
step += 1
del loss

elapsed = time.perf_counter() - start_at
logger.info(f'epoch:{epoch+1} loss:{loss_epoch:.2f} elapsed:{elapsed:.2f}')
if step % log_interval == 0:
loss_accum /= n_samples_accum
elapsed = time.perf_counter() - start_at
logger.info(f'epoch:{epoch+1} step:{step} '
f'loss:{loss_accum:.2f} elapsed:{elapsed:.2f}')
loss_accum = 0
n_samples_accum = 0

# elapsed = time.perf_counter() - start_at
# logger.info(f'epoch:{epoch+1} loss:{loss_epoch:.2f} elapsed:{elapsed:.2f}')

def reset_module(self, **kwargs):
pass
Expand Down
4 changes: 1 addition & 3 deletions bunruija/classifiers/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,8 @@ def __init__(self, **kwargs):
)

def init_layer(self, data):
y = []
max_input_idx = 0
for data_i in data:
y.append(data_i['label'])
max_input_idx = max(max_input_idx, np.max(data_i['inputs']))

self.embed = torch.nn.Embedding(
Expand All @@ -53,7 +51,7 @@ def init_layer(self, data):
num_classes = np.unique(y)
self.out = torch.nn.Linear(
2 * self.dim_hid,
len(num_classes),
len(self.labels),
bias=True)

def __call__(self, batch):
Expand Down
15 changes: 4 additions & 11 deletions bunruija/classifiers/prado.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ def get_hash_codes(self, word):

class WeightMask:
def __init__(self, index):
self.index = torch.tensor(index)
self.index = index

def __call__(self, module, _):
mask = module.raw_weight.new_ones(module.raw_weight.size())
mask.index_fill_(2, self.index, 0.)
mask.index_fill_(2, self.index.to(mask.device), 0.)
module.weight = module.raw_weight * mask


Expand Down Expand Up @@ -137,21 +137,14 @@ def __init__(self, **kwargs):
self.batch_norm_attn = torch.nn.BatchNorm1d(self.dim_hid)

def init_layer(self, data):
y = []
max_input_idx = 0
for data_i in data:
y.append(data_i['label'])

self.pad = 0

num_classes = np.unique(y)
self.fc = torch.nn.Linear(
len(self.kernel_sizes) * self.dim_hid,
len(num_classes),
len(self.labels),
bias=True)

def word_string_distort(self, word):
if self.distort == 0:
if self.distort == 0 or len(word) == 0:
return word
else:
if random.random() < self.distort:
Expand Down
39 changes: 2 additions & 37 deletions bunruija/feature_extraction/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from logging import getLogger
import functools
import logging

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
Expand All @@ -13,7 +13,7 @@
BUNRUIJA_REGISTRY['sequence'] = SequenceVectorizer
BUNRUIJA_REGISTRY['tfidf'] = TfidfVectorizer

logger = logging.getLogger(__name__)
logger = getLogger(__name__)


# https://stackoverflow.com/questions/9336646/python-decorator-with-multiprocessing-fails
Expand All @@ -22,41 +22,6 @@ def register_vectorizer(vectorizer_name, vectorizer):
raise KeyError
BUNRUIJA_REGISTRY[vectorizer_name] = vectorizer

# def register_vectorizer(vectorizer_name):
# def f(vectorizer):
# class new_vectorizer(vectorizer):
# def fit(self, raw_documents):
# return super().fit(raw_documents)

# def transforme(self, raw_documents):
# return super().transform(raw_documents)

# new_vectorizer.__name__ = vectorizer.__name__
# BUNRUIJA_VECTORIZER_REGISTRY[vectorizer_name] = new_vectorizer
# return f

# def register_vectorizer(vectorizer_name):
# def f(vectorizer):
# print(vectorizer)
# BUNRUIJA_VECTORIZER_REGISTRY[vectorizer_name] = vectorizer
# return f

# class register_vectorizer:
# def __init__(self, vectorizer_name):
# self.vectorizer_name = vectorizer_name

# def __call__(self, vectorizer):
# class new_vectorizer(vectorizer):
# def fit(self, raw_documents):
# return super().fit(raw_documents)

# def transforme(self, raw_documents):
# return super().transform(raw_documents)

# new_vectorizer.__name__ = vectorizer.__name__
# BUNRUIJA_VECTORIZER_REGISTRY[self.vectorizer_name] = new_vectorizer


def build_vectorizer(config, tokenizer=None):
vectorizer_setting = config.get('preprocess', {}).get('vectorizer', {})

Expand Down
25 changes: 21 additions & 4 deletions bunruija/feature_extraction/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(
tokenizer=None,
max_features=None,
keep_raw_word=True,
only_raw_word=False,
dictionary=Dictionary(),
**kwargs):
super().__init__()
Expand All @@ -22,13 +23,16 @@ def __init__(
self.vocabulary_ = dictionary.index_to_element
self.max_features = max_features
self.keep_raw_word = keep_raw_word
self.only_raw_word = only_raw_word

def __repr__(self):
args = []
if self.tokenizer:
args.append(f'tokenizer={self.tokenizer}')
if self.max_features:
args.append(f'max_features={self.max_features}')
args.append(f'keep_raw_word={self.keep_raw_word}')
args.append(f'only_raw_word={self.only_raw_word}')
out = f'{self.__class__.__name__}({", ".join(args)})'
return out

Expand All @@ -50,9 +54,13 @@ def get_params(self, deep=True):
'max_features': self.max_features,
'dictionary': self.dictionary,
'keep_raw_word': self.keep_raw_word,
'only_raw_word': self.only_raw_word,
}

def fit(self, raw_documents, y=None):
if self.only_raw_word:
return self

tokenizer = self.build_tokenizer()

for row_id, document in enumerate(raw_documents):
Expand Down Expand Up @@ -96,13 +104,22 @@ def transform(self, raw_documents):
max_col = max(max_col, len(elements))

for i, element in enumerate(elements):
if element in self.dictionary:
if self.keep_raw_word:
raw_words.append(element)
index = self.dictionary.get_index(element)
if self.only_raw_word:
raw_words.append(element)
index = 1
data.append(index)
row.append(row_id)
col.append(i)
else:
if element in self.dictionary:
if self.keep_raw_word:
raw_words.append(element)

index = self.dictionary.get_index(element)

data.append(index)
row.append(row_id)
col.append(i)

data = np.array(data)
row = np.array(row)
Expand Down
2 changes: 2 additions & 0 deletions bunruija/tokenizers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from ..registry import BUNRUIJA_REGISTRY
from .tokenizer import BaseTokenizer
from .mecab_tokenizer import MeCabTokenizer
from .space_tokenizer import SpaceTokenizer

from transformers import AutoTokenizer


BUNRUIJA_REGISTRY['mecab'] = MeCabTokenizer
BUNRUIJA_REGISTRY['space'] = SpaceTokenizer
BUNRUIJA_REGISTRY['auto'] = AutoTokenizer


Expand Down
14 changes: 14 additions & 0 deletions bunruija/tokenizers/space_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from bunruija.tokenizers import BaseTokenizer


class SpaceTokenizer(BaseTokenizer):
    """Tokenizer that splits input text on single-space characters."""

    def __init__(self, **kwargs):
        # Extra keyword arguments are accepted for registry/config
        # compatibility but are not used by this tokenizer.
        super().__init__(name='space')

    def __call__(self, text):
        # Split strictly on ' ': consecutive spaces therefore produce
        # empty tokens, exactly matching str.split(' ') semantics.
        return text.split(' ')

    def __repr__(self):
        return f'{self.__class__.__name__}()'
4 changes: 0 additions & 4 deletions bunruija/trainer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import logging
import os
import pickle
from pathlib import Path
Expand All @@ -10,9 +9,6 @@
import bunruija


logger = logging.getLogger(__name__)


class Trainer:
def __init__(self, config_file):
with open(config_file) as f:
Expand Down
12 changes: 8 additions & 4 deletions bunruija_cli/train.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import logging
from logging import (
basicConfig,
getLogger,
INFO
)
import sys

from bunruija import options
from bunruija import Trainer


logging.basicConfig(
basicConfig(
format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO,
level=INFO,
stream=sys.stdout,
)
logger = logging.getLogger('bunruija_cli.train')
logger = getLogger('bunruija_cli.train')


def main(args):
Expand Down
2 changes: 1 addition & 1 deletion example/livedoor_corpus/settings/prado.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ classifier:
dim_hid: 64
optimizer: adamw
lr: 0.001
max_epochs: 300
max_epochs: 3
weight_decay: 0.01
Loading