diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
new file mode 100644
index 0000000..c4769d4
--- /dev/null
+++ b/.github/workflows/unittest.yaml
@@ -0,0 +1,27 @@
+name: Bunruija test
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.7, 3.8]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install nose
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python setup.py develop
+    - name: Test with nosetests
+      run: |
+        nosetests
diff --git a/bunruija/classifiers/__init__.py b/bunruija/classifiers/__init__.py
index 1485d8d..02230e9 100644
--- a/bunruija/classifiers/__init__.py
+++ b/bunruija/classifiers/__init__.py
@@ -3,7 +3,10 @@ import pickle
 
 import lightgbm
 
-from sklearn.svm import SVC
+from sklearn.svm import (
+    LinearSVC,
+    SVC
+)
 from sklearn.ensemble import (
     RandomForestClassifier,
     StackingClassifier,
@@ -17,17 +20,20 @@
 from .classifier import NeuralBaseClassifier
 from .lstm import LSTMClassifier
 from .prado import PRADO
+from .qrnn import QRNN
 from .transformer import TransformerClassifier
 from . import util
 
 
-BUNRUIJA_REGISTRY['prado'] = PRADO
-BUNRUIJA_REGISTRY['svm'] = SVC
-BUNRUIJA_REGISTRY['rf'] = RandomForestClassifier
 BUNRUIJA_REGISTRY['lgb'] = lightgbm.LGBMClassifier
+BUNRUIJA_REGISTRY['linear_svm'] = LinearSVC
 BUNRUIJA_REGISTRY['lr'] = LogisticRegression
 BUNRUIJA_REGISTRY['lstm'] = LSTMClassifier
-BUNRUIJA_REGISTRY['pipeline'] = Pipeline
+# BUNRUIJA_REGISTRY['pipeline'] = Pipeline
+BUNRUIJA_REGISTRY['prado'] = PRADO
+BUNRUIJA_REGISTRY['qrnn'] = QRNN
+BUNRUIJA_REGISTRY['rf'] = RandomForestClassifier
+BUNRUIJA_REGISTRY['svm'] = SVC
 BUNRUIJA_REGISTRY['stacking'] = StackingClassifier
 BUNRUIJA_REGISTRY['transformer'] = TransformerClassifier
 BUNRUIJA_REGISTRY['voting'] = VotingClassifier
@@ -51,7 +57,9 @@ def build_estimator(self, estimator_data):
         if isinstance(estimator_data, list):
             estimators = [self.build_estimator(s) for s in estimator_data]
             estimator_type = 'pipeline'
-            estimator = BUNRUIJA_REGISTRY[estimator_type](estimators)
+            memory = Path(self.config.get('bin_dir', '.')) / 'cache'
+#            estimator = BUNRUIJA_REGISTRY[estimator_type](estimators)
+            estimator = Pipeline(estimators, memory=str(memory))
         else:
             estimator_type = estimator_data['type']
             estimator_args = estimator_data.get('args', {})
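Note on the pipeline change above: scikit-learn's Pipeline accepts a memory argument (a cache directory or joblib.Memory object) and uses it to cache the fitted transformer steps, so repeated fits with identical inputs and parameters reuse cached results instead of refitting. A minimal sketch of that behaviour, independent of bunruija's registry; the TF-IDF/LinearSVC pairing here is only illustrative, not the call the patch makes:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Fitted transformer steps (the vectorizer) are cached under ./cache;
# only transformers are cached, the final estimator is always refit.
pipe = Pipeline(
    [('tfidf', TfidfVectorizer()), ('clf', LinearSVC())],
    memory='cache',
)
pipe.fit(['a text sample', 'another sample'], [0, 1])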
diff --git a/bunruija/classifiers/qrnn/__init__.py b/bunruija/classifiers/qrnn/__init__.py
new file mode 100644
index 0000000..651c825
--- /dev/null
+++ b/bunruija/classifiers/qrnn/__init__.py
@@ -0,0 +1 @@
+from .model import QRNN
diff --git a/bunruija/classifiers/qrnn/model.py b/bunruija/classifiers/qrnn/model.py
new file mode 100644
index 0000000..2788cbd
--- /dev/null
+++ b/bunruija/classifiers/qrnn/model.py
@@ -0,0 +1,126 @@
+import numpy as np
+import torch
+
+from bunruija.classifiers.classifier import NeuralBaseClassifier
+
+
+
+class QRNNLayer(torch.nn.Module):
+    def __init__(self, input_size, output_size, window_size=2, bidirectional=True):
+        super().__init__()
+
+        self.num_gates = 3
+        self.window_size = window_size
+        self.input_size = input_size
+        self.output_size = output_size
+        self.bidirectional = bidirectional
+
+        if self.bidirectional:
+            self.fc = torch.nn.Linear(
+                self.window_size * input_size,
+                2 * output_size * self.num_gates)
+        else:
+            self.fc = torch.nn.Linear(
+                self.window_size * input_size,
+                output_size * self.num_gates)
+
+    def forward(self, x):
+        bsz = x.size(0)
+        seq_len = x.size(1)
+        window_tokens = [x]
+        for i in range(self.window_size - 1):
+            prev_x = x[:, :-(i + 1), :]
+            prev_x = torch.cat(
+                [prev_x.new_zeros(bsz, i + 1, self.input_size), prev_x],
+                dim=1)
+            window_tokens.insert(0, prev_x)
+        x = torch.stack(window_tokens, dim=2)
+        x = x.view(bsz, seq_len, -1)
+        x = self.fc(x)
+        z, f, o = x.chunk(self.num_gates, dim=2)
+
+        z = torch.tanh(z)
+        f = torch.sigmoid(f)
+        seq_len = z.size(1)
+
+        c = torch.zeros_like(z)
+
+        if self.bidirectional:
+            c = c.view(bsz, seq_len, 2, self.output_size)
+            f = f.view(bsz, seq_len, 2, self.output_size)
+            z = z.view(bsz, seq_len, 2, self.output_size)
+            for t in range(seq_len):
+                if t == 0:
+                    c[:, t, 0] = (1 - f[:, t, 0]) * z[:, t, 0]
+                else:
+                    c[:, t, 0] = f[:, t, 0] * c[:, t - 1, 0].clone() + (1 - f[:, t, 0]) * z[:, t, 0]
+            for t in range(seq_len - 1, -1, -1):
+                if t == seq_len - 1:
+                    c[:, t, 1] = (1 - f[:, t, 1]) * z[:, t, 1]
+                else:
+                    c[:, t, 1] = f[:, t, 1] * c[:, t + 1, 1].clone() + (1 - f[:, t, 1]) * z[:, t, 1]
+            c = c.view(bsz, seq_len, 2 * self.output_size)
+        else:
+            for t in range(seq_len):
+                if t == 0:
+                    c[:, t] = (1 - f[:, t]) * z[:, t]
+                else:
+                    c[:, t] = f[:, t] * c[:, t - 1].clone() + (1 - f[:, t]) * z[:, t]
+
+        h = torch.sigmoid(o) * c
+        return h
+
+
+class QRNN(NeuralBaseClassifier):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dim_emb = kwargs.get('dim_emb', 256)
+        self.dim_hid = kwargs.get('dim_hid', 128)
+        self.window_size = kwargs.get('window_size', 3)
+
+        self.layers = torch.nn.ModuleList()
+        self.bidirectional = kwargs.get('bidirectional', True)
+        num_layers = kwargs.get('num_layers', 2)
+        for i in range(num_layers):
+            if i == 0:
+                input_size = self.dim_emb
+            else:
+                input_size = 2 * self.dim_hid if self.bidirectional else self.dim_hid
+
+            self.layers.append(QRNNLayer(
+                input_size,
+                self.dim_hid,
+                window_size=self.window_size,
+                bidirectional=self.bidirectional))
+
+    def init_layer(self, data):
+        self.pad = 0
+        max_input_idx = 0
+        for data_i in data:
+            max_input_idx = max(max_input_idx, np.max(data_i['inputs']))
+
+        self.embed = torch.nn.Embedding(
+            max_input_idx + 1,
+            self.dim_emb,
+            padding_idx=0,
+        )
+
+        self.out = torch.nn.Linear(
+            2 * self.dim_hid if self.bidirectional else self.dim_hid,
+            len(self.labels),
+            bias=True)
+
+    def __call__(self, batch):
+        src_tokens = batch['inputs']
+        lengths = (src_tokens != self.pad).sum(dim=1)
+
+        x = self.embed(src_tokens)
+        for layer in self.layers:
+            x = layer(x)
+        x = torch.nn.functional.adaptive_max_pool2d(
+            x,
+            (1, 2 * self.dim_hid if self.bidirectional else self.dim_hid))
+        x = x.squeeze(1)
+        x = self.out(x)
+        return x
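For context on the per-timestep loops in QRNNLayer.forward above: they unroll the "fo-pooling" recurrence of the Quasi-Recurrent Neural Network (Bradbury et al.), c_t = f_t * c_{t-1} + (1 - f_t) * z_t followed by h_t = sigmoid(o_t) * c_t. A minimal single-direction sketch of that recurrence, using list accumulation instead of the in-place slice writes in the patch; gate preprocessing and shapes are assumed to match the code above:

import torch

def fo_pool(z, f, o):
    # z: candidate activations after tanh, f: forget gates after sigmoid,
    # o: output gate pre-activations; all shaped (batch, seq_len, hidden)
    c_t = torch.zeros_like(z[:, 0])
    states = []
    for t in range(z.size(1)):
        # c_t = f_t * c_{t-1} + (1 - f_t) * z_t
        c_t = f[:, t] * c_t + (1 - f[:, t]) * z[:, t]
        states.append(c_t)
    c = torch.stack(states, dim=1)
    return torch.sigmoid(o) * c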
diff --git a/bunruija/feature_extraction/sequence.py b/bunruija/feature_extraction/sequence.py
index 9328867..87134d5 100644
--- a/bunruija/feature_extraction/sequence.py
+++ b/bunruija/feature_extraction/sequence.py
@@ -89,7 +89,10 @@ def transform(self, raw_documents):
         tokenizer = self.build_tokenizer()
 
         for row_id, document in enumerate(raw_documents):
-            elements = tokenizer(document)
+            if isinstance(tokenizer, transformers.PreTrainedTokenizerBase):
+                elements = tokenizer(document, truncation=True)
+            else:
+                elements = tokenizer(document)
 
             if isinstance(elements, transformers.tokenization_utils_base.BatchEncoding):
                 input_ids = elements['input_ids']
diff --git a/example/livedoor_corpus/settings/prado.yaml b/example/livedoor_corpus/settings/prado.yaml
index c58fa0e..253c784 100644
--- a/example/livedoor_corpus/settings/prado.yaml
+++ b/example/livedoor_corpus/settings/prado.yaml
@@ -19,11 +19,11 @@ classifier:
     args:
       make_fast: true
      batch_size: 10
-      n_features: 32
+      n_features: 512
       dim_emb: 64
       dim_hid: 64
       optimizer: adamw
       lr: 0.001
       max_epochs: 3
       weight_decay: 0.01
-      log_interval: 1
+      log_interval: 100
diff --git a/example/livedoor_corpus/settings/qrnn.yaml b/example/livedoor_corpus/settings/qrnn.yaml
new file mode 100644
index 0000000..b813599
--- /dev/null
+++ b/example/livedoor_corpus/settings/qrnn.yaml
@@ -0,0 +1,28 @@
+preprocess:
+  data:
+    train: train.csv
+    dev: dev.csv
+    test: test.csv
+
+tokenizer:
+  type: mecab
+  args:
+    lemmatize: false
+
+bin_dir: models/qrnn-model
+
+classifier:
+  - type: sequence
+    args:
+      max_features: 10000
+  - type: qrnn
+    args:
+      batch_size: 2
+      n_features: 512
+      dim_emb: 64
+      dim_hid: 128
+      optimizer: adamw
+      lr: 0.001
+      max_epochs: 3
+      weight_decay: 0.01
+      log_interval: 100
diff --git a/example/yelp_polarity/settings/prado.yaml b/example/yelp_polarity/settings/prado.yaml
index 4cfceea..8e9b2d3 100644
--- a/example/yelp_polarity/settings/prado.yaml
+++ b/example/yelp_polarity/settings/prado.yaml
@@ -15,7 +15,7 @@ classifier:
   - type: prado
     args:
       device: cpu
-      batch_size: 16
+      batch_size: 32
       n_features: 512
       dim_emb: 64
       dim_hid: 64
diff --git a/example/yelp_polarity/settings/qrnn.yaml b/example/yelp_polarity/settings/qrnn.yaml
new file mode 100644
index 0000000..8e9b2d3
--- /dev/null
+++ b/example/yelp_polarity/settings/qrnn.yaml
@@ -0,0 +1,27 @@
+preprocess:
+  data:
+    train: train.csv
+    test: test.csv
+
+tokenizer:
+  type: space
+
+bin_dir: models/qrnn-model
+
+classifier:
+  - type: sequence
+    args:
+      only_raw_word: true
+  - type: qrnn
+    args:
+      device: cpu
+      batch_size: 32
+      n_features: 512
+      dim_emb: 64
+      dim_hid: 64
+      optimizer: adamw
+      lr: 0.001
+      max_epochs: 3
+      weight_decay: 0.01
+      save_every_step: 1000
+      log_interval: 1
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..13e9633
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+Cython==0.29.22
+numpy==1.19.5
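On the bunruija/feature_extraction/sequence.py change above: when the configured tokenizer is a Hugging Face PreTrainedTokenizerBase, passing truncation=True caps each document at the tokenizer's model_max_length, so over-long inputs no longer overflow the downstream model. A small illustration of that behaviour; the bert-base-uncased checkpoint is only an example, not something the patch prescribes:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
document = 'a very long document ' * 1000

# Without truncation the encoding can exceed the model's 512-token limit;
# with truncation=True it is cut down to model_max_length.
untruncated = tokenizer(document)
truncated = tokenizer(document, truncation=True)
print(len(untruncated['input_ids']), len(truncated['input_ids']))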