From 1df63a3035793cf451adb623c1e8277ce13d9954 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sun, 4 Apr 2021 20:10:51 +0900 Subject: [PATCH 01/11] Add QRNN-based classifier (#14) --- bunruija/classifiers/__init__.py | 2 + bunruija/classifiers/qrnn/__init__.py | 1 + bunruija/classifiers/qrnn/model.py | 150 +++++++++++++++++++++ example/livedoor_corpus/settings/qrnn.yaml | 28 ++++ example/yelp_polarity/settings/qrnn.yaml | 27 ++++ 5 files changed, 208 insertions(+) create mode 100644 bunruija/classifiers/qrnn/__init__.py create mode 100644 bunruija/classifiers/qrnn/model.py create mode 100644 example/livedoor_corpus/settings/qrnn.yaml create mode 100644 example/yelp_polarity/settings/qrnn.yaml diff --git a/bunruija/classifiers/__init__.py b/bunruija/classifiers/__init__.py index 1485d8d..0dc71e1 100644 --- a/bunruija/classifiers/__init__.py +++ b/bunruija/classifiers/__init__.py @@ -17,6 +17,7 @@ from .classifier import NeuralBaseClassifier from .lstm import LSTMClassifier from .prado import PRADO +from .qrnn import QRNN from .transformer import TransformerClassifier from . import util @@ -28,6 +29,7 @@ BUNRUIJA_REGISTRY['lr'] = LogisticRegression BUNRUIJA_REGISTRY['lstm'] = LSTMClassifier BUNRUIJA_REGISTRY['pipeline'] = Pipeline +BUNRUIJA_REGISTRY['qrnn'] = QRNN BUNRUIJA_REGISTRY['stacking'] = StackingClassifier BUNRUIJA_REGISTRY['transformer'] = TransformerClassifier BUNRUIJA_REGISTRY['voting'] = VotingClassifier diff --git a/bunruija/classifiers/qrnn/__init__.py b/bunruija/classifiers/qrnn/__init__.py new file mode 100644 index 0000000..651c825 --- /dev/null +++ b/bunruija/classifiers/qrnn/__init__.py @@ -0,0 +1 @@ +from .model import QRNN diff --git a/bunruija/classifiers/qrnn/model.py b/bunruija/classifiers/qrnn/model.py new file mode 100644 index 0000000..4ebd23f --- /dev/null +++ b/bunruija/classifiers/qrnn/model.py @@ -0,0 +1,150 @@ +import torch + +from bunruija.classifiers.classifier import NeuralBaseClassifier + + + +class QRNNLayer(torch.nn.Module): + def __init__(self, input_size, output_size, window_size=2, bidirectional=True): + super().__init__() + + self.num_gates = 3 + self.window_size = window_size + self.input_size = input_size + self.output_size = output_size + self.bidirectional = bidirectional + + if self.bidirectional: + self.fc = torch.nn.Linear( + self.window_size * input_size, + 2 * output_size * self.num_gates) + else: + self.fc = torch.nn.Linear( + self.window_size * input_size, + output_size * self.num_gates) + + def forward(self, x): +# x, batch_sizes, sorted_indices, unsorted_indices = x +# print('x', x.size()) +# seq_len = batch_sizes.size(0) +# print('seq_len', seq_len) + + bsz = x.size(0) + seq_len = x.size(1) + window_tokens = [x] + for i in range(self.window_size - 1): + prev_x = x[:, :-(i + 1), :] + prev_x = torch.cat( + [prev_x.new_zeros(bsz, i + 1, self.input_size), prev_x], + dim=1) + window_tokens.insert(0, prev_x) + x = torch.stack(window_tokens, dim=2) + x = x.view(bsz, seq_len, -1) + x = self.fc(x) + z, f, o = x.chunk(self.num_gates, dim=2) + + z = torch.tanh(z) + f = torch.sigmoid(f) +# print('z', z.size(), 'f', f.size()) +# exit() + seq_len = z.size(1) + + c = torch.zeros_like(z) +# print('C', c.size()) + + if self.bidirectional: + c = c.view(bsz, seq_len, 2, self.output_size) + f = f.view(bsz, seq_len, 2, self.output_size) + z = z.view(bsz, seq_len, 2, self.output_size) +# print('C', c.size()) + for t in range(seq_len): + if t == 0: + c[:, t, 0] = (1 - f[:, t, 0]) * z[:, t, 0] + else: + c[:, t, 0] = f[:, t, 0] * c[:, t - 1, 0].clone() + (1 - f[:, t, 0]) * z[:, t, 0] + for t in range(seq_len - 1, -1, -1): + if t == seq_len - 1: + c[:, t, 0] = (1 - f[:, t, 0]) * z[:, t, 0] + else: + c[:, t, 0] = f[:, t, 0] * c[:, t + 1, 0].clone() + (1 - f[:, t, 0]) * z[:, t, 0] + c = c.view(bsz, seq_len, 2 * self.output_size) +# print('C', c.size()) +# exit() +# pass + else: + for t in range(seq_len): + if t == 0: + c[:, t] = (1 - f[:, t]) * z[:, t] + else: + c[:, t] = f[:, t] * c[:, t -1].clone() + (1 - f[:, t]) * z[:, t] + + h = torch.sigmoid(o) * c +# output = PackedSequence(h, batch_sizes, sorted_indices, unsorted_indices) +# return output + return h + + +class QRNN(NeuralBaseClassifier): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.dim_emb = kwargs.get('dim_emb', 256) + self.dim_hid = kwargs.get('dim_hid', 128) + self.window_size = kwargs.get('window_size', 3) + + self.embed = torch.nn.Embedding(10000, self.dim_emb, padding_idx=0) + self.layers = torch.nn.ModuleList() + self.bidirectional = kwargs.get('bidirectional', True) + num_layers = kwargs.get('num_layers', 2) + for i in range(num_layers): + if i == 0: + input_size = self.dim_emb + else: + input_size = 2 * self.dim_hid if self.bidirectional else self.dim_hid + + self.layers.append(QRNNLayer( + input_size, + self.dim_hid, + window_size=self.window_size, + bidirectional=self.bidirectional)) + +# self.pooler = torch.nn.MaxPool2d( +# (2 * self.dim_hid if self.bidirectional else self.dim_hid), +# ) + + def init_layer(self, data): + self.pad = 0 + self.out = torch.nn.Linear( + 2 * self.dim_hid if self.bidirectional else self.dim_hid, + len(self.labels), + bias=True) + + def __call__(self, batch): + src_tokens = batch['inputs'] + lengths = (src_tokens != self.pad).sum(dim=1) + +# print(src_tokens) + x = self.embed(src_tokens) +# print('embed', x.size()) +# packed = torch.nn.utils.rnn.pack_padded_sequence( +# x, +# lengths, +# batch_first=True, +# enforce_sorted=False +# ) +# self.layers[0](packed) +# exit() + for layer in self.layers: + x = layer(x) +# print('x', x.size()) +# x = x[:, 0] +# x = self.pooler(x) + x = torch.nn.functional.adaptive_max_pool2d( + x, + (1, 2 * self.dim_hid if self.bidirectional else self.dim_hid)) + x = x.squeeze(1) +# print('x', x.size()) + x = self.out(x) +# print('x', x.size()) +# exit() + return x diff --git a/example/livedoor_corpus/settings/qrnn.yaml b/example/livedoor_corpus/settings/qrnn.yaml new file mode 100644 index 0000000..b813599 --- /dev/null +++ b/example/livedoor_corpus/settings/qrnn.yaml @@ -0,0 +1,28 @@ +preprocess: + data: + train: train.csv + dev: dev.csv + test: test.csv + +tokenizer: + type: mecab + args: + lemmatize: false + +bin_dir: models/qrnn-model + +classifier: + - type: sequence + args: + max_features: 10000 + - type: qrnn + args: + batch_size: 2 + n_features: 512 + dim_emb: 64 + dim_hid: 128 + optimizer: adamw + lr: 0.001 + max_epochs: 3 + weight_decay: 0.01 + log_interval: 100 diff --git a/example/yelp_polarity/settings/qrnn.yaml b/example/yelp_polarity/settings/qrnn.yaml new file mode 100644 index 0000000..8e9b2d3 --- /dev/null +++ b/example/yelp_polarity/settings/qrnn.yaml @@ -0,0 +1,27 @@ +preprocess: + data: + train: train.csv + test: test.csv + +tokenizer: + type: space + +bin_dir: models/prado-model + +classifier: + - type: sequence + args: + only_raw_word: true + - type: prado + args: + device: cpu + batch_size: 32 + n_features: 512 + dim_emb: 64 + dim_hid: 64 + optimizer: adamw + lr: 0.001 + max_epochs: 3 + weight_decay: 0.01 + save_every_step: 1000 + log_interval: 1 From 5597e26a0e2a6b86feb0f37314fe7be0f8d03fd6 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 15:54:22 +0900 Subject: [PATCH 02/11] Add PRAD (#14) --- bunruija/classifiers/__init__.py | 18 +++++--- bunruija/classifiers/qrnn/model.py | 46 +++++---------------- bunruija/feature_extraction/sequence.py | 5 ++- example/livedoor_corpus/settings/prado.yaml | 4 +- example/yelp_polarity/settings/prado.yaml | 2 +- 5 files changed, 30 insertions(+), 45 deletions(-) diff --git a/bunruija/classifiers/__init__.py b/bunruija/classifiers/__init__.py index 0dc71e1..02230e9 100644 --- a/bunruija/classifiers/__init__.py +++ b/bunruija/classifiers/__init__.py @@ -3,7 +3,10 @@ import pickle import lightgbm -from sklearn.svm import SVC +from sklearn.svm import ( + LinearSVC, + SVC +) from sklearn.ensemble import ( RandomForestClassifier, StackingClassifier, @@ -22,14 +25,15 @@ from . import util -BUNRUIJA_REGISTRY['prado'] = PRADO -BUNRUIJA_REGISTRY['svm'] = SVC -BUNRUIJA_REGISTRY['rf'] = RandomForestClassifier BUNRUIJA_REGISTRY['lgb'] = lightgbm.LGBMClassifier +BUNRUIJA_REGISTRY['linear_svm'] = LinearSVC BUNRUIJA_REGISTRY['lr'] = LogisticRegression BUNRUIJA_REGISTRY['lstm'] = LSTMClassifier -BUNRUIJA_REGISTRY['pipeline'] = Pipeline +# BUNRUIJA_REGISTRY['pipeline'] = Pipeline +BUNRUIJA_REGISTRY['prado'] = PRADO BUNRUIJA_REGISTRY['qrnn'] = QRNN +BUNRUIJA_REGISTRY['rf'] = RandomForestClassifier +BUNRUIJA_REGISTRY['svm'] = SVC BUNRUIJA_REGISTRY['stacking'] = StackingClassifier BUNRUIJA_REGISTRY['transformer'] = TransformerClassifier BUNRUIJA_REGISTRY['voting'] = VotingClassifier @@ -53,7 +57,9 @@ def build_estimator(self, estimator_data): if isinstance(estimator_data, list): estimators = [self.build_estimator(s) for s in estimator_data] estimator_type = 'pipeline' - estimator = BUNRUIJA_REGISTRY[estimator_type](estimators) + memory = Path(self.config.get('bin_dir', '.')) / 'cache' +# estimator = BUNRUIJA_REGISTRY[estimator_type](estimators) + estimator = Pipeline(estimators, memory=str(memory)) else: estimator_type = estimator_data['type'] estimator_args = estimator_data.get('args', {}) diff --git a/bunruija/classifiers/qrnn/model.py b/bunruija/classifiers/qrnn/model.py index 4ebd23f..2788cbd 100644 --- a/bunruija/classifiers/qrnn/model.py +++ b/bunruija/classifiers/qrnn/model.py @@ -1,3 +1,4 @@ +import numpy as np import torch from bunruija.classifiers.classifier import NeuralBaseClassifier @@ -24,11 +25,6 @@ def __init__(self, input_size, output_size, window_size=2, bidirectional=True): output_size * self.num_gates) def forward(self, x): -# x, batch_sizes, sorted_indices, unsorted_indices = x -# print('x', x.size()) -# seq_len = batch_sizes.size(0) -# print('seq_len', seq_len) - bsz = x.size(0) seq_len = x.size(1) window_tokens = [x] @@ -45,18 +41,14 @@ def forward(self, x): z = torch.tanh(z) f = torch.sigmoid(f) -# print('z', z.size(), 'f', f.size()) -# exit() seq_len = z.size(1) c = torch.zeros_like(z) -# print('C', c.size()) if self.bidirectional: c = c.view(bsz, seq_len, 2, self.output_size) f = f.view(bsz, seq_len, 2, self.output_size) z = z.view(bsz, seq_len, 2, self.output_size) -# print('C', c.size()) for t in range(seq_len): if t == 0: c[:, t, 0] = (1 - f[:, t, 0]) * z[:, t, 0] @@ -68,9 +60,6 @@ def forward(self, x): else: c[:, t, 0] = f[:, t, 0] * c[:, t + 1, 0].clone() + (1 - f[:, t, 0]) * z[:, t, 0] c = c.view(bsz, seq_len, 2 * self.output_size) -# print('C', c.size()) -# exit() -# pass else: for t in range(seq_len): if t == 0: @@ -79,8 +68,6 @@ def forward(self, x): c[:, t] = f[:, t] * c[:, t -1].clone() + (1 - f[:, t]) * z[:, t] h = torch.sigmoid(o) * c -# output = PackedSequence(h, batch_sizes, sorted_indices, unsorted_indices) -# return output return h @@ -92,7 +79,6 @@ def __init__(self, **kwargs): self.dim_hid = kwargs.get('dim_hid', 128) self.window_size = kwargs.get('window_size', 3) - self.embed = torch.nn.Embedding(10000, self.dim_emb, padding_idx=0) self.layers = torch.nn.ModuleList() self.bidirectional = kwargs.get('bidirectional', True) num_layers = kwargs.get('num_layers', 2) @@ -108,12 +94,18 @@ def __init__(self, **kwargs): window_size=self.window_size, bidirectional=self.bidirectional)) -# self.pooler = torch.nn.MaxPool2d( -# (2 * self.dim_hid if self.bidirectional else self.dim_hid), -# ) - def init_layer(self, data): self.pad = 0 + max_input_idx = 0 + for data_i in data: + max_input_idx = max(max_input_idx, np.max(data_i['inputs'])) + + self.embed = torch.nn.Embedding( + max_input_idx + 1, + self.dim_emb, + padding_idx=0, + ) + self.out = torch.nn.Linear( 2 * self.dim_hid if self.bidirectional else self.dim_hid, len(self.labels), @@ -123,28 +115,12 @@ def __call__(self, batch): src_tokens = batch['inputs'] lengths = (src_tokens != self.pad).sum(dim=1) -# print(src_tokens) x = self.embed(src_tokens) -# print('embed', x.size()) -# packed = torch.nn.utils.rnn.pack_padded_sequence( -# x, -# lengths, -# batch_first=True, -# enforce_sorted=False -# ) -# self.layers[0](packed) -# exit() for layer in self.layers: x = layer(x) -# print('x', x.size()) -# x = x[:, 0] -# x = self.pooler(x) x = torch.nn.functional.adaptive_max_pool2d( x, (1, 2 * self.dim_hid if self.bidirectional else self.dim_hid)) x = x.squeeze(1) -# print('x', x.size()) x = self.out(x) -# print('x', x.size()) -# exit() return x diff --git a/bunruija/feature_extraction/sequence.py b/bunruija/feature_extraction/sequence.py index 9328867..87134d5 100644 --- a/bunruija/feature_extraction/sequence.py +++ b/bunruija/feature_extraction/sequence.py @@ -89,7 +89,10 @@ def transform(self, raw_documents): tokenizer = self.build_tokenizer() for row_id, document in enumerate(raw_documents): - elements = tokenizer(document) + if isinstance(tokenizer, transformers.PreTrainedTokenizerBase): + elements = tokenizer(document, truncation=True) + else: + elements = tokenizer(document) if isinstance(elements, transformers.tokenization_utils_base.BatchEncoding): input_ids = elements['input_ids'] diff --git a/example/livedoor_corpus/settings/prado.yaml b/example/livedoor_corpus/settings/prado.yaml index c58fa0e..253c784 100644 --- a/example/livedoor_corpus/settings/prado.yaml +++ b/example/livedoor_corpus/settings/prado.yaml @@ -19,11 +19,11 @@ classifier: args: make_fast: true batch_size: 10 - n_features: 32 + n_features: 512 dim_emb: 64 dim_hid: 64 optimizer: adamw lr: 0.001 max_epochs: 3 weight_decay: 0.01 - log_interval: 1 + log_interval: 100 diff --git a/example/yelp_polarity/settings/prado.yaml b/example/yelp_polarity/settings/prado.yaml index 4cfceea..8e9b2d3 100644 --- a/example/yelp_polarity/settings/prado.yaml +++ b/example/yelp_polarity/settings/prado.yaml @@ -15,7 +15,7 @@ classifier: - type: prado args: device: cpu - batch_size: 16 + batch_size: 32 n_features: 512 dim_emb: 64 dim_hid: 64 From 00f4dc90ff2861fd740a92d169eb5d55650e8bed Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:12:03 +0900 Subject: [PATCH 03/11] Add a setting file for GitHub action --- .github/workflows/unittest.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .github/workflows/unittest.yaml diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml new file mode 100644 index 0000000..6d8bd1b --- /dev/null +++ b/.github/workflows/unittest.yaml @@ -0,0 +1,26 @@ +name: Bunruija test + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.5, 3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nosetests + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Test with nosetests + run: | + nosetests From 18d5e318d3df5673890f86ed6722866ed8393238 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:14:29 +0900 Subject: [PATCH 04/11] Fix --- .github/workflows/unittest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 6d8bd1b..f68216d 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -19,7 +19,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nosetests + pip install nose if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with nosetests run: | From 6d8f8c9351dc5e106bc85960a36e03beb82af4b9 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:22:15 +0900 Subject: [PATCH 05/11] Fix yaml format --- .github/workflows/unittest.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index f68216d..6777488 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -18,9 +18,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install nose - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m pip install --upgrade pip + pip install nose + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with nosetests run: | nosetests From 0a6a8c01d997b6d014080955bbe426d53b25fbb1 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:26:03 +0900 Subject: [PATCH 06/11] Add install --- .github/workflows/unittest.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index 6777488..e35b8ad 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -21,6 +21,7 @@ jobs: python -m pip install --upgrade pip pip install nose if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python setup.py install - name: Test with nosetests run: | nosetests From 5f14d5e9c36e61d8875c377ae04a695e6d67320c Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:28:01 +0900 Subject: [PATCH 07/11] Add requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..550c266 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +Cython==0.29.22 From c7a55b54697a93ffc614d3d9ec227c59102ff85a Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:36:48 +0900 Subject: [PATCH 08/11] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 550c266..13e9633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ Cython==0.29.22 +numpy==1.19.5 From 9705397af62eef44fac63eeb6dae8c39bb71d0a5 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:39:41 +0900 Subject: [PATCH 09/11] Unsupport 3.5 --- .github/workflows/unittest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index e35b8ad..b2fa907 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.5, 3.6, 3.7, 3.8] + python-version: [3.6, 3.7, 3.8] steps: - uses: actions/checkout@v2 From ffd026e5e21cdea1cad7ea9eaa1cd4456c04eb42 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:41:53 +0900 Subject: [PATCH 10/11] Unsupport 3.6 --- .github/workflows/unittest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index b2fa907..dabd7d0 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] steps: - uses: actions/checkout@v2 From 9a16277579c1f64960e4753603c8244c63fa0992 Mon Sep 17 00:00:00 2001 From: Takuya Makino Date: Sat, 14 Aug 2021 16:48:27 +0900 Subject: [PATCH 11/11] setup.py develop --- .github/workflows/unittest.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml index dabd7d0..c4769d4 100644 --- a/.github/workflows/unittest.yaml +++ b/.github/workflows/unittest.yaml @@ -21,7 +21,7 @@ jobs: python -m pip install --upgrade pip pip install nose if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - python setup.py install + python setup.py develop - name: Test with nosetests run: | nosetests