tma15 · tma15 · Aug 14, 2021 · Apr 4, 2021 · Aug 14, 2021 · Aug 14, 2021
diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
@@ -0,0 +1,27 @@
+name: Bunruija test
+
+on: [push]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.7, 3.8]
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install nose
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+          python setup.py develop
+      - name: Test with nosetests
+        run: |
+          nosetests
diff --git a/bunruija/classifiers/__init__.py b/bunruija/classifiers/__init__.py
@@ -3,7 +3,10 @@
 import pickle
 
 import lightgbm
-from sklearn.svm import SVC
+from sklearn.svm import (
+    LinearSVC,
+    SVC
+)
 from sklearn.ensemble import (
     RandomForestClassifier,
     StackingClassifier,
@@ -17,17 +20,20 @@
 from .classifier import NeuralBaseClassifier
 from .lstm import LSTMClassifier
 from .prado import PRADO
+from .qrnn import QRNN
 from .transformer import TransformerClassifier
 from . import util
 
 
-BUNRUIJA_REGISTRY['prado'] = PRADO
-BUNRUIJA_REGISTRY['svm'] = SVC
-BUNRUIJA_REGISTRY['rf'] = RandomForestClassifier
 BUNRUIJA_REGISTRY['lgb'] = lightgbm.LGBMClassifier
+BUNRUIJA_REGISTRY['linear_svm'] = LinearSVC
 BUNRUIJA_REGISTRY['lr'] = LogisticRegression
 BUNRUIJA_REGISTRY['lstm'] = LSTMClassifier
-BUNRUIJA_REGISTRY['pipeline'] = Pipeline
+# BUNRUIJA_REGISTRY['pipeline'] = Pipeline
+BUNRUIJA_REGISTRY['prado'] = PRADO
+BUNRUIJA_REGISTRY['qrnn'] = QRNN
+BUNRUIJA_REGISTRY['rf'] = RandomForestClassifier
+BUNRUIJA_REGISTRY['svm'] = SVC
 BUNRUIJA_REGISTRY['stacking'] = StackingClassifier
 BUNRUIJA_REGISTRY['transformer'] = TransformerClassifier
 BUNRUIJA_REGISTRY['voting'] = VotingClassifier
@@ -51,7 +57,9 @@ def build_estimator(self, estimator_data):
         if isinstance(estimator_data, list):
             estimators = [self.build_estimator(s) for s in estimator_data]
             estimator_type = 'pipeline'
-            estimator = BUNRUIJA_REGISTRY[estimator_type](estimators)
+            memory = Path(self.config.get('bin_dir', '.')) / 'cache'
+#             estimator = BUNRUIJA_REGISTRY[estimator_type](estimators)
+            estimator = Pipeline(estimators, memory=str(memory))
         else:
             estimator_type = estimator_data['type']
             estimator_args = estimator_data.get('args', {})

diff --git a/bunruija/classifiers/qrnn/__init__.py b/bunruija/classifiers/qrnn/__init__.py
@@ -0,0 +1 @@
+from .model import QRNN
diff --git a/bunruija/classifiers/qrnn/model.py b/bunruija/classifiers/qrnn/model.py
@@ -0,0 +1,126 @@
+import numpy as np
+import torch
+
+from bunruija.classifiers.classifier import NeuralBaseClassifier
+
+
+
+class QRNNLayer(torch.nn.Module):
+    def __init__(self, input_size, output_size, window_size=2, bidirectional=True):
+        super().__init__()
+
+        self.num_gates = 3
+        self.window_size = window_size
+        self.input_size = input_size
+        self.output_size = output_size
+        self.bidirectional = bidirectional
+
+        if self.bidirectional:
+            self.fc = torch.nn.Linear(
+                self.window_size * input_size,
+                2 * output_size * self.num_gates)
+        else:
+            self.fc = torch.nn.Linear(
+                self.window_size * input_size,
+                output_size * self.num_gates)
+
+    def forward(self, x):
+        bsz = x.size(0)
+        seq_len = x.size(1)
+        window_tokens = [x]
+        for i in range(self.window_size - 1):
+            prev_x = x[:, :-(i + 1), :]
+            prev_x = torch.cat(
+                [prev_x.new_zeros(bsz, i + 1, self.input_size), prev_x],
+                dim=1)
+            window_tokens.insert(0, prev_x)
+        x = torch.stack(window_tokens, dim=2)
+        x = x.view(bsz, seq_len, -1)
+        x = self.fc(x)
+        z, f, o = x.chunk(self.num_gates, dim=2)
+
+        z = torch.tanh(z)
+        f = torch.sigmoid(f)
+        seq_len = z.size(1)
+
+        c = torch.zeros_like(z)
+
+        if self.bidirectional:
+            c = c.view(bsz, seq_len, 2, self.output_size)
+            f = f.view(bsz, seq_len, 2, self.output_size)
+            z = z.view(bsz, seq_len, 2, self.output_size)
+            for t in range(seq_len):
+                if t == 0:
+                    c[:, t, 0] = (1 - f[:, t, 0]) * z[:, t, 0]
+                else:
+                    c[:, t, 0] = f[:, t, 0] * c[:, t - 1, 0].clone() + (1 - f[:, t, 0]) * z[:, t, 0]
+            for t in range(seq_len - 1, -1, -1):
+                if t == seq_len - 1:
+                    c[:, t, 0] = (1 - f[:, t, 0]) * z[:, t, 0]
+                else:
+                    c[:, t, 0] = f[:, t, 0] * c[:, t + 1, 0].clone() + (1 - f[:, t, 0]) * z[:, t, 0]
+            c = c.view(bsz, seq_len, 2 * self.output_size)
+        else:
+            for t in range(seq_len):
+                if t == 0:
+                    c[:, t] = (1 - f[:, t]) * z[:, t]
+                else:
+                    c[:, t] = f[:, t] * c[:, t -1].clone() + (1 - f[:, t]) * z[:, t]
+
+        h = torch.sigmoid(o) * c
+        return h
+
+
+class QRNN(NeuralBaseClassifier):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dim_emb = kwargs.get('dim_emb', 256)
+        self.dim_hid = kwargs.get('dim_hid', 128)
+        self.window_size = kwargs.get('window_size', 3)
+
+        self.layers = torch.nn.ModuleList()
+        self.bidirectional = kwargs.get('bidirectional', True)
+        num_layers = kwargs.get('num_layers', 2)
+        for i in range(num_layers):
+            if i == 0:
+                input_size = self.dim_emb
+            else:
+                input_size = 2 * self.dim_hid if self.bidirectional else self.dim_hid
+
+            self.layers.append(QRNNLayer(
+                input_size,
+                self.dim_hid,
+                window_size=self.window_size,
+                bidirectional=self.bidirectional))
+
+    def init_layer(self, data):
+        self.pad = 0
+        max_input_idx = 0
+        for data_i in data:
+            max_input_idx = max(max_input_idx, np.max(data_i['inputs']))
+
+        self.embed = torch.nn.Embedding(
+            max_input_idx + 1,
+            self.dim_emb,
+            padding_idx=0,
+        )
+
+        self.out = torch.nn.Linear(
+            2 * self.dim_hid if self.bidirectional else self.dim_hid,
+            len(self.labels),
+            bias=True)
+
+    def __call__(self, batch):
+        src_tokens = batch['inputs']
+        lengths = (src_tokens != self.pad).sum(dim=1)
+
+        x = self.embed(src_tokens)
+        for layer in self.layers:
+            x = layer(x)
+        x = torch.nn.functional.adaptive_max_pool2d(
+            x,
+            (1, 2 * self.dim_hid if self.bidirectional else self.dim_hid))
+        x = x.squeeze(1)
+        x = self.out(x)
+        return x
diff --git a/bunruija/feature_extraction/sequence.py b/bunruija/feature_extraction/sequence.py
@@ -89,7 +89,10 @@ def transform(self, raw_documents):
 
         tokenizer = self.build_tokenizer()
         for row_id, document in enumerate(raw_documents):
-            elements = tokenizer(document)
+            if isinstance(tokenizer, transformers.PreTrainedTokenizerBase):
+                elements = tokenizer(document, truncation=True)
+            else:
+                elements = tokenizer(document)
 
             if isinstance(elements, transformers.tokenization_utils_base.BatchEncoding):
                 input_ids = elements['input_ids']

diff --git a/example/livedoor_corpus/settings/prado.yaml b/example/livedoor_corpus/settings/prado.yaml
@@ -19,11 +19,11 @@ classifier:
     args:
       make_fast: true
       batch_size: 10
-      n_features: 32
+      n_features: 512
       dim_emb: 64
       dim_hid: 64
       optimizer: adamw
       lr: 0.001
       max_epochs: 3
       weight_decay: 0.01
-      log_interval: 1
+      log_interval: 100
diff --git a/example/livedoor_corpus/settings/qrnn.yaml b/example/livedoor_corpus/settings/qrnn.yaml
@@ -0,0 +1,28 @@
+preprocess:
+  data:
+    train: train.csv
+    dev: dev.csv
+    test: test.csv
+
+tokenizer:
+  type: mecab
+  args:
+    lemmatize: false
+
+bin_dir: models/qrnn-model
+
+classifier:
+  - type: sequence
+    args:
+      max_features: 10000
+  - type: qrnn
+    args:
+      batch_size: 2
+      n_features: 512
+      dim_emb: 64
+      dim_hid: 128
+      optimizer: adamw
+      lr: 0.001
+      max_epochs: 3
+      weight_decay: 0.01
+      log_interval: 100
diff --git a/example/yelp_polarity/settings/prado.yaml b/example/yelp_polarity/settings/prado.yaml
@@ -15,7 +15,7 @@ classifier:
   - type: prado
     args:
       device: cpu
-      batch_size: 16
+      batch_size: 32
       n_features: 512
       dim_emb: 64
       dim_hid: 64

diff --git a/example/yelp_polarity/settings/qrnn.yaml b/example/yelp_polarity/settings/qrnn.yaml
@@ -0,0 +1,27 @@
+preprocess:
+  data:
+    train: train.csv
+    test: test.csv
+
+tokenizer:
+  type: space
+
+bin_dir: models/prado-model
+
+classifier:
+  - type: sequence
+    args:
+      only_raw_word: true
+  - type: prado
+    args:
+      device: cpu
+      batch_size: 32
+      n_features: 512
+      dim_emb: 64
+      dim_hid: 64
+      optimizer: adamw
+      lr: 0.001
+      max_epochs: 3
+      weight_decay: 0.01
+      save_every_step: 1000
+      log_interval: 1
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+Cython==0.29.22
+numpy==1.19.5