Merge pull request #37 from tma15/fix/refactor

Fix/refactor
tma15 · Feb 3, 2024 · 5b2d507 · 5b2d507
2 parents 6cf27fc + 096e601
commit 5b2d507
Show file tree

Hide file tree

Showing 26 changed files with 312 additions and 106 deletions.
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ data:
   dev: dev.csv
   test: test.csv
 
-bin_dir: models/svm-model
+output_dir: models/svm-model
 
 pipeline:
   - type: sklearn.feature_extraction.text.TfidfVectorizer
@@ -55,7 +55,7 @@ data:
   dev: dev.csv
   test: test.csv
 
-bin_dir: models/transformer-model
+output_dir: models/transformer-model
 
 pipeline:
   - type: bunruija.feature_extraction.sequence.SequenceVectorizer
@@ -68,10 +68,15 @@ pipeline:
     args:
       device: cpu
       pretrained_model_name_or_path: cl-tohoku/bert-base-japanese
-      optimizer: adamw
-      lr: 3e-5
+      optimizer:
+        type: torch.optim.AdamW
+        args:
+          lr: 3e-5
+          weight_decay: 0.01
+          betas:
+            - 0.9
+            - 0.999
       max_epochs: 3
-      weight_decay: 0.01
 ```
 
 ## CLI
@@ -83,6 +88,47 @@ bunruija-train -y config.yaml
 bunruija-evaluate -y config.yaml
 ```
 
+## Config
+### data
+Data-related settings are configured under `data`.
+
+```yaml
+data:
+  train: train.csv  # training data
+  dev: dev.csv # development data
+  test: test.csv # test data
+  label_column: label
+  text_column: text
+```
+
+You can specify local files in `train`, `dev`, and `test`.
+Supported file types are `csv`, `json`, and `jsonl`.
+`label_column` and `text_column` are the field names of the label and the text.
+When `label_column` is `label` and `text_column` is `text` (the default values), the data must be formatted as follows:
+
+Format of `csv`:
+
+```
+label,text
+label_name,sentence
+…
+```
+
+Format of `json`:
+
+```
+[{"label": "label_name", "text": "sentence"}]
+```
+
+Format of `jsonl`:
+
+```
+{"label": "label_name", "text": "sentence"}
+```
+
+### pipeline
+You can configure the pipeline of your model in `pipeline`.
+
 
 ## Prediction using the trained classifier in Python code
 ```python

diff --git a/bunruija/classifiers/classifier.py b/bunruija/classifiers/classifier.py
@@ -1,4 +1,5 @@
 import time
+from functools import partial
 from logging import getLogger
 
 import numpy as np
@@ -141,7 +142,6 @@ def fit(self, X, y):
         self.init_layer(data)
 
         optimizer = self.build_optimizer()
-        logger.info(f"{optimizer}")
         start_at = time.perf_counter()
 
         self.to(self.device)
@@ -211,24 +211,13 @@ def reset_module(self, **kwargs):
     def classifier_args(self):
         raise NotImplementedError
 
-    def build_optimizer(self):
-        lr = float(self.kwargs.get("lr", 0.001))
-        weight_decay = self.kwargs.get("weight_decay", 0.0)
-
-        if self.optimizer_type == "sgd":
-            optimizer = torch.optim.SGD(
-                self.parameters(), lr=lr, weight_decay=weight_decay
-            )
-        elif self.optimizer_type == "adam":
-            optimizer = torch.optim.Adam(
-                self.parameters(), lr=lr, weight_decay=weight_decay
-            )
-        elif self.optimizer_type == "adamw":
-            optimizer = torch.optim.AdamW(
-                self.parameters(), lr=lr, weight_decay=weight_decay
-            )
-        else:
-            raise ValueError(f"Unsupported optimizer: {self.optimizer_type}")
+    def build_optimizer(self) -> torch.optim.Optimizer:
+        unlinked_optimizer = self.kwargs.get(
+            "optimizer",
+            partial(torch.optim.AdamW),
+        )
+        optimizer: torch.optim.Optimizer = unlinked_optimizer(self.parameters())
+        logger.info(f"Optimizer: {optimizer}")
         return optimizer
 
     def zero_grad(self):

diff --git a/bunruija/classifiers/lstm.py b/bunruija/classifiers/lstm.py
@@ -1,5 +1,4 @@
 import logging
-from typing import Optional
 
 import numpy as np
 import torch
@@ -8,15 +7,14 @@
 from bunruija.classifiers.classifier import NeuralBaseClassifier
 from bunruija.modules import StaticEmbedding
 
-
 logger = logging.getLogger(__name__)
 
 
 class LSTMClassifier(NeuralBaseClassifier):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.embedding_path: Optional[str] = kwargs.get("static_embedding_path", None)
+        self.embedding_path: str | None = kwargs.get("static_embedding_path", None)
 
         self.dim_emb: int = kwargs.get("dim_emb", 256)
         self.dim_hid: int = kwargs.get("dim_hid", 512)

diff --git a/bunruija/classifiers/qrnn/model.py b/bunruija/classifiers/qrnn/model.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 import numpy as np
 import torch
 
@@ -12,7 +10,7 @@ class QRNN(NeuralBaseClassifier):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
-        self.embedding_path: Optional[str] = kwargs.get("static_embedding_path", None)
+        self.embedding_path: str | None = kwargs.get("static_embedding_path", None)
 
         self.dim_emb: int = kwargs.get("dim_emb", 256)
         self.dim_hid: int = kwargs.get("dim_hid", 128)

diff --git a/bunruija/data/dataset.py b/bunruija/data/dataset.py
@@ -1,17 +1,33 @@
-import csv
 from pathlib import Path
 
+from datasets import Dataset, load_dataset
+
+
+def load_data(
+    data_path: str | Path,
+    label_column: str = "label",
+    text_column: str = "text",
+) -> tuple[list[str], list[str]]:
+    if isinstance(data_path, str):
+        data_path = Path(data_path)
 
-def load_data(data_path: str | Path) -> tuple[list[str], list[str]]:
     labels: list[str] = []
     texts: list[str] = []
-    with open(data_path) as f:
-        reader = csv.reader(f)
-        for row in reader:
-            if len(row) < 2:
-                continue
-            if len(row[0]) == 0 or len(row[1]) == 0:
-                continue
-            labels.append(row[0])
-            texts.append(row[1])
-    return labels, texts
+
+    if data_path.suffix in [".csv", ".json", ".jsonl"]:
+        suffix: str = data_path.suffix[1:]
+
+        # Because datasets does not support jsonl suffix, convert it to json
+        if suffix == "jsonl":
+            suffix = "json"
+
+        # When data_files is only a single data_path, data split is "train"
+        dataset: Dataset = load_dataset(suffix, data_files=str(data_path))["train"]
+
+        for sample in dataset:
+            labels.append(sample[label_column])
+            texts.append(sample[text_column])
+        return labels, texts
+
+    else:
+        raise ValueError(data_path.suffix)
diff --git a/bunruija/dataclass.py b/bunruija/dataclass.py
@@ -1,21 +1,35 @@
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any
 
 import ruamel.yaml  # type: ignore
 
 
 @dataclass
 class PipelineUnit:
     type: str
-    args: Dict[str, Any] = field(default_factory=dict)
+    args: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class DataConfig:
+    train: Path = field(default_factory=Path)
+    dev: Path = field(default_factory=Path)
+    test: Path = field(default_factory=Path)
+    label_column: str = "label"
+    text_column: str = "text"
+
+    def __post_init__(self):
+        self.train = Path(self.train)
+        self.dev = Path(self.dev)
+        self.test = Path(self.test)
 
 
 @dataclass
 class BunruijaConfig:
-    data: Dict[str, str]
-    pipeline: List[PipelineUnit]
-    bin_dir: Path
+    data: DataConfig
+    pipeline: list[PipelineUnit]
+    output_dir: Path
 
     @classmethod
     def from_yaml(cls, config_file):
@@ -24,7 +38,7 @@ def from_yaml(cls, config_file):
             config = yaml.load(f)
 
             return cls(
-                data=config["data"],
+                data=DataConfig(**config["data"]),
                 pipeline=[PipelineUnit(**unit) for unit in config["pipeline"]],
-                bin_dir=Path(config.get("bin_dir", ".")),
+                output_dir=Path(config.get("output_dir", "output")),
             )
diff --git a/bunruija/evaluator.py b/bunruija/evaluator.py
@@ -17,7 +17,11 @@ def __init__(self, args: Namespace):
         self.predictor = Predictor(args.yaml)
 
     def evaluate(self):
-        labels_test, X_test = load_data(self.config.data["test"])
+        labels_test, X_test = load_data(
+            self.config.data.test,
+            label_column=self.config.data.label_column,
+            text_column=self.config.data.text_column,
+        )
         y_test: np.ndarray = self.predictor.label_encoder.transform(labels_test)
         y_pred: np.ndarray = self.predictor(X_test)
 

diff --git a/bunruija/feature_extraction/sequence.py b/bunruija/feature_extraction/sequence.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable
 
 import numpy as np
 import transformers  # type: ignore
@@ -12,8 +12,8 @@
 class SequenceVectorizer(TransformerMixin):
     def __init__(
         self,
-        tokenizer: Optional[Callable[[str], List[str]]] = None,
-        max_features: Optional[int] = None,
+        tokenizer: Callable[[str], list[str]] | None = None,
+        max_features: int | None = None,
         keep_raw_word: bool = True,
         only_raw_word: bool = False,
         dictionary: Dictionary = Dictionary(),
@@ -44,7 +44,7 @@ def __repr__(self) -> str:
         out = f'{self.__class__.__name__}({", ".join(args)})'
         return out
 
-    def build_tokenizer(self) -> Callable[[str], List[str]]:
+    def build_tokenizer(self) -> Callable[[str], list[str]]:
         if self.tokenizer is not None:
             return self.tokenizer
 
@@ -56,7 +56,7 @@ def set_params(self, **kwargs):
             if hasattr(self, k):
                 setattr(self, k, v)
 
-    def get_params(self, deep=True) -> Dict[str, Any]:
+    def get_params(self, deep=True) -> dict[str, Any]:
         return {
             "tokenizer": self.tokenizer,
             "max_features": self.max_features,
@@ -65,7 +65,7 @@ def get_params(self, deep=True) -> Dict[str, Any]:
             "only_raw_word": self.only_raw_word,
         }
 
-    def fit(self, raw_documents: List[str], y=None) -> "SequenceVectorizer":
+    def fit(self, raw_documents: list[str], y=None) -> "SequenceVectorizer":
         if self.only_raw_word:
             return self
 
@@ -89,8 +89,9 @@ def fit(self, raw_documents: List[str], y=None) -> "SequenceVectorizer":
         return self
 
     def transform(
-        self, raw_documents: List[str]
-    ) -> Union[csr_matrix, Tuple[csr_matrix, List[str]]]:
+        self,
+        raw_documents: list[str],
+    ) -> csr_matrix | tuple[csr_matrix, list[str]]:
         data = []
         raw_words = []
         row = []

diff --git a/bunruija/pipeline_builder.py b/bunruija/pipeline_builder.py
@@ -1,7 +1,8 @@
 import importlib
+from functools import partial
 from logging import getLogger
-from typing import List, Union
 
+import torch
 from sklearn.pipeline import Pipeline  # type: ignore
 from transformers import AutoTokenizer, PreTrainedTokenizer
 
@@ -85,19 +86,25 @@ def _update_arg_value(x):
 
     def build_estimator(
         self,
-        pipeline_units: Union[PipelineUnit, List[PipelineUnit]],
+        pipeline_units: PipelineUnit | list[PipelineUnit],
         pipeline_idx="pipeline",
     ):
         if isinstance(pipeline_units, list):
             estimators = [self.build_estimator(u) for u in pipeline_units]
             estimator_type = pipeline_idx
-            memory = self.config.bin_dir / "cache"
+            memory = self.config.output_dir / "cache"
             estimator = Pipeline(estimators, memory=str(memory))
         else:
             self._maybe_update_arg(pipeline_units)
             estimator_type = pipeline_units.type
             cls = self._load_class(pipeline_units.type)
-            estimator = cls(**pipeline_units.args)
+
+            # parameters of a neural network are not given at this moment.
+            # so, partially create an optimizer
+            if issubclass(cls, torch.optim.Optimizer):
+                estimator = partial(cls, **pipeline_units.args)
+            else:
+                estimator = cls(**pipeline_units.args)
 
         # Because Pipeline of scikit-learn requires the tuple of name and estimator,
         # this functions returns them

diff --git a/bunruija/predictor.py b/bunruija/predictor.py
@@ -2,19 +2,18 @@
 from pathlib import Path
 
 import numpy as np
-import ruamel.yaml  # type: ignore
 from sklearn.preprocessing import LabelEncoder  # type: ignore
 
+from . import BunruijaConfig
+
 
 class Predictor:
     """Predicts labels"""
 
     def __init__(self, config_file):
-        with open(config_file) as f:
-            yaml = ruamel.yaml.YAML()
-            config = yaml.load(f)
+        config = BunruijaConfig.from_yaml(config_file)
+        model_path: Path = config.output_dir / "model.bunruija"
 
-        model_path = Path(config.get("bin_dir", ".")) / "model.bunruija"
         with open(model_path, "rb") as f:
             model_data: dict = pickle.load(f)