Skip to content

Commit

Permalink
Merge pull request #37 from tma15/fix/refactor
Browse files Browse the repository at this point in the history
Fix/refactor
  • Loading branch information
tma15 authored Feb 3, 2024
2 parents 6cf27fc + 096e601 commit 5b2d507
Show file tree
Hide file tree
Showing 26 changed files with 312 additions and 106 deletions.
56 changes: 51 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ data:
dev: dev.csv
test: test.csv

bin_dir: models/svm-model
output_dir: models/svm-model

pipeline:
- type: sklearn.feature_extraction.text.TfidfVectorizer
Expand Down Expand Up @@ -55,7 +55,7 @@ data:
dev: dev.csv
test: test.csv

bin_dir: models/transformer-model
output_dir: models/transformer-model

pipeline:
- type: bunruija.feature_extraction.sequence.SequenceVectorizer
Expand All @@ -68,10 +68,15 @@ pipeline:
args:
device: cpu
pretrained_model_name_or_path: cl-tohoku/bert-base-japanese
optimizer: adamw
lr: 3e-5
optimizer:
type: torch.optim.AdamW
args:
lr: 3e-5
weight_decay: 0.01
betas:
- 0.9
- 0.999
max_epochs: 3
weight_decay: 0.01
```
## CLI
Expand All @@ -83,6 +88,47 @@ bunruija-train -y config.yaml
bunruija-evaluate -y config.yaml
```

## Config
### data
You can set data-related settings in `data`.

```yaml
data:
train: train.csv # training data
dev: dev.csv # development data
test: test.csv # test data
label_column: label
text_column: text
```

`train`, `dev`, and `test` take paths to local files.
Supported file types are `csv`, `json`, and `jsonl`.
`label_column` and `text_column` are the names of the fields that hold the label and the text.
With the default values (`label_column: label` and `text_column: text`), the data must be formatted as follows:

Format of `csv`:

```
label,text
label_name,sentence
```

Format of `json`:

```
[{"label": "label_name", "text": "sentence"}]
```

Format of `jsonl`:

```
{"label": "label_name", "text": "sentence"}
```

### pipeline
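You can define the pipeline of your model in `pipeline`.
Each entry has a `type` (the import path of the class to instantiate) and optional `args` (keyword arguments passed to that class).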


## Prediction using the trained classifier in Python code
```python
Expand Down
27 changes: 8 additions & 19 deletions bunruija/classifiers/classifier.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
from functools import partial
from logging import getLogger

import numpy as np
Expand Down Expand Up @@ -141,7 +142,6 @@ def fit(self, X, y):
self.init_layer(data)

optimizer = self.build_optimizer()
logger.info(f"{optimizer}")
start_at = time.perf_counter()

self.to(self.device)
Expand Down Expand Up @@ -211,24 +211,13 @@ def reset_module(self, **kwargs):
def classifier_args(self):
raise NotImplementedError

def build_optimizer(self):
lr = float(self.kwargs.get("lr", 0.001))
weight_decay = self.kwargs.get("weight_decay", 0.0)

if self.optimizer_type == "sgd":
optimizer = torch.optim.SGD(
self.parameters(), lr=lr, weight_decay=weight_decay
)
elif self.optimizer_type == "adam":
optimizer = torch.optim.Adam(
self.parameters(), lr=lr, weight_decay=weight_decay
)
elif self.optimizer_type == "adamw":
optimizer = torch.optim.AdamW(
self.parameters(), lr=lr, weight_decay=weight_decay
)
else:
raise ValueError(f"Unsupported optimizer: {self.optimizer_type}")
def build_optimizer(self) -> torch.optim.Optimizer:
# Build the optimizer for this model's parameters.
#
# The "optimizer" entry of self.kwargs is called with self.parameters(), so
# it must be a callable that accepts the parameters and returns an optimizer
# (presumably the functools.partial that the pipeline builder creates for
# torch.optim.Optimizer subclasses; confirm against the caller).
# When the entry is absent, torch.optim.AdamW is used with its defaults.
unlinked_optimizer = self.kwargs.get(
"optimizer",
partial(torch.optim.AdamW),
)
# Link the optimizer factory to the model parameters.
optimizer: torch.optim.Optimizer = unlinked_optimizer(self.parameters())
logger.info(f"Optimizer: {optimizer}")
return optimizer

def zero_grad(self):
Expand Down
4 changes: 1 addition & 3 deletions bunruija/classifiers/lstm.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
from typing import Optional

import numpy as np
import torch
Expand All @@ -8,15 +7,14 @@
from bunruija.classifiers.classifier import NeuralBaseClassifier
from bunruija.modules import StaticEmbedding


logger = logging.getLogger(__name__)


class LSTMClassifier(NeuralBaseClassifier):
def __init__(self, **kwargs):
super().__init__(**kwargs)

self.embedding_path: Optional[str] = kwargs.get("static_embedding_path", None)
self.embedding_path: str | None = kwargs.get("static_embedding_path", None)

self.dim_emb: int = kwargs.get("dim_emb", 256)
self.dim_hid: int = kwargs.get("dim_hid", 512)
Expand Down
4 changes: 1 addition & 3 deletions bunruija/classifiers/qrnn/model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from typing import Optional

import numpy as np
import torch

Expand All @@ -12,7 +10,7 @@ class QRNN(NeuralBaseClassifier):
def __init__(self, **kwargs):
super().__init__(**kwargs)

self.embedding_path: Optional[str] = kwargs.get("static_embedding_path", None)
self.embedding_path: str | None = kwargs.get("static_embedding_path", None)

self.dim_emb: int = kwargs.get("dim_emb", 256)
self.dim_hid: int = kwargs.get("dim_hid", 128)
Expand Down
40 changes: 28 additions & 12 deletions bunruija/data/dataset.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,33 @@
import csv
from pathlib import Path

from datasets import Dataset, load_dataset


def load_data(
data_path: str | Path,
label_column: str = "label",
text_column: str = "text",
) -> tuple[list[str], list[str]]:
if isinstance(data_path, str):
data_path = Path(data_path)

def load_data(data_path: str | Path) -> tuple[list[str], list[str]]:
labels: list[str] = []
texts: list[str] = []
with open(data_path) as f:
reader = csv.reader(f)
for row in reader:
if len(row) < 2:
continue
if len(row[0]) == 0 or len(row[1]) == 0:
continue
labels.append(row[0])
texts.append(row[1])
return labels, texts

if data_path.suffix in [".csv", ".json", ".jsonl"]:
suffix: str = data_path.suffix[1:]

# Because datasets does not support jsonl suffix, convert it to json
if suffix == "jsonl":
suffix = "json"

# When data_files is only a single data_path, data split is "train"
dataset: Dataset = load_dataset(suffix, data_files=str(data_path))["train"]

for sample in dataset:
labels.append(sample[label_column])
texts.append(sample[text_column])
return labels, texts

else:
raise ValueError(data_path.suffix)
28 changes: 21 additions & 7 deletions bunruija/dataclass.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,35 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List
from typing import Any

import ruamel.yaml # type: ignore


@dataclass
class PipelineUnit:
type: str
args: Dict[str, Any] = field(default_factory=dict)
args: dict[str, Any] = field(default_factory=dict)


# Data section of the config: dataset file locations and the names of the
# fields to read the label and text from.
@dataclass
class DataConfig:
# Dataset files; each one defaults to an empty Path() when not given.
train: Path = field(default_factory=Path)
dev: Path = field(default_factory=Path)
test: Path = field(default_factory=Path)
# Field names of the label and the text in each record.
label_column: str = "label"
text_column: str = "text"

def __post_init__(self):
# Coerce to Path so that plain strings (e.g. values passed in from the
# parsed YAML config) are accepted for the file fields.
self.train = Path(self.train)
self.dev = Path(self.dev)
self.test = Path(self.test)


@dataclass
class BunruijaConfig:
data: Dict[str, str]
pipeline: List[PipelineUnit]
bin_dir: Path
data: DataConfig
pipeline: list[PipelineUnit]
output_dir: Path

@classmethod
def from_yaml(cls, config_file):
Expand All @@ -24,7 +38,7 @@ def from_yaml(cls, config_file):
config = yaml.load(f)

return cls(
data=config["data"],
data=DataConfig(**config["data"]),
pipeline=[PipelineUnit(**unit) for unit in config["pipeline"]],
bin_dir=Path(config.get("bin_dir", ".")),
output_dir=Path(config.get("output_dir", "output")),
)
6 changes: 5 additions & 1 deletion bunruija/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ def __init__(self, args: Namespace):
self.predictor = Predictor(args.yaml)

def evaluate(self):
labels_test, X_test = load_data(self.config.data["test"])
labels_test, X_test = load_data(
self.config.data.test,
label_column=self.config.data.label_column,
text_column=self.config.data.text_column,
)
y_test: np.ndarray = self.predictor.label_encoder.transform(labels_test)
y_pred: np.ndarray = self.predictor(X_test)

Expand Down
17 changes: 9 additions & 8 deletions bunruija/feature_extraction/sequence.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable

import numpy as np
import transformers # type: ignore
Expand All @@ -12,8 +12,8 @@
class SequenceVectorizer(TransformerMixin):
def __init__(
self,
tokenizer: Optional[Callable[[str], List[str]]] = None,
max_features: Optional[int] = None,
tokenizer: Callable[[str], list[str]] | None = None,
max_features: int | None = None,
keep_raw_word: bool = True,
only_raw_word: bool = False,
dictionary: Dictionary = Dictionary(),
Expand Down Expand Up @@ -44,7 +44,7 @@ def __repr__(self) -> str:
out = f'{self.__class__.__name__}({", ".join(args)})'
return out

def build_tokenizer(self) -> Callable[[str], List[str]]:
def build_tokenizer(self) -> Callable[[str], list[str]]:
if self.tokenizer is not None:
return self.tokenizer

Expand All @@ -56,7 +56,7 @@ def set_params(self, **kwargs):
if hasattr(self, k):
setattr(self, k, v)

def get_params(self, deep=True) -> Dict[str, Any]:
def get_params(self, deep=True) -> dict[str, Any]:
return {
"tokenizer": self.tokenizer,
"max_features": self.max_features,
Expand All @@ -65,7 +65,7 @@ def get_params(self, deep=True) -> Dict[str, Any]:
"only_raw_word": self.only_raw_word,
}

def fit(self, raw_documents: List[str], y=None) -> "SequenceVectorizer":
def fit(self, raw_documents: list[str], y=None) -> "SequenceVectorizer":
if self.only_raw_word:
return self

Expand All @@ -89,8 +89,9 @@ def fit(self, raw_documents: List[str], y=None) -> "SequenceVectorizer":
return self

def transform(
self, raw_documents: List[str]
) -> Union[csr_matrix, Tuple[csr_matrix, List[str]]]:
self,
raw_documents: list[str],
) -> csr_matrix | tuple[csr_matrix, list[str]]:
data = []
raw_words = []
row = []
Expand Down
15 changes: 11 additions & 4 deletions bunruija/pipeline_builder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import importlib
from functools import partial
from logging import getLogger
from typing import List, Union

import torch
from sklearn.pipeline import Pipeline # type: ignore
from transformers import AutoTokenizer, PreTrainedTokenizer

Expand Down Expand Up @@ -85,19 +86,25 @@ def _update_arg_value(x):

def build_estimator(
self,
pipeline_units: Union[PipelineUnit, List[PipelineUnit]],
pipeline_units: PipelineUnit | list[PipelineUnit],
pipeline_idx="pipeline",
):
if isinstance(pipeline_units, list):
estimators = [self.build_estimator(u) for u in pipeline_units]
estimator_type = pipeline_idx
memory = self.config.bin_dir / "cache"
memory = self.config.output_dir / "cache"
estimator = Pipeline(estimators, memory=str(memory))
else:
self._maybe_update_arg(pipeline_units)
estimator_type = pipeline_units.type
cls = self._load_class(pipeline_units.type)
estimator = cls(**pipeline_units.args)

# parameters of a neural network are not given at this moment.
# so, partially create an optimizer
if issubclass(cls, torch.optim.Optimizer):
estimator = partial(cls, **pipeline_units.args)
else:
estimator = cls(**pipeline_units.args)

# Because Pipeline of scikit-learn requires the tuple of name and estimator,
# this functions returns them
Expand Down
9 changes: 4 additions & 5 deletions bunruija/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@
from pathlib import Path

import numpy as np
import ruamel.yaml # type: ignore
from sklearn.preprocessing import LabelEncoder # type: ignore

from . import BunruijaConfig


class Predictor:
"""Predicts labels"""

def __init__(self, config_file):
with open(config_file) as f:
yaml = ruamel.yaml.YAML()
config = yaml.load(f)
config = BunruijaConfig.from_yaml(config_file)
model_path: Path = config.output_dir / "model.bunruija"

model_path = Path(config.get("bin_dir", ".")) / "model.bunruija"
with open(model_path, "rb") as f:
model_data: dict = pickle.load(f)

Expand Down
Loading

0 comments on commit 5b2d507

Please sign in to comment.