Skip to content

Commit

Permalink
Add MLP model (#37)
Browse files Browse the repository at this point in the history
*  Integrate BERT into Hedwig (#29)

* Fix package imports

* Update README.md

* Fix bug due to TAR/AR attribute check

* Add BERT models

* Add BERT tokenizer

* Return logits from the model.py

* Remove unused classes in models/bert

* Return logits from the model.py (#12)

* Remove unused classes in models/bert (#13)

* Add initial main file

* Add args for BERT

* Add partial support for BERT

* Initialize training and optimization

* Draft the structure of Trainers for BERT

* Remove duplicate tokenizer

* Add utils

* Move optimization to utils

* Add more structure for trainer

* Refactor the trainer (#15)

* Refactor the trainer

* Add more edits

* Add support for our datasets

* Add evaluator

* Split data4bert module into multiple processors

* Refactor BERT tokenizer

* Integrate BERT into Castor framework (#17)

* Remove unused classes in models/bert

* Split data4bert module into multiple processors

* Refactor BERT tokenizer

* Add multilabel support in BertTrainer

* Add multilabel support in BertEvaluator

* Add get_test_samples method in dataset processors

* Fix args.py for BERT

* Add support for Reuters, IMDB datasets for BERT

* Revert "Integrate BERT into Castor framework (#17)"

This reverts commit e4244ec.

* Fix paths to datasets in dataset classes and args

* Add SST dataset

* Add hedwig-data instructions to README.md

* Fix KimCNN README

* Fix RegLSTM README

* Fix typos in README

* Remove trec_eval from README

* Add tensorboardX to requirements.txt

* Rename processors module to bert_processors

* Add method to print metrics after training

* Add model check-pointing and early stopping for BERT

* Add logos

* Update README.md

* Fix code comments in classification trainer

* Add support for AAPD, Sogou, AGNews and Yelp2014

* Fix bug that deleted saved models

* Update README for HAN

* Update README for XML-CNN

* Remove redundant TODOs from the READMEs

* Fix logo in README.md

* Update README for Char-CNN

* Fix all the READMEs

* Resolve conflict

* Fix Typos

* Re-Add SST2 Processor

* Add support for evaluating trained model

* Update args.py

* Resolve issues due to DataParallel wrapper on saved model

* Remove redundant Yelp processor

* Fix bug for safely creating the saving directory

* Change checkpoint paths to timestamps

* Remove unwanted string.strip() from tokenizer

* Create save path if it doesn't exist

* Decouple model checkpoints from code

* Remove model choice restrictions for BERT

* Remove model/distill driver

* Simplify checkpoint directory creation

* Add TREC relevance datasets

* Add relevance transfer trainer and evaluator

* Add re-ranking module

* Add ImbalancedDatasetSampler

* Add relevance transfer package

* Fix import in classification trainer

* Remove unwanted args from models/bert

* Fix bug where model wasn't in training mode every epoch

* Add Robust45 preprocessor for BERT

* Add support for BERT for relevance transfer

* Add hierarchical BERT model

* Remove tensorboardX logging

* Add hierarchical BERT for relevance transfer

* Add learning rate multiplier

* Add lr multiplier for relevance transfer

* Add MLP model
  • Loading branch information
achyudh authored and Ashutosh-Adhikari committed Sep 8, 2019
1 parent 255624b commit 7a1fce7
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 1 deletion.
40 changes: 40 additions & 0 deletions datasets/reuters.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import json
import os
import re
import sys
import csv

import numpy as np
import torch
from torchtext.data import NestedField, Field, TabularDataset
from torchtext.data.iterator import BucketIterator
from torchtext.vocab import Vectors

csv.field_size_limit(sys.maxsize)


def clean_string(string):
"""
Expand All @@ -22,6 +27,11 @@ def split_sents(string):
return string.strip().split('.')


def load_json(string):
    """
    Decode a JSON document and return its contents as a float32 NumPy array.

    :param string: JSON text (e.g. a serialized list of numbers)
    :return: ``numpy.ndarray`` with dtype ``float32`` built from the decoded value
    """
    return np.array(json.loads(string), dtype=np.float32)


def char_quantize(string, max_length=1000):
identity = np.identity(len(ReutersCharQuantized.ALPHABET))
quantized_string = np.array([identity[ReutersCharQuantized.ALPHABET[char]] for char in list(string.lower()) if char in ReutersCharQuantized.ALPHABET], dtype=np.float32)
Expand Down Expand Up @@ -100,6 +110,36 @@ def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, d
return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle, device=device)


class ReutersTFIDF(Reuters):
    """
    Reuters dataset variant read from the ``tfidf_*.tsv`` files.

    Each row is parsed as ``(label, text)``; the text column is decoded with
    ``load_json`` instead of being tokenized against a vocabulary.
    """
    VOCAB_SIZE = 30485
    # Non-sequential field without a vocabulary: raw values go through load_json
    # and are requested as torch.float when batched.
    TEXT_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=load_json, dtype=torch.float)

    @classmethod
    def splits(cls, path, train=os.path.join('Reuters', 'tfidf_train.tsv'),
               validation=os.path.join('Reuters', 'tfidf_dev.tsv'),
               test=os.path.join('Reuters', 'tfidf_test.tsv'), **kwargs):
        """
        Load the train, validation and test splits from TSV files.

        :param path: directory the split paths are relative to
        :param train: relative path of the training file
        :param validation: relative path of the validation file
        :param test: relative path of the test file
        :param kwargs: accepted but not forwarded to the parent ``splits``
        :return: whatever the parent ``splits`` returns for the three files
        """
        column_fields = [('label', cls.LABEL_FIELD), ('text', cls.TEXT_FIELD)]
        # super(Reuters, cls) resolves splits() on the class after Reuters in the MRO,
        # so any splits() defined on Reuters itself is bypassed.
        return super(Reuters, cls).splits(
            path, train=train, validation=validation, test=test,
            format='tsv', fields=column_fields
        )

    @classmethod
    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        Build bucketed iterators over the train, dev and test splits.

        :param path: directory containing train, test, dev files
        :param vectors_name: unused by this class; kept for a uniform ``iters`` signature
        :param vectors_cache: unused by this class
        :param batch_size: batch size
        :param shuffle: passed through to ``BucketIterator.splits``
        :param device: device passed through to ``BucketIterator.splits``
        :param vectors: unused by this class
        :param unk_init: unused by this class
        :return: the iterators produced by ``BucketIterator.splits``
        """
        train_set, dev_set, test_set = cls.splits(path)
        return BucketIterator.splits(
            (train_set, dev_set, test_set),
            batch_size=batch_size, repeat=False, shuffle=shuffle, device=device
        )


class ReutersHierarchical(Reuters):
NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
2 changes: 1 addition & 1 deletion models/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def get_args():
parser.add_argument('--no-cuda', action='store_false', dest='cuda')
parser.add_argument('--gpu', type=int, default=0)
parser.add_argument('--epochs', type=int, default=50)
parser.add_argument('--batch-size', type=int, default=1024)
parser.add_argument('--batch-size', type=int, default=32)
parser.add_argument('--lr', type=float, default=0.001)
parser.add_argument('--seed', type=int, default=3435)
parser.add_argument('--patience', type=int, default=5)
Expand Down
48 changes: 48 additions & 0 deletions models/mlp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# KimCNN

Implementation of *Convolutional Neural Networks for Sentence Classification* ([Kim, 2014](https://arxiv.org/abs/1408.5882)) with PyTorch and Torchtext.

## Quick Start

To run the model on the Reuters dataset, just run the following from the working directory:

```
python -m models.kim_cnn --mode static --dataset Reuters --batch-size 32 --lr 0.01 --epochs 30 --dropout 0.5 --seed 3435
```

The best model weights will be saved in

```
models/kim_cnn/saves/Reuters/best_model.pt
```

To test the model, you can use the following command.

```
python -m models.kim_cnn --dataset Reuters --mode static --batch-size 32 --trained-model models/kim_cnn/saves/Reuters/best_model.pt --seed 3435
```

## Model Types

- rand: All words are randomly initialized and then modified during training.
- static: A model with pre-trained vectors from [word2vec](https://code.google.com/archive/p/word2vec/).
All words, including the unknown ones that are initialized with zero, are kept static and only the other
parameters of the model are learned.
- non-static: Same as above but the pretrained vectors are fine-tuned for each task.
- multichannel: A model with two sets of word vectors. Each set of vectors is treated as a 'channel' and each
filter is applied to both channels, but gradients are back-propagated only through one of the channels. Hence the
model is able to fine-tune one set of vectors while keeping the other static. Both channels are initialized with
word2vec.

## Dataset

We evaluate the model on the following datasets:

- Reuters (ModApte)
- AAPD
- IMDB
- Yelp 2014

## Settings

Adam is used for training.
Empty file added models/mlp/__init__.py
Empty file.
141 changes: 141 additions & 0 deletions models/mlp/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import os
import random
from copy import deepcopy

import numpy as np
import torch.onnx

from common.evaluate import EvaluatorFactory
from common.train import TrainerFactory
from datasets.aapd import AAPD
from datasets.imdb import IMDB
from datasets.reuters import ReutersTFIDF
from datasets.yelp2014 import Yelp2014
from models.mlp.args import get_args
from models.mlp.model import MLP


def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_size, device, is_multilabel):
    """
    Score a model on one data split and print the resulting metrics.

    :param split_name: label printed in the report header (e.g. 'dev' or 'test')
    :param dataset_cls: dataset class handed to ``EvaluatorFactory.get_evaluator``
    :param model: model to evaluate
    :param embedding: embedding argument forwarded to the evaluator factory
    :param loader: iterator over the split
    :param batch_size: batch size forwarded to the evaluator factory
    :param device: device forwarded to the evaluator factory
    :param is_multilabel: value assigned to the evaluator's ``is_multilabel`` flag, if it has one
    """
    evaluator = EvaluatorFactory.get_evaluator(dataset_cls, model, embedding, loader, batch_size, device)

    # Only override flags the evaluator actually exposes.
    flag_overrides = (('is_multilabel', is_multilabel), ('ignore_lengths', True))
    for flag_name, flag_value in flag_overrides:
        if hasattr(evaluator, flag_name):
            setattr(evaluator, flag_name, flag_value)

    scores, metric_names = evaluator.get_scores()
    print('Evaluation metrics for', split_name)
    print(metric_names)
    print(scores)


if __name__ == '__main__':
    # Entry point: parse arguments, build the dataset iterators and the MLP model,
    # then either train it or load a previously trained model, and finally print
    # dev and test metrics.

    # Set default configuration in args.py
    args = get_args()

    # Set random seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True

    cuda_available = torch.cuda.is_available()
    if args.cuda and not cuda_available:
        # Without this fallback the script would later call model.cuda() /
        # storage.cuda() on a machine that has no CUDA device and crash.
        print('Warning: CUDA is not available, falling back to CPU')
        args.cuda = False

    if args.cuda:
        print('Note: You are using GPU for training')
        torch.cuda.set_device(args.gpu)
        torch.cuda.manual_seed(args.seed)
        args.gpu = torch.device('cuda:%d' % args.gpu)
    else:
        args.gpu = -1
        if cuda_available:
            print('Warning: Using CPU for training')

    def _load_checkpoint(checkpoint_path):
        # Load a serialized model, mapping its tensors onto the selected device.
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        if args.cuda:
            return torch.load(checkpoint_path, map_location=lambda storage, location: storage.cuda(args.gpu))
        return torch.load(checkpoint_path, map_location=lambda storage, location: storage)

    dataset_map = {
        'Reuters': ReutersTFIDF,
        'AAPD': AAPD,
        'IMDB': IMDB,
        'Yelp2014': Yelp2014
    }

    if args.dataset not in dataset_map:
        raise ValueError('Unrecognized dataset')

    dataset_class = dataset_map[args.dataset]
    # No word vectors are passed: vectors_name, vectors_cache and unk_init are all None.
    train_iter, dev_iter, test_iter = dataset_class.iters(args.data_dir, None, None,
                                                          batch_size=args.batch_size,
                                                          device=args.gpu,
                                                          unk_init=None)

    config = deepcopy(args)
    config.dataset = train_iter.dataset
    config.target_class = train_iter.dataset.NUM_CLASSES
    config.words_num = train_iter.dataset.VOCAB_SIZE

    print('Dataset:', args.dataset)
    print('No. of target classes:', train_iter.dataset.NUM_CLASSES)
    print('No. of train instances', len(train_iter.dataset))
    print('No. of dev instances', len(dev_iter.dataset))
    print('No. of test instances', len(test_iter.dataset))

    if args.resume_snapshot:
        model = _load_checkpoint(args.resume_snapshot)
    else:
        model = MLP(config)
        if args.cuda:
            model.cuda()

    if not args.trained_model:
        save_path = os.path.join(args.save_path, dataset_class.NAME)
        os.makedirs(save_path, exist_ok=True)

    # Optimize only the parameters that require gradients.
    parameter = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameter, lr=args.lr, weight_decay=args.weight_decay)

    train_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, train_iter, args.batch_size, args.gpu)
    test_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, test_iter, args.batch_size, args.gpu)
    dev_evaluator = EvaluatorFactory.get_evaluator(dataset_class, model, None, dev_iter, args.batch_size, args.gpu)

    if hasattr(train_evaluator, 'is_multilabel'):
        train_evaluator.is_multilabel = dataset_class.IS_MULTILABEL
    if hasattr(test_evaluator, 'is_multilabel'):
        test_evaluator.is_multilabel = dataset_class.IS_MULTILABEL
    if hasattr(dev_evaluator, 'is_multilabel'):
        dev_evaluator.is_multilabel = dataset_class.IS_MULTILABEL
    if hasattr(dev_evaluator, 'ignore_lengths'):
        dev_evaluator.ignore_lengths = True
    if hasattr(test_evaluator, 'ignore_lengths'):
        test_evaluator.ignore_lengths = True

    trainer_config = {
        'optimizer': optimizer,
        'batch_size': args.batch_size,
        'log_interval': args.log_every,
        'patience': args.patience,
        'model_outfile': args.save_path,
        'is_multilabel': dataset_class.IS_MULTILABEL,
        'ignore_lengths': True
    }

    trainer = TrainerFactory.get_trainer(args.dataset, model, None, train_iter, trainer_config, train_evaluator, test_evaluator, dev_evaluator)

    if not args.trained_model:
        trainer.train(args.epochs)
        # Reload the trainer's snapshot only after an actual training run.
        # Doing this unconditionally would replace the model loaded from
        # --trained-model (or fail if the snapshot file does not exist).
        # Presumably snapshot_path is the checkpoint written during training -- TODO confirm.
        if hasattr(trainer, 'snapshot_path'):
            model = torch.load(trainer.snapshot_path)
    else:
        model = _load_checkpoint(args.trained_model)

    # Calculate dev and test metrics
    evaluate_dataset('dev', dataset_class, model, None, dev_iter, args.batch_size,
                     is_multilabel=dataset_class.IS_MULTILABEL,
                     device=args.gpu)
    evaluate_dataset('test', dataset_class, model, None, test_iter, args.batch_size,
                     is_multilabel=dataset_class.IS_MULTILABEL,
                     device=args.gpu)
20 changes: 20 additions & 0 deletions models/mlp/args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os

import models.args


def get_args():
    """
    Parse the command-line arguments for the MLP model.

    Starts from the parser returned by ``models.args.get_args()`` and registers
    the MLP-specific options on top of it.

    :return: the parsed argument namespace
    """
    parser = models.args.get_args()

    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ('--dataset', dict(type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014'])),
        ('--embed-dim', dict(type=int, default=300)),
        ('--dropout', dict(type=float, default=0.5)),
        ('--epoch-decay', dict(type=int, default=15)),
        ('--weight-decay', dict(type=float, default=0)),
        ('--save-path', dict(type=str, default=os.path.join('model_checkpoints', 'mlp'))),
        ('--resume-snapshot', dict(type=str)),
        ('--trained-model', dict(type=str)),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
18 changes: 18 additions & 0 deletions models/mlp/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import torch
import torch.nn as nn


class MLP(nn.Module):
    """Single fully connected layer that maps a fixed-size feature vector to class logits."""

    def __init__(self, config):
        """
        :param config: object exposing ``dataset`` (with a ``VOCAB_SIZE`` attribute giving the
                       input feature size) and ``target_class`` (number of output logits)
        """
        super().__init__()
        dataset = config.dataset
        target_class = config.target_class
        # NOTE: no dropout layer is created, so config.dropout is not used by this model.
        self.fc1 = nn.Linear(dataset.VOCAB_SIZE, target_class)

    def forward(self, x, **kwargs):
        """
        :param x: input tensor whose first dimension is the batch and whose last
                  dimension has ``VOCAB_SIZE`` features
        :param kwargs: ignored; accepted so callers can pass extra keyword arguments
        :return: logits of shape (batch, target_size)
        """
        # Remove singleton dimensions, but never the batch dimension: a bare
        # torch.squeeze(x) would also drop dim 0 for a batch with one example and
        # return logits of shape (target_size,) instead of (1, target_size).
        # Iterate from the last dimension backwards so indices stay valid.
        for dim in range(x.dim() - 1, 0, -1):
            if x.size(dim) == 1:
                x = x.squeeze(dim)  # (batch, vocab_size)
        logit = self.fc1(x)  # (batch, target_size)
        return logit

0 comments on commit 7a1fce7

Please sign in to comment.