diff --git a/README.md b/README.md
index 27d2aee..3800572 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,10 @@
 This repo contains PyTorch deep learning models for document classification, implemented by the Data Systems Group at the University of Waterloo.
 
+# Modifications from Original at castorini/hedwig
+- added the 'models/' prefix to the packages listed in setup.py
+- added boto3 to requirements.txt
+
 ## Models
 
 + [DocBERT](models/bert/) : DocBERT: BERT for Document Classification [(Adhikari et al., 2019)](https://arxiv.org/abs/1904.08398v1)
diff --git a/common/evaluate.py b/common/evaluate.py
index a4ad6d8..0f4f3c5 100644
--- a/common/evaluate.py
+++ b/common/evaluate.py
@@ -11,6 +11,7 @@ class EvaluatorFactory(object):
         'AAPD': ClassificationEvaluator,
         'IMDB': ClassificationEvaluator,
         'Yelp2014': ClassificationEvaluator,
+        'Lyrics': ClassificationEvaluator,
         'Robust04': RelevanceTransferEvaluator,
         'Robust05': RelevanceTransferEvaluator,
         'Robust45': RelevanceTransferEvaluator
diff --git a/common/evaluators/bert_evaluator.py b/common/evaluators/bert_evaluator.py
index 88c4660..fab803a 100644
--- a/common/evaluators/bert_evaluator.py
+++ b/common/evaluators/bert_evaluator.py
@@ -86,11 +86,18 @@ def get_scores(self, silent=False):
             nb_eval_examples += input_ids.size(0)
             nb_eval_steps += 1
 
+        if self.args.is_multilabel:
+            score_method = 'micro'
+            pos_label = None
+        else:
+            score_method = 'binary'
+            pos_label = 1
+
         predicted_labels, target_labels = np.array(predicted_labels), np.array(target_labels)
         accuracy = metrics.accuracy_score(target_labels, predicted_labels)
-        precision = metrics.precision_score(target_labels, predicted_labels, average='micro')
-        recall = metrics.recall_score(target_labels, predicted_labels, average='micro')
-        f1 = metrics.f1_score(target_labels, predicted_labels, average='micro')
+        precision = metrics.precision_score(target_labels, predicted_labels, average=score_method, pos_label=pos_label)
+        recall = metrics.recall_score(target_labels, predicted_labels, average=score_method, pos_label=pos_label)
+        f1 = metrics.f1_score(target_labels, predicted_labels, average=score_method, pos_label=pos_label)
         avg_loss = total_loss / nb_eval_steps
 
         return [accuracy, precision, recall, f1, avg_loss], ['accuracy', 'precision', 'recall', 'f1', 'avg_loss']
diff --git a/common/train.py b/common/train.py
index b53c8f7..a3ed2be 100644
--- a/common/train.py
+++ b/common/train.py
@@ -11,6 +11,7 @@ class TrainerFactory(object):
         'AAPD': ClassificationTrainer,
         'IMDB': ClassificationTrainer,
         'Yelp2014': ClassificationTrainer,
+        'Lyrics': ClassificationTrainer,
         'Robust04': RelevanceTransferTrainer,
         'Robust05': RelevanceTransferTrainer,
         'Robust45': RelevanceTransferTrainer,
diff --git a/common/trainers/bert_trainer.py b/common/trainers/bert_trainer.py
index 359dc91..2951bdb 100644
--- a/common/trainers/bert_trainer.py
+++ b/common/trainers/bert_trainer.py
@@ -1,6 +1,11 @@
+# noinspection PyPackageRequirements
 import datetime
 import os
 
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
 import torch
 import torch.nn.functional as F
 from torch.utils.data import DataLoader, RandomSampler, TensorDataset
@@ -108,6 +113,8 @@ def train(self):
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.args.batch_size)
 
+        # results for graphing learning curves
+        results = []
         for epoch in trange(int(self.args.epochs), desc="Epoch"):
             self.train_epoch(train_dataloader)
             dev_evaluator = BertEvaluator(self.model, self.processor, self.args, split='dev')
@@ -118,6 +125,8 @@
             tqdm.write(self.log_template.format(epoch + 1, self.iterations, epoch + 1, self.args.epochs, dev_acc, dev_precision, dev_recall, dev_f1, dev_loss))
+            results.append([epoch + 1, dev_acc, dev_precision, dev_recall, dev_f1, dev_loss])
+
             # Update validation results
             if dev_f1 > self.best_dev_f1:
                 self.unimproved_iters = 0
@@ -130,3 +139,18 @@ def train(self):
                     self.early_stop = True
                     tqdm.write("Early Stopping. Epoch: {}, Best Dev F1: {}".format(epoch, self.best_dev_f1))
                     break
+
+        # create learning curves
+        results_frame = pd.DataFrame(data=np.array(results),
+                                     columns=['Epoch', 'Accuracy', 'Precision', 'Recall', 'F1', 'Loss']) \
+            .set_index('Epoch')
+
+
+        ax_acc = results_frame[['Accuracy', 'Precision', 'Recall', 'F1']].plot()
+        ax_loss = results_frame[['Loss']].plot()
+
+        ax_acc.get_figure().savefig('accuracy_curves.png')
+        ax_loss.get_figure().savefig('loss_curves.png')
+
+
+
diff --git a/common/trainers/classification_trainer.py b/common/trainers/classification_trainer.py
index 0de3d51..55b3ca5 100644
--- a/common/trainers/classification_trainer.py
+++ b/common/trainers/classification_trainer.py
@@ -103,6 +103,7 @@ def train(self, epochs):
                 torch.save(self.model, self.snapshot_path)
             else:
                 self.iters_not_improved += 1
+                torch.save(self.model, self.snapshot_path)
                 if self.iters_not_improved >= self.patience:
                     self.early_stop = True
                     print("Early Stopping. Epoch: {}, Best Dev F1: {}".format(epoch, self.best_dev_f1))
diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py
new file mode 100644
index 0000000..65d944d
--- /dev/null
+++ b/datasets/bert_processors/lyrics_processor.py
@@ -0,0 +1,41 @@
+import os
+
+from datasets.bert_processors.abstract_processor import BertProcessor, InputExample
+
+
+class LyricsProcessor(BertProcessor):
+    def __init__(self):
+        self.NAME = 'Lyrics'
+
+    def set_num_classes_(self, data_dir):
+        with open(os.path.join(data_dir, 'Lyrics', 'train.tsv'), 'r') as f:
+            l1 = f.readline().split('\t')
+
+        # from one-hot class vector
+        self.NUM_CLASSES = len(l1[0])
+        self.IS_MULTILABEL = self.NUM_CLASSES > 2
+
+    def get_train_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, 'Lyrics', 'train.tsv')), 'train')
+
+    def get_dev_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, 'Lyrics', 'dev.tsv')), 'dev')
+
+    def get_test_examples(self, data_dir):
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, 'Lyrics', 'test.tsv')), 'test')
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[1]
+            label = line[0]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
diff --git a/datasets/bert_processors/reuters_processor.py b/datasets/bert_processors/reuters_processor.py
index 8ad9b33..3711210 100644
--- a/datasets/bert_processors/reuters_processor.py
+++ b/datasets/bert_processors/reuters_processor.py
@@ -4,10 +4,17 @@
 
 class ReutersProcessor(BertProcessor):
-    NAME = 'Reuters'
-    NUM_CLASSES = 90
-    IS_MULTILABEL = True
-
+    def __init__(self):
+        self.NAME = 'Reuters'
+
+    def set_num_classes_(self, data_dir):
+        with open(os.path.join(data_dir, 'Reuters', 'train.tsv'), 'r') as f:
+            l1 = f.readline().split('\t')
+
+        # from one-hot class vector
+        self.NUM_CLASSES = len(l1[0])
+        self.IS_MULTILABEL = self.NUM_CLASSES > 2
+
     def get_train_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, 'Reuters', 'train.tsv')), 'train')
diff --git a/datasets/bert_processors/sst_processor.py b/datasets/bert_processors/sst_processor.py
index 01c6079..35231c9 100644
--- a/datasets/bert_processors/sst_processor.py
+++ b/datasets/bert_processors/sst_processor.py
@@ -4,9 +4,16 @@
 
 class SST2Processor(BertProcessor):
-    NAME = 'SST-2'
-    NUM_CLASSES = 2
-    IS_MULTILABEL = False
+    def __init__(self):
+        self.NAME = 'SST-2'
+
+    def set_num_classes_(self, data_dir):
+        with open(os.path.join(data_dir, 'SST-2', 'train.tsv'), 'r') as f:
+            l1 = f.readline().split('\t')
+
+        # from one-hot class vector
+        self.NUM_CLASSES = len(l1[0])
+        self.IS_MULTILABEL = self.NUM_CLASSES > 2
 
     def get_train_examples(self, data_dir):
         return self._create_examples(
diff --git a/datasets/lyrics.py b/datasets/lyrics.py
new file mode 100644
index 0000000..c17fc22
--- /dev/null
+++ b/datasets/lyrics.py
@@ -0,0 +1,105 @@
+import os
+import re
+
+import numpy as np
+import torch
+from torchtext.data import NestedField, Field, TabularDataset
+from torchtext.data.iterator import BucketIterator
+from torchtext.vocab import Vectors
+
+
+def clean_string(string):
+    """
+    Performs tokenization and string cleaning for the Lyrics dataset
+    """
+    string = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", string)
+    string = re.sub(r"\s{2,}", " ", string)
+    return string.lower().strip().split()
+
+
+def split_sents(string):
+    string = re.sub(r"[!?]"," ", string)
+    return string.strip().split('.')
+
+
+def char_quantize(string, max_length=1000):
+    identity = np.identity(len(LyricsCharQuantized.ALPHABET))
+    quantized_string = np.array([identity[LyricsCharQuantized.ALPHABET[char]] for char in list(string.lower()) if char in LyricsCharQuantized.ALPHABET], dtype=np.float32)
+    if len(quantized_string) > max_length:
+        return quantized_string[:max_length]
+    else:
+        return np.concatenate((quantized_string, np.zeros((max_length - len(quantized_string), len(LyricsCharQuantized.ALPHABET)), dtype=np.float32)))
+
+
+def process_labels(string):
+    """
+    Returns the label string as a list of floats
+    :param string:
+    :return:
+    """
+    return [float(x) for x in string]
+
+
+class Lyrics(TabularDataset):
+    NAME = 'Lyrics'
+    NUM_CLASSES = 2 #10
+    IS_MULTILABEL = False #True
+
+    TEXT_FIELD = Field(batch_first=True, tokenize=clean_string, include_lengths=True)
+    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=process_labels)
+
+    @staticmethod
+    def sort_key(ex):
+        return len(ex.text)
+
+    @classmethod
+    def splits(cls, path, train=os.path.join('Lyrics', 'train.tsv'),
+               validation=os.path.join('Lyrics', 'dev.tsv'),
+               test=os.path.join('Lyrics', 'test.tsv'), **kwargs):
+        return super(Lyrics, cls).splits(
+            path, train=train, validation=validation, test=test,
+            format='tsv', fields=[('label', cls.LABEL_FIELD), ('text', cls.TEXT_FIELD)]
+        )
+
+    @classmethod
+    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None,
+              unk_init=torch.Tensor.zero_):
+        """
+        :param path: directory containing train, test, dev files
+        :param vectors_name: name of word vectors file
+        :param vectors_cache: path to directory containing word vectors file
+        :param batch_size: batch size
+        :param device: GPU device
+        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
+        :param unk_init: function used to generate vector for OOV words
+        :return:
+        """
+        if vectors is None:
+            vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
+
+        train, val, test = cls.splits(path)
+        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
+        return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle,
+                                     sort_within_batch=True, device=device)
+
+
+class LyricsCharQuantized(Lyrics):
+    ALPHABET = dict(map(lambda t: (t[1], t[0]), enumerate(list("""abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"""))))
+    TEXT_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=char_quantize)
+
+    @classmethod
+    def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None,
+              unk_init=torch.Tensor.zero_):
+        """
+        :param path: directory containing train, test, dev files
+        :param batch_size: batch size
+        :param device: GPU device
+        :return:
+        """
+        train, val, test = cls.splits(path)
+        return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle, device=device)
+
+
+class LyricsHierarchical(Lyrics):
+    NESTING_FIELD = Field(batch_first=True, tokenize=clean_string)
+    TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents)
diff --git a/models/args.py b/models/args.py
index e80ae2d..ebc1adc 100644
--- a/models/args.py
+++ b/models/args.py
@@ -13,6 +13,7 @@ def get_args():
     parser.add_argument('--seed', type=int, default=3435)
     parser.add_argument('--patience', type=int, default=5)
     parser.add_argument('--log-every', type=int, default=10)
+    parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training')
     parser.add_argument('--data-dir', default=os.path.join(os.pardir, 'hedwig-data', 'datasets'))
 
     return parser
diff --git a/models/bert/__main__.py b/models/bert/__main__.py
index 3241db7..75f9271 100644
--- a/models/bert/__main__.py
+++ b/models/bert/__main__.py
@@ -14,6 +14,8 @@
 from datasets.bert_processors.sogou_processor import SogouProcessor
 from datasets.bert_processors.sst_processor import SST2Processor
 from datasets.bert_processors.yelp2014_processor import Yelp2014Processor
+from datasets.bert_processors.lyrics_processor import LyricsProcessor
+
 from models.bert.args import get_args
 from models.bert.model import BertForSequenceClassification
 from utils.io import PYTORCH_PRETRAINED_BERT_CACHE
@@ -67,7 +69,8 @@ def evaluate_split(model, processor, args, split='dev'):
         'AAPD': AAPDProcessor,
         'AGNews': AGNewsProcessor,
         'Yelp2014': Yelp2014Processor,
-        'Sogou': SogouProcessor
+        'Sogou': SogouProcessor,
+        'Lyrics': LyricsProcessor
     }
 
     if args.gradient_accumulation_steps < 1:
@@ -77,17 +80,19 @@ def evaluate_split(model, processor, args, split='dev'):
     if args.dataset not in dataset_map:
         raise ValueError('Unrecognized dataset')
 
+    processor = dataset_map[args.dataset]()
+    processor.set_num_classes_(args.data_dir)
+
     args.batch_size = args.batch_size // args.gradient_accumulation_steps
     args.device = device
     args.n_gpu = n_gpu
-    args.num_labels = dataset_map[args.dataset].NUM_CLASSES
-    args.is_multilabel = dataset_map[args.dataset].IS_MULTILABEL
+    args.num_labels = processor.NUM_CLASSES
+    args.is_multilabel = processor.IS_MULTILABEL
 
     if not args.trained_model:
-        save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
+        save_path = os.path.join(args.save_path, processor.NAME)
         os.makedirs(save_path, exist_ok=True)
 
-    processor = dataset_map[args.dataset]()
     args.is_lowercase = 'uncased' in args.model
     args.is_hierarchical = False
     tokenizer = BertTokenizer.from_pretrained(args.model, is_lowercase=args.is_lowercase)
diff --git a/models/bert/args.py b/models/bert/args.py
index 5819765..d159d64 100644
--- a/models/bert/args.py
+++ b/models/bert/args.py
@@ -7,11 +7,11 @@ def get_args():
     parser = models.args.get_args()
 
     parser.add_argument('--model', default=None, type=str, required=True)
-    parser.add_argument('--dataset', type=str, default='SST-2', choices=['SST-2', 'AGNews', 'Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
+    parser.add_argument('--dataset', type=str, default='SST-2', choices=['SST-2', 'AGNews', 'Reuters', 'AAPD', 'IMDB',
+                                                                         'Yelp2014', 'Lyrics'])
     parser.add_argument('--save-path', type=str, default=os.path.join('model_checkpoints', 'bert'))
     parser.add_argument('--cache-dir', default='cache', type=str)
     parser.add_argument('--trained-model', default=None, type=str)
-    parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training')
     parser.add_argument('--fp16', action='store_true', help='use 16-bit floating point precision')
     parser.add_argument('--max-seq-length',
diff --git a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py
index 4344698..e9fad4d 100644
--- a/models/char_cnn/__main__.py
+++ b/models/char_cnn/__main__.py
@@ -12,6 +12,7 @@
 from datasets.imdb import IMDBCharQuantized as IMDB
 from datasets.reuters import ReutersCharQuantized as Reuters
 from datasets.yelp2014 import Yelp2014CharQuantized as Yelp2014
+from datasets.lyrics import LyricsCharQuantized as Lyrics
 from models.char_cnn.args import get_args
 from models.char_cnn.model import CharCNN
@@ -59,31 +60,40 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
 
 if __name__ == '__main__':
     # Set default configuration in args.py
-    args = get_args()
     logger = get_logger()
+    args = get_args()
+
+    if args.local_rank == -1 or not args.cuda:
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    print('Device:', str(device).upper())
+    print('Number of GPUs:', n_gpu)
+    print('Distributed training:', bool(args.local_rank != -1))
 
     # Set random seed for reproducibility
-    torch.manual_seed(args.seed)
-    torch.backends.cudnn.deterministic = True
-    np.random.seed(args.seed)
     random.seed(args.seed)
-
-    if not args.cuda:
-        args.gpu = -1
-    if torch.cuda.is_available() and args.cuda:
-        print('Note: You are using GPU for training')
-        torch.cuda.set_device(args.gpu)
-        torch.cuda.manual_seed(args.seed)
-    if torch.cuda.is_available() and not args.cuda:
-        print('Warning: Using CPU for training')
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
 
     dataset_map = {
         'Reuters': Reuters,
         'AAPD': AAPD,
         'IMDB': IMDB,
-        'Yelp2014': Yelp2014
+        'Yelp2014': Yelp2014,
+        'Lyrics': Lyrics
     }
 
+    args.device = device
+
     if args.dataset not in dataset_map:
         raise ValueError('Unrecognized dataset')
@@ -93,7 +103,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
                                                            args.word_vectors_file,
                                                            args.word_vectors_dir,
                                                            batch_size=args.batch_size,
-                                                           device=args.gpu,
+                                                           device=args.device,
                                                            unk_init=UnknownWordVecCache.unk)
 
     config = deepcopy(args)
@@ -114,7 +124,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
     else:
         model = CharCNN(config)
     if args.cuda:
-        model.cuda()
+        model.to(args.device)
 
     if not args.trained_model:
         save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
diff --git a/models/char_cnn/args.py b/models/char_cnn/args.py
index 62c517f..a2f244a 100644
--- a/models/char_cnn/args.py
+++ b/models/char_cnn/args.py
@@ -6,7 +6,8 @@ def get_args():
     parser = models.args.get_args()
 
-    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
+    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014',
+                                                                           'Lyrics'])
     parser.add_argument('--num-conv-filters', type=int, default=256)
     parser.add_argument('--num-affine-neurons', type=int, default=1024)
     parser.add_argument('--output-channel', type=int, default=256)
diff --git a/models/han/__main__.py b/models/han/__main__.py
index 6d77b7c..78f5519 100644
--- a/models/han/__main__.py
+++ b/models/han/__main__.py
@@ -13,6 +13,7 @@
 from datasets.imdb import IMDBHierarchical as IMDB
 from datasets.reuters import ReutersHierarchical as Reuters
 from datasets.yelp2014 import Yelp2014Hierarchical as Yelp2014
+from datasets.lyrics import LyricsHierarchical as Lyrics
 from models.han.args import get_args
 from models.han.model import HAN
@@ -60,31 +61,57 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
 
 if __name__ == '__main__':
     # Set default configuration in args.py
-    args = get_args()
+    # args = get_args()
     logger = get_logger()
+    #
+    # # Set random seed for reproducibility
+    # torch.manual_seed(args.seed)
+    # torch.backends.cudnn.deterministic = True
+    # np.random.seed(args.seed)
+    # random.seed(args.seed)
+    #
+    # if not args.cuda:
+    #     args.gpu = -1
+    # if torch.cuda.is_available() and args.cuda:
+    #     print('Note: You are using GPU for training')
+    #     torch.cuda.set_device('cuda:{}'.format(args.gpu))
+    #     torch.cuda.manual_seed(args.seed)
+    # if torch.cuda.is_available() and not args.cuda:
+    #     print('Warning: Using CPU for training')
+
+    args = get_args()
+
+    if args.local_rank == -1 or not args.cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    print('Device:', str(device).upper())
+    print('Number of GPUs:', n_gpu)
+    print('Distributed training:', bool(args.local_rank != -1))
 
     # Set random seed for reproducibility
-    torch.manual_seed(args.seed)
-    torch.backends.cudnn.deterministic = True
-    np.random.seed(args.seed)
     random.seed(args.seed)
-
-    if not args.cuda:
-        args.gpu = -1
-    if torch.cuda.is_available() and args.cuda:
-        print('Note: You are using GPU for training')
-        torch.cuda.set_device(args.gpu)
-        torch.cuda.manual_seed(args.seed)
-    if torch.cuda.is_available() and not args.cuda:
-        print('Warning: Using CPU for training')
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
 
     dataset_map = {
         'Reuters': Reuters,
         'AAPD': AAPD,
         'IMDB': IMDB,
-        'Yelp2014': Yelp2014
+        'Yelp2014': Yelp2014,
+        'Lyrics': Lyrics
     }
 
+    args.device = device
+
     if args.dataset not in dataset_map:
         raise ValueError('Unrecognized dataset')
@@ -94,7 +121,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
                                                                       args.word_vectors_file,
                                                                       args.word_vectors_dir,
                                                                       batch_size=args.batch_size,
-                                                                      device=args.gpu,
+                                                                      device=args.device,
                                                                       unk_init=UnknownWordVecCache.unk)
 
     config = deepcopy(args)
@@ -110,13 +137,13 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
 
     if args.resume_snapshot:
         if args.cuda:
-            model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage.cuda(args.gpu))
+            model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage.cuda(args.device))
         else:
             model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage)
     else:
         model = HAN(config)
 
     if args.cuda:
-        model.cuda()
+        model.to(device)
 
     if not args.trained_model:
         save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
diff --git a/models/han/args.py b/models/han/args.py
index c803f71..7749eff 100644
--- a/models/han/args.py
+++ b/models/han/args.py
@@ -7,7 +7,8 @@ def get_args():
     parser = models.args.get_args()
 
     parser.add_argument('--mode', type=str, default='static', choices=['rand', 'static', 'non-static'])
-    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
+    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014',
+                                                                           'Lyrics'])
     parser.add_argument('--output-channel', type=int, default=100)
     parser.add_argument('--words-dim', type=int, default=300)
     parser.add_argument('--embed-dim', type=int, default=300)
diff --git a/models/kim_cnn/__main__.py b/models/kim_cnn/__main__.py
index 5cf6720..606246f 100644
--- a/models/kim_cnn/__main__.py
+++ b/models/kim_cnn/__main__.py
@@ -13,6 +13,7 @@
 from datasets.imdb import IMDB
 from datasets.reuters import Reuters
 from datasets.yelp2014 import Yelp2014
+from datasets.lyrics import Lyrics
 from models.kim_cnn.args import get_args
 from models.kim_cnn.model import KimCNN
@@ -58,37 +59,48 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
 
 if __name__ == '__main__':
     # Set default configuration in args.py
+    logger = get_logger()
     args = get_args()
+    if args.local_rank == -1 or not args.cuda:
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    print('Device:', str(device).upper())
+    print('Number of GPUs:', n_gpu)
+    print('Distributed training:', bool(args.local_rank != -1))
+
     # Set random seed for reproducibility
-    torch.manual_seed(args.seed)
-    torch.backends.cudnn.deterministic = True
-    if not args.cuda:
-        args.gpu = -1
-    if torch.cuda.is_available() and args.cuda:
-        print('Note: You are using GPU for training')
-        torch.cuda.set_device(args.gpu)
-        torch.cuda.manual_seed(args.seed)
-    if torch.cuda.is_available() and not args.cuda:
-        print('Warning: Using CPU for training')
-    np.random.seed(args.seed)
     random.seed(args.seed)
-    logger = get_logger()
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
 
     dataset_map = {
         'Reuters': Reuters,
         'AAPD': AAPD,
         'IMDB': IMDB,
-        'Yelp2014': Yelp2014
+        'Yelp2014': Yelp2014,
+        'Lyrics': Lyrics
     }
 
+    args.device = device
+
     if args.dataset not in dataset_map:
         raise ValueError('Unrecognized dataset')
     else:
         dataset_class = dataset_map[args.dataset]
         train_iter, dev_iter, test_iter = dataset_map[args.dataset].iters(args.data_dir, args.word_vectors_file,
                                                                           args.word_vectors_dir,
-                                                                          batch_size=args.batch_size, device=args.gpu,
+                                                                          batch_size=args.batch_size,
+                                                                          device=args.device,
                                                                           unk_init=UnknownWordVecCache.unk)
 
     config = deepcopy(args)
@@ -110,7 +122,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
     else:
         model = KimCNN(config)
     if args.cuda:
-        model.cuda()
+        model.to(args.device)
 
     if not args.trained_model:
         save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
diff --git a/models/kim_cnn/args.py b/models/kim_cnn/args.py
index 10094f8..be9c9a9 100644
--- a/models/kim_cnn/args.py
+++ b/models/kim_cnn/args.py
@@ -6,7 +6,8 @@ def get_args():
     parser = models.args.get_args()
 
-    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
+    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014',
+                                                                           'Lyrics'])
     parser.add_argument('--mode', type=str, default='multichannel', choices=['rand', 'static', 'non-static', 'multichannel'])
     parser.add_argument('--output-channel', type=int, default=100)
     parser.add_argument('--words-dim', type=int, default=300)
diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py
index c0dfc4a..af8607e 100644
--- a/models/reg_lstm/__main__.py
+++ b/models/reg_lstm/__main__.py
@@ -12,6 +12,7 @@
 from datasets.imdb import IMDB
 from datasets.reuters import Reuters
 from datasets.yelp2014 import Yelp2014
+from datasets.lyrics import Lyrics
 from models.reg_lstm.args import get_args
 from models.reg_lstm.model import RegLSTM
@@ -57,31 +58,40 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
 
 if __name__ == '__main__':
     # Set default configuration in args.py
-    args = get_args()
     logger = get_logger()
+    args = get_args()
+
+    if args.local_rank == -1 or not args.cuda:
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    print('Device:', str(device).upper())
+    print('Number of GPUs:', n_gpu)
+    print('Distributed training:', bool(args.local_rank != -1))
 
     # Set random seed for reproducibility
-    torch.manual_seed(args.seed)
-    torch.backends.cudnn.deterministic = True
-    np.random.seed(args.seed)
     random.seed(args.seed)
-
-    if not args.cuda:
-        args.gpu = -1
-    if torch.cuda.is_available() and args.cuda:
-        print('Note: You are using GPU for training')
-        torch.cuda.set_device(args.gpu)
-        torch.cuda.manual_seed(args.seed)
-    if torch.cuda.is_available() and not args.cuda:
-        print('Warning: Using CPU for training')
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
 
     dataset_map = {
         'Reuters': Reuters,
         'AAPD': AAPD,
         'IMDB': IMDB,
-        'Yelp2014': Yelp2014
+        'Yelp2014': Yelp2014,
+        'Lyrics': Lyrics
     }
 
+    args.device = device
+
     if args.dataset not in dataset_map:
         raise ValueError('Unrecognized dataset')
@@ -91,7 +101,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
                                                            args.word_vectors_file,
                                                            args.word_vectors_dir,
                                                            batch_size=args.batch_size,
-                                                           device=args.gpu,
+                                                           device=args.device,
                                                            unk_init=UnknownWordVecCache.unk)
 
     config = deepcopy(args)
@@ -113,7 +123,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
     else:
         model = RegLSTM(config)
     if args.cuda:
-        model.cuda()
+        model.to(args.device)
 
     if not args.trained_model:
         save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
diff --git a/models/reg_lstm/args.py b/models/reg_lstm/args.py
index 30761ae..4eb7bde 100644
--- a/models/reg_lstm/args.py
+++ b/models/reg_lstm/args.py
@@ -11,7 +11,8 @@ def get_args():
     parser.add_argument('--num-layers', type=int, default=2)
     parser.add_argument('--hidden-dim', type=int, default=256)
     parser.add_argument('--mode', type=str, default='static', choices=['rand', 'static', 'non-static'])
-    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
+    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014',
+                                                                           'Lyrics'])
     parser.add_argument('--words-dim', type=int, default=300)
     parser.add_argument('--embed-dim', type=int, default=300)
     parser.add_argument('--epoch-decay', type=int, default=15)
diff --git a/models/reg_lstm/model.py b/models/reg_lstm/model.py
index ac955f6..a9c4ce0 100644
--- a/models/reg_lstm/model.py
+++ b/models/reg_lstm/model.py
@@ -94,13 +94,15 @@ def forward(self, x, lengths=None):
         return self.fc1(x)
 
     def update_ema(self):
-        self.steps_ema += 1
-        for p, avg_p in zip(self.parameters(), self.avg_param):
-            avg_p.mul_(self.beta_ema).add_((1-self.beta_ema)*p.data)
+        # self.steps_ema += 1
+        # for p, avg_p in zip(self.parameters(), self.avg_param):
+        #     avg_p.mul_(self.beta_ema).add_((1-self.beta_ema)*p.data)
+        pass
 
     def load_ema_params(self):
-        for p, avg_p in zip(self.parameters(), self.avg_param):
-            p.data.copy_(avg_p/(1-self.beta_ema**self.steps_ema))
+        # for p, avg_p in zip(self.parameters(), self.avg_param):
+        #     p.data.copy_(avg_p/(1-self.beta_ema**self.steps_ema))
+        pass
 
     def load_params(self, params):
         for p,avg_p in zip(self.parameters(), params):
diff --git a/models/reg_lstm/weight_drop.py b/models/reg_lstm/weight_drop.py
index 5a08e85..0ed484b 100644
--- a/models/reg_lstm/weight_drop.py
+++ b/models/reg_lstm/weight_drop.py
@@ -67,9 +67,9 @@ def _setweights(self):
                 mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1))
                 if raw_w.is_cuda: mask = mask.cuda()
                 mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True)
-                w = mask.expand_as(raw_w) * raw_w
+                w = torch.nn.Parameter(mask.expand_as(raw_w) * raw_w)
             else:
-                w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
+                w = torch.nn.Parameter(torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training))
             setattr(self.module, name_w, w)
 
     def forward(self, *args):
diff --git a/models/xml_cnn/__main__.py b/models/xml_cnn/__main__.py
index 5e30273..7735819 100644
--- a/models/xml_cnn/__main__.py
+++ b/models/xml_cnn/__main__.py
@@ -13,6 +13,7 @@
 from datasets.imdb import IMDB
 from datasets.reuters import Reuters
 from datasets.yelp2014 import Yelp2014
+from datasets.lyrics import Lyrics
 from models.xml_cnn.args import get_args
 from models.xml_cnn.model import XmlCNN
@@ -58,37 +59,48 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
 
 if __name__ == '__main__':
     # Set default configuration in args.py
+    logger = get_logger()
     args = get_args()
+    if args.local_rank == -1 or not args.cuda:
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+
+    print('Device:', str(device).upper())
+    print('Number of GPUs:', n_gpu)
+    print('Distributed training:', bool(args.local_rank != -1))
+
     # Set random seed for reproducibility
-    torch.manual_seed(args.seed)
-    torch.backends.cudnn.deterministic = True
-    if not args.cuda:
-        args.gpu = -1
-    if torch.cuda.is_available() and args.cuda:
-        print('Note: You are using GPU for training')
-        torch.cuda.set_device(args.gpu)
-        torch.cuda.manual_seed(args.seed)
-    if torch.cuda.is_available() and not args.cuda:
-        print('Warning: Using CPU for training')
-    np.random.seed(args.seed)
     random.seed(args.seed)
-    logger = get_logger()
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
 
     dataset_map = {
         'Reuters': Reuters,
         'AAPD': AAPD,
         'IMDB': IMDB,
-        'Yelp2014': Yelp2014
+        'Yelp2014': Yelp2014,
+        'Lyrics': Lyrics
     }
 
+    args.device = device
+
     if args.dataset not in dataset_map:
         raise ValueError('Unrecognized dataset')
     else:
         dataset_class = dataset_map[args.dataset]
         train_iter, dev_iter, test_iter = dataset_map[args.dataset].iters(args.data_dir, args.word_vectors_file,
                                                                           args.word_vectors_dir,
-                                                                          batch_size=args.batch_size, device=args.gpu,
+                                                                          batch_size=args.batch_size,
+                                                                          device=args.device,
                                                                           unk_init=UnknownWordVecCache.unk)
 
     config = deepcopy(args)
@@ -110,7 +122,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si
     else:
         model = XmlCNN(config)
     if args.cuda:
-        model.cuda()
+        model.to(args.device)
 
     if not args.trained_model:
         save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
diff --git a/models/xml_cnn/args.py b/models/xml_cnn/args.py
index 5d269ed..4745d39 100644
--- a/models/xml_cnn/args.py
+++ b/models/xml_cnn/args.py
@@ -6,7 +6,8 @@ def get_args():
     parser = models.args.get_args()
 
     parser.add_argument('--mode', type=str, default='multichannel', choices=['rand', 'static', 'non-static', 'multichannel'])
-    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
+    parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014',
+                                                                           'Lyrics'])
    parser.add_argument('--dev-every', type=int, default=30)
    parser.add_argument('--output-channel', type=int, default=100)
    parser.add_argument('--words-dim', type=int, default=300)
diff --git a/requirements.txt b/requirements.txt
index fe8e9b5..dcf8748 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 nltk==3.2.5
-numpy==1.14.0
 Cython==0.28.2
 scikit-learn==0.19.1
 scipy==1.0.0
 torchtext==0.2.3
+numpy
+boto3
diff --git a/setup.py b/setup.py
index 3d2907b..70a1d18 100644
--- a/setup.py
+++ b/setup.py
@@ -3,5 +3,5 @@
 setup(name='hedwig',
       version='1.0.0',
       description='PyTorch deep learning models for document classification',
-      packages=['char_cnn', 'han', 'kim_cnn', 'reg_lstm', 'xml_cnn'],
+      packages=['models/char_cnn', 'models/han', 'models/kim_cnn', 'models/reg_lstm', 'models/xml_cnn'],
       )
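
Note on the data format these changes assume (an inference from the processors in this diff, not documented elsewhere in it): LyricsProcessor.set_num_classes_ treats the length of the first tab-separated field of Lyrics/train.tsv as the number of classes, and process_labels in datasets/lyrics.py turns each character of that same field into a float. The Python sketch below illustrates a row that satisfies both readers; the sample values and text are hypothetical, not taken from the real dataset.

# Hypothetical Lyrics/train.tsv row (illustration only): <multi-hot label string><TAB><lyrics text>
row = "0100\tsome lyric text goes here"

label_field, text = row.split('\t', 1)

# What LyricsProcessor.set_num_classes_ derives from the first line of train.tsv:
num_classes = len(label_field)                    # 4
is_multilabel = num_classes > 2                   # True

# What process_labels in datasets/lyrics.py feeds to the torchtext LABEL_FIELD:
label_vector = [float(x) for x in label_field]    # [0.0, 1.0, 0.0, 0.0]

With data laid out this way under the hedwig-data datasets directory, training should follow the existing hedwig invocation pattern, e.g. python -m models.bert --dataset Lyrics --model bert-base-uncased (remaining hyperparameters illustrative), since 'Lyrics' is now an accepted --dataset choice for the BERT, CharCNN, HAN, KimCNN, RegLSTM, and XML-CNN entry points.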