From ff04385ec3bf5b2a941a6be769de959808837064 Mon Sep 17 00:00:00 2001 From: Jesse Date: Fri, 19 Jul 2019 21:53:24 -0400 Subject: [PATCH 01/42] Update setup.py package list Fixed paths to include 'models/' --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3d2907b..70a1d18 100644 --- a/setup.py +++ b/setup.py @@ -3,5 +3,5 @@ setup(name='hedwig', version='1.0.0', description='PyTorch deep learning models for document classification', - packages=['char_cnn', 'han', 'kim_cnn', 'reg_lstm', 'xml_cnn'], + packages=['models/char_cnn', 'models/han', 'models/kim_cnn', 'models/reg_lstm', 'models/xml_cnn'], ) From a4e8311dc6ea43ff7880d25d6609273fc612b1eb Mon Sep 17 00:00:00 2001 From: Jesse Date: Sat, 20 Jul 2019 11:39:56 -0400 Subject: [PATCH 02/42] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 27d2aee..3800572 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ This repo contains PyTorch deep learning models for document classification, implemented by the Data Systems Group at the University of Waterloo. +## Modifications from the original at castorini/hedwig +- added 'models/' prefix to the package paths in setup.py +- added boto3 to requirements.txt + ## Models + [DocBERT](models/bert/) : DocBERT: BERT for Document Classification [(Adhikari et al., 2019)](https://arxiv.org/abs/1904.08398v1) From b4778e9537ebd2a65f98912fe1633b1c7595bbe5 Mon Sep 17 00:00:00 2001 From: Jesse Date: Sun, 21 Jul 2019 18:38:49 -0400 Subject: [PATCH 03/42] modified training code to create learning curve figures --- common/trainers/bert_trainer.py | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/common/trainers/bert_trainer.py b/common/trainers/bert_trainer.py index 359dc91..066baae 100644 --- a/common/trainers/bert_trainer.py +++ b/common/trainers/bert_trainer.py @@ -1,6 +1,11 @@ +# noinspection PyPackageRequirements import datetime import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + import torch import torch.nn.functional as F from torch.utils.data import DataLoader, RandomSampler, TensorDataset @@ -108,6 +113,8 @@ def train(self): train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.args.batch_size) + # results for graphing learning curves + results = [] for epoch in trange(int(self.args.epochs), desc="Epoch"): self.train_epoch(train_dataloader) dev_evaluator = BertEvaluator(self.model, self.processor, self.args, split='dev') @@ -118,6 +125,8 @@ def train(self): tqdm.write(self.log_template.format(epoch + 1, self.iterations, epoch + 1, self.args.epochs, dev_acc, dev_precision, dev_recall, dev_f1, dev_loss)) + results.append([epoch + 1, dev_acc, dev_precision, dev_recall, dev_f1, dev_loss]) + # Update validation results if dev_f1 > self.best_dev_f1: self.unimproved_iters = 0 self.best_dev_f1 = dev_f1 torch.save(self.model, self.snapshot_path) else: self.unimproved_iters += 1 if self.unimproved_iters >= self.args.patience: self.early_stop = True tqdm.write("Early Stopping. 
Epoch: {}, Best Dev F1: {}".format(epoch, self.best_dev_f1)) break + + # create learning curves + results_frame = pd.DataFrame(data=np.array(results), + columns=['Epoch', 'Accuracy', 'Precision', 'Recall', 'F1', 'Loss'], + index='Epoch') + + + ax_acc = results_frame[['Accuracy', 'Precision', 'Recall', 'F1']].plot() + ax_loss = results_frame[['Loss']].plot() + + ax_acc.get_figure().savefig('accuracy_curves.png') + ax_loss.get_figure().savefig('loss_curves.png') + + + From 5531f0dd15884efda98aef2d0a9c6b892efae40f Mon Sep 17 00:00:00 2001 From: Jesse Date: Sun, 21 Jul 2019 20:20:03 -0400 Subject: [PATCH 04/42] bug fix - learning curves --- common/trainers/bert_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/trainers/bert_trainer.py b/common/trainers/bert_trainer.py index 066baae..2951bdb 100644 --- a/common/trainers/bert_trainer.py +++ b/common/trainers/bert_trainer.py @@ -142,8 +142,8 @@ def train(self): # create learning curves results_frame = pd.DataFrame(data=np.array(results), - columns=['Epoch', 'Accuracy', 'Precision', 'Recall', 'F1', 'Loss'], - index='Epoch') + columns=['Epoch', 'Accuracy', 'Precision', 'Recall', 'F1', 'Loss']) \ + .set_index('Epoch') ax_acc = results_frame[['Accuracy', 'Precision', 'Recall', 'F1']].plot() From 2c2beffaab1070cded3e40c9cab68b7b86d8fd60 Mon Sep 17 00:00:00 2001 From: Jesse Date: Sun, 21 Jul 2019 23:24:14 -0400 Subject: [PATCH 05/42] Update requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fe8e9b5..dcf8748 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ nltk==3.2.5 -numpy==1.14.0 Cython==0.28.2 scikit-learn==0.19.1 scipy==1.0.0 torchtext==0.2.3 +numpy +boto3 From 3ef89c369f3c8b4ced0e2ccf0d2aafbc86491f4a Mon Sep 17 00:00:00 2001 From: Jesse Date: Mon, 22 Jul 2019 22:24:48 -0400 Subject: [PATCH 06/42] fixed cuda bug for HAN --- models/han/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/han/__main__.py b/models/han/__main__.py index 6d77b7c..b2ac2c5 100644 --- a/models/han/__main__.py +++ b/models/han/__main__.py @@ -73,7 +73,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si args.gpu = -1 if torch.cuda.is_available() and args.cuda: print('Note: You are using GPU for training') - torch.cuda.set_device(args.gpu) + torch.cuda.set_device('cuda:{}'.format(args.gpu)) torch.cuda.manual_seed(args.seed) if torch.cuda.is_available() and not args.cuda: print('Warning: Using CPU for training') From c2b1fcf014939edd819b3ce77c33be99c3d56d30 Mon Sep 17 00:00:00 2001 From: Jesse Date: Mon, 22 Jul 2019 22:49:05 -0400 Subject: [PATCH 07/42] attempt to make CUDA usage for HAN resemble BERT code --- models/han/__main__.py | 51 ++++++++++++++++++++++++++++++------------ models/han/args.py | 1 + 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/models/han/__main__.py b/models/han/__main__.py index b2ac2c5..94b8e0e 100644 --- a/models/han/__main__.py +++ b/models/han/__main__.py @@ -60,23 +60,46 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': # Set default configuration in args.py - args = get_args() + # args = get_args() logger = get_logger() + # + # # Set random seed for reproducibility + # torch.manual_seed(args.seed) + # torch.backends.cudnn.deterministic = True + # np.random.seed(args.seed) + # random.seed(args.seed) + # + # if not args.cuda: + # args.gpu = -1 + # if 
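
For reference, patches 03-04 together arrive at the following learning-curve logic; this is a minimal standalone sketch (the column names and output file names follow the diffs, while the sample numbers and the Agg backend call are illustrative assumptions for a headless training box, not part of the patches):

    import matplotlib
    matplotlib.use('Agg')  # assumption: save figures without a display
    import numpy as np
    import pandas as pd

    # one row per epoch, as appended inside the training loop
    results = [[1, 0.71, 0.70, 0.69, 0.70, 0.52],
               [2, 0.78, 0.77, 0.76, 0.77, 0.41]]

    results_frame = pd.DataFrame(data=np.array(results),
                                 columns=['Epoch', 'Accuracy', 'Precision', 'Recall', 'F1', 'Loss']) \
        .set_index('Epoch')  # .set_index(...), not index='Epoch': pandas expects a collection there

    ax_acc = results_frame[['Accuracy', 'Precision', 'Recall', 'F1']].plot()
    ax_loss = results_frame[['Loss']].plot()
    ax_acc.get_figure().savefig('accuracy_curves.png')
    ax_loss.get_figure().savefig('loss_curves.png')
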
torch.cuda.is_available() and args.cuda: # print('Note: You are using GPU for training') # torch.cuda.set_device('cuda:{}'.format(args.gpu)) # torch.cuda.manual_seed(args.seed) # if torch.cuda.is_available() and not args.cuda: # print('Warning: Using CPU for training') args = get_args() if args.local_rank == -1 or not args.cuda: device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') print('Device:', str(device).upper()) print('Number of GPUs:', n_gpu) print('Distributed training:', bool(args.local_rank != -1)) # Set random seed for reproducibility random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) dataset_map = { 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, 'Yelp2014': Yelp2014 } diff --git a/models/han/args.py b/models/han/args.py index c803f71..5e6c2b2 100644 --- a/models/han/args.py +++ b/models/han/args.py @@ -16,6 +16,7 @@ def get_args(): parser.add_argument('--weight-decay', type=float, default=0) parser.add_argument('--word-num-hidden', type=int, default=50) parser.add_argument('--sentence-num-hidden', type=int, default=50) + parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'hedwig-data', 'embeddings', 'word2vec')) parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') From 454c5f3f6d6bdfa7d6fa781392862480d14a7ef3 Mon Sep 17 00:00:00 2001 From: Jesse Date: Mon, 22 Jul 2019 22:59:18 -0400 Subject: [PATCH 08/42] attempt to make CUDA usage for HAN resemble BERT code -2 --- models/han/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/han/__main__.py b/models/han/__main__.py index 94b8e0e..42bb48b 100644 --- a/models/han/__main__.py +++ b/models/han/__main__.py @@ -108,6 +108,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Yelp2014': Yelp2014 } + args.device = device + if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') @@ -117,7 +119,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si args.word_vectors_file, args.word_vectors_dir, batch_size=args.batch_size, - device=args.gpu, + device=args.device, unk_init=UnknownWordVecCache.unk) config = deepcopy(args) @@ -133,7 +135,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if args.resume_snapshot: if args.cuda: - model = torch.load(args.resume_snapshot, 
map_location=lambda storage, location: storage.cuda(args.gpu)) + model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage.cuda(args.device)) else: model = torch.load(args.resume_snapshot, map_location=lambda storage, location: storage) else: From 9825e795eee3e860081c4bb6a36fcbf06416a33b Mon Sep 17 00:00:00 2001 From: Jesse Date: Mon, 22 Jul 2019 23:07:15 -0400 Subject: [PATCH 09/42] CUDA fix for LSTM --- models/reg_lstm/__main__.py | 54 ++++++++++++++++++++++++++----------- models/reg_lstm/args.py | 1 + 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py index c0dfc4a..0f50e71 100644 --- a/models/reg_lstm/__main__.py +++ b/models/reg_lstm/__main__.py @@ -57,23 +57,45 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': # Set default configuration in args.py - args = get_args() + # args = get_args() logger = get_logger() + # # Set random seed for reproducibility + # torch.manual_seed(args.seed) + # torch.backends.cudnn.deterministic = True + # np.random.seed(args.seed) + # random.seed(args.seed) + # + # if not args.cuda: + # args.gpu = -1 + # if torch.cuda.is_available() and args.cuda: + # print('Note: You are using GPU for training') + # torch.cuda.set_device(args.gpu) + # torch.cuda.manual_seed(args.seed) + # if torch.cuda.is_available() and not args.cuda: + # print('Warning: Using CPU for training') args = get_args() + + if args.local_rank == -1 or not args.cuda: + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of synchronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + + print('Device:', str(device).upper()) + print('Number of GPUs:', n_gpu) + print('Distributed training:', bool(args.local_rank != -1)) + # Set random seed for reproducibility - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = True - np.random.seed(args.seed) random.seed(args.seed) - - if not args.cuda: - args.gpu = -1 - if torch.cuda.is_available() and args.cuda: - print('Note: You are using GPU for training') - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) - if torch.cuda.is_available() and not args.cuda: - print('Warning: Using CPU for training') + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) dataset_map = { 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, 'Yelp2014': Yelp2014 } + args.device = device + if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') @@ -91,7 +115,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si args.word_vectors_file, args.word_vectors_dir, batch_size=args.batch_size, - device=args.gpu, + device=args.device, unk_init=UnknownWordVecCache.unk) config = deepcopy(args) @@ -113,7 +137,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si else: model = RegLSTM(config) if args.cuda: - model.cuda() + model.to(device) if not args.trained_model: save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME) diff --git a/models/reg_lstm/args.py b/models/reg_lstm/args.py index 30761ae..d0302b8 100644 --- 
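
The device-selection idiom that patches 07 and 09 copy over from the BERT driver, as a self-contained sketch (the flag names follow args.py; parse_args([]) is only there so the snippet runs on its defaults):

    import argparse

    import torch

    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--local-rank', type=int, default=-1)
    args = parser.parse_args([])

    if args.local_rank == -1 or not args.cuda:
        # single-process path: one torch.device object instead of a bare GPU index
        device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
        n_gpu = torch.cuda.device_count()
    else:
        # one process per GPU, e.g. launched via torch.distributed.launch
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend='nccl')

    print(device, n_gpu)

The same device object is then handed to the iterators (device=args.device) and to the model (model.to(device)), replacing the old torch.cuda.set_device(args.gpu) plumbing.
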
a/models/reg_lstm/args.py +++ b/models/reg_lstm/args.py @@ -16,6 +16,7 @@ def get_args(): parser.add_argument('--embed-dim', type=int, default=300) parser.add_argument('--epoch-decay', type=int, default=15) parser.add_argument('--weight-decay', type=float, default=0) + parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--wdrop', type=float, default=0.0, help="weight drop") From 4ee01c78ddda2983da396ed36c2c23bd72746bfd Mon Sep 17 00:00:00 2001 From: naotominakawa <44623998+naotominakawa@users.noreply.github.com> Date: Mon, 22 Jul 2019 23:15:24 -0400 Subject: [PATCH 10/42] Add files via upload --- datasets/bert_processors/lyrics_processor.py | 34 ++++++ datasets/lyrics.py | 105 +++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 datasets/bert_processors/lyrics_processor.py create mode 100644 datasets/lyrics.py diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py new file mode 100644 index 0000000..fff5f96 --- /dev/null +++ b/datasets/bert_processors/lyrics_processor.py @@ -0,0 +1,34 @@ +import os + +from datasets.bert_processors.abstract_processor import BertProcessor, InputExample + + +class LyricsProcessor(BertProcessor): + NAME = 'Lyrics' + NUM_CLASSES = 12 # Number of genre; len(df['genre'].unique()) = 12 + IS_MULTILABEL = False + + def get_train_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, 'Lyrics', 'train.tsv')), 'train') + + def get_dev_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, 'Lyrics', 'dev.tsv')), 'dev') + + def get_test_examples(self, data_dir): + return self._create_examples( + self._read_tsv(os.path.join(data_dir, 'Lyrics', 'test.tsv')), 'test') + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[1] + label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples \ No newline at end of file diff --git a/datasets/lyrics.py b/datasets/lyrics.py new file mode 100644 index 0000000..cac6c79 --- /dev/null +++ b/datasets/lyrics.py @@ -0,0 +1,105 @@ +import os +import re + +import numpy as np +import torch +from torchtext.data import NestedField, Field, TabularDataset +from torchtext.data.iterator import BucketIterator +from torchtext.vocab import Vectors + + +def clean_string(string): + """ + Performs tokenization and string cleaning for the Lyrics dataset + """ + string = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", string) + string = re.sub(r"\s{2,}", " ", string) + return string.lower().strip().split() + + +def split_sents(string): + string = re.sub(r"[!?]"," ", string) + return string.strip().split('.') + + +def char_quantize(string, max_length=1000): + identity = np.identity(len(LyricsCharQuantized.ALPHABET)) + quantized_string = np.array([identity[LyricsCharQuantized.ALPHABET[char]] for char in list(string.lower()) if char in LyricsCharQuantized.ALPHABET], dtype=np.float32) + if len(quantized_string) > max_length: + return quantized_string[:max_length] + else: + return np.concatenate((quantized_string, np.zeros((max_length - len(quantized_string), len(LyricsCharQuantized.ALPHABET)), dtype=np.float32))) + + +def process_labels(string): + """ + 
Returns the label string as a list of floats :param string: :return: """ return [float(x) for x in string] class Lyrics(TabularDataset): NAME = 'Lyrics' NUM_CLASSES = 12 IS_MULTILABEL = True TEXT_FIELD = Field(batch_first=True, tokenize=clean_string, include_lengths=True) LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=process_labels) @staticmethod def sort_key(ex): return len(ex.text) @classmethod def splits(cls, path, train=os.path.join('Lyrics', 'train.tsv'), validation=os.path.join('Lyrics', 'dev.tsv'), test=os.path.join('Lyrics', 'test.tsv'), **kwargs): return super(Lyrics, cls).splits( path, train=train, validation=validation, test=test, format='tsv', fields=[('label', cls.LABEL_FIELD), ('text', cls.TEXT_FIELD)] ) @classmethod def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_): """ :param path: directory containing train, test, dev files :param vectors_name: name of word vectors file :param vectors_cache: path to directory containing word vectors file :param batch_size: batch size :param device: GPU device :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes :param unk_init: function used to generate vector for OOV words :return: """ if vectors is None: vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init) train, val, test = cls.splits(path) cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors) return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle, sort_within_batch=True, device=device) class LyricsCharQuantized(Lyrics): ALPHABET = dict(map(lambda t: (t[1], t[0]), enumerate(list("""abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}""")))) TEXT_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=char_quantize) @classmethod def iters(cls, path, vectors_name, vectors_cache, batch_size=64, shuffle=True, device=0, vectors=None, unk_init=torch.Tensor.zero_): """ :param path: directory containing train, test, dev files :param batch_size: batch size :param device: GPU device :return: """ train, val, test = cls.splits(path) return BucketIterator.splits((train, val, test), batch_size=batch_size, repeat=False, shuffle=shuffle, device=device) class LyricsHierarchical(Lyrics): NESTING_FIELD = Field(batch_first=True, tokenize=clean_string) TEXT_FIELD = NestedField(NESTING_FIELD, tokenize=split_sents) From cf8e1019937f4817893eb2b91976869d906c69c0 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 24 Jul 2019 00:09:47 -0400 Subject: [PATCH 11/42] modified args and dataset map to include lyrics arguments --- models/bert/__main__.py | 5 ++++- models/bert/args.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/models/bert/__main__.py b/models/bert/__main__.py index 3241db7..dd74edd 100644 --- a/models/bert/__main__.py +++ b/models/bert/__main__.py @@ -14,6 +14,8 @@ from datasets.bert_processors.sogou_processor import SogouProcessor from datasets.bert_processors.sst_processor import SST2Processor from datasets.bert_processors.yelp2014_processor import Yelp2014Processor +from datasets.bert_processors.lyrics_processor import LyricsProcessor + from models.bert.args import get_args from models.bert.model import BertForSequenceClassification from utils.io import 
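
How the Lyrics label column is meant to be read (patch 10): each TSV row starts with a one-hot string over the genre classes, which process_labels turns into a float vector. A small worked example, with a made-up row:

    # hypothetical train.tsv line: '0010000000\tsome lyrics ...'
    label_string = '0010000000'
    label = [float(x) for x in label_string]  # [0.0, 0.0, 1.0, 0.0, ...]
    class_index = label.index(1.0)            # 2
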
PYTORCH_PRETRAINED_BERT_CACHE @@ -67,7 +69,8 @@ def evaluate_split(model, processor, args, split='dev'): 'AAPD': AAPDProcessor, 'AGNews': AGNewsProcessor, 'Yelp2014': Yelp2014Processor, - 'Sogou': SogouProcessor + 'Sogou': SogouProcessor, + 'Lyrics': LyricsProcessor } if args.gradient_accumulation_steps < 1: diff --git a/models/bert/args.py b/models/bert/args.py index 5819765..35df3c7 100644 --- a/models/bert/args.py +++ b/models/bert/args.py @@ -7,7 +7,8 @@ def get_args(): parser = models.args.get_args() parser.add_argument('--model', default=None, type=str, required=True) - parser.add_argument('--dataset', type=str, default='SST-2', choices=['SST-2', 'AGNews', 'Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='SST-2', choices=['SST-2', 'AGNews', 'Reuters', 'AAPD', 'IMDB', + 'Yelp2014', 'Lyrics']) parser.add_argument('--save-path', type=str, default=os.path.join('model_checkpoints', 'bert')) parser.add_argument('--cache-dir', default='cache', type=str) parser.add_argument('--trained-model', default=None, type=str) From b55cdc981699ba8ccbd957803ed1c7f9153af236 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 24 Jul 2019 18:56:42 -0400 Subject: [PATCH 12/42] Changed class number to correct number, 10 --- datasets/lyrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/lyrics.py b/datasets/lyrics.py index cac6c79..02987c8 100644 --- a/datasets/lyrics.py +++ b/datasets/lyrics.py @@ -42,7 +42,7 @@ def process_labels(string): class Lyrics(TabularDataset): NAME = 'Lyrics' - NUM_CLASSES = 12 + NUM_CLASSES = 10 IS_MULTILABEL = True TEXT_FIELD = Field(batch_first=True, tokenize=clean_string, include_lengths=True) From e02c98f7b594120a0644de2138ab285fdc7d8da1 Mon Sep 17 00:00:00 2001 From: naotominakawa <44623998+naotominakawa@users.noreply.github.com> Date: Wed, 24 Jul 2019 20:43:07 -0400 Subject: [PATCH 13/42] Update lyrics_processor.py --- datasets/bert_processors/lyrics_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py index fff5f96..ad64e2e 100644 --- a/datasets/bert_processors/lyrics_processor.py +++ b/datasets/bert_processors/lyrics_processor.py @@ -5,7 +5,7 @@ class LyricsProcessor(BertProcessor): NAME = 'Lyrics' - NUM_CLASSES = 12 # Number of genre; len(df['genre'].unique()) = 12 + NUM_CLASSES = 10 # Number of genre; len(df['genre'].unique()) = 10 IS_MULTILABEL = False def get_train_examples(self, data_dir): @@ -31,4 +31,4 @@ def _create_examples(self, lines, set_type): label = line[0] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples \ No newline at end of file + return examples From edfc7cd5a3b40d865c57c47292e1adcb667b56de Mon Sep 17 00:00:00 2001 From: naotominakawa <44623998+naotominakawa@users.noreply.github.com> Date: Wed, 24 Jul 2019 20:59:09 -0400 Subject: [PATCH 14/42] Update lyrics_processor.py --- datasets/bert_processors/lyrics_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py index ad64e2e..af4bc4b 100644 --- a/datasets/bert_processors/lyrics_processor.py +++ b/datasets/bert_processors/lyrics_processor.py @@ -6,7 +6,7 @@ class LyricsProcessor(BertProcessor): NAME = 'Lyrics' NUM_CLASSES = 10 # Number of genre; len(df['genre'].unique()) = 10 - IS_MULTILABEL = False + IS_MULTILABEL = True def get_train_examples(self, 
data_dir): return self._create_examples( From e13fe200e7493827f0f1a97b190b3f7959e215a4 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 31 Jul 2019 16:02:06 -0400 Subject: [PATCH 15/42] Update __main__.py try at fixing cuda usage --- models/reg_lstm/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py index 0f50e71..ab35ac1 100644 --- a/models/reg_lstm/__main__.py +++ b/models/reg_lstm/__main__.py @@ -77,7 +77,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si args = get_args() if args.local_rank == -1 or not args.cuda: - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) @@ -137,7 +137,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si else: model = RegLSTM(config) if args.cuda: - model.to(device) + model.to(args.device) if not args.trained_model: save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME) From d0ac4b0257616e479eaac32285b3867e9cf134c6 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 31 Jul 2019 22:09:08 -0400 Subject: [PATCH 16/42] fixed cuda loading for all models --- models/char_cnn/__main__.py | 38 ++++++++++++++++++++++--------------- models/kim_cnn/__main__.py | 38 +++++++++++++++++++++++-------------- models/reg_lstm/__main__.py | 16 ---------------- models/xml_cnn/__main__.py | 38 +++++++++++++++++++++++-------------- 4 files changed, 71 insertions(+), 59 deletions(-) diff --git a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py index 4344698..928ff85 100644 --- a/models/char_cnn/__main__.py +++ b/models/char_cnn/__main__.py @@ -59,23 +59,29 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': # Set default configuration in args.py - args = get_args() logger = get_logger() + args = get_args() + + if args.local_rank == -1 or not args.cuda: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of synchronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + + print('Device:', str(device).upper()) + print('Number of GPUs:', n_gpu) + print('Distributed training:', bool(args.local_rank != -1)) # Set random seed for reproducibility - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = True - np.random.seed(args.seed) random.seed(args.seed) - - if not args.cuda: - args.gpu = -1 - if torch.cuda.is_available() and args.cuda: - print('Note: You are using GPU for training') - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) - if torch.cuda.is_available() and not args.cuda: - print('Warning: Using CPU for training') + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) dataset_map = { 'Reuters': Reuters, @@ -84,6 +90,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Yelp2014': Yelp2014 } + args.device = device + if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') @@ -93,7 +101,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 
args.word_vectors_file, args.word_vectors_dir, batch_size=args.batch_size, - device=args.gpu, + device=args.device, unk_init=UnknownWordVecCache.unk) config = deepcopy(args) @@ -114,7 +122,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si else: model = CharCNN(config) if args.cuda: - model.cuda() + model.to(args.device) if not args.trained_model: save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME) diff --git a/models/kim_cnn/__main__.py b/models/kim_cnn/__main__.py index 5cf6720..5a69dcf 100644 --- a/models/kim_cnn/__main__.py +++ b/models/kim_cnn/__main__.py @@ -58,22 +58,29 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': # Set default configuration in args.py + logger = get_logger() args = get_args() + if args.local_rank == -1 or not args.cuda: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of synchronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + + print('Device:', str(device).upper()) + print('Number of GPUs:', n_gpu) + print('Distributed training:', bool(args.local_rank != -1)) + # Set random seed for reproducibility - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = True - if not args.cuda: - args.gpu = -1 - if torch.cuda.is_available() and args.cuda: - print('Note: You are using GPU for training') - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) - if torch.cuda.is_available() and not args.cuda: - print('Warning: Using CPU for training') - np.random.seed(args.seed) random.seed(args.seed) - logger = get_logger() + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) dataset_map = { 'Reuters': Reuters, @@ -82,13 +89,16 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Yelp2014': Yelp2014 } + args.device = device + if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') else: dataset_class = dataset_map[args.dataset] train_iter, dev_iter, test_iter = dataset_map[args.dataset].iters(args.data_dir, args.word_vectors_file, args.word_vectors_dir, - batch_size=args.batch_size, device=args.gpu, + batch_size=args.batch_size, + device=args.device, unk_init=UnknownWordVecCache.unk) config = deepcopy(args) @@ -110,7 +120,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si else: model = KimCNN(config) if args.cuda: - model.cuda() + model.to(args.device) if not args.trained_model: save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME) diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py index ab35ac1..e81cfbb 100644 --- a/models/reg_lstm/__main__.py +++ b/models/reg_lstm/__main__.py @@ -57,23 +57,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': # Set default configuration in args.py - # args = get_args() logger = get_logger() - - # # Set random seed for reproducibility - # torch.manual_seed(args.seed) - # torch.backends.cudnn.deterministic = True - # np.random.seed(args.seed) - # random.seed(args.seed) - # - # if not args.cuda: - # args.gpu = -1 - # if torch.cuda.is_available() and args.cuda: - # print('Note: You are using GPU for training') 
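
The change patch 16 repeats across char_cnn, kim_cnn, reg_lstm and xml_cnn is mechanical: drop the GPU-index plumbing and hand every component the same torch.device. A small sketch of why model.to(device) is the safer call (it is a no-op on CPU, while model.cuda() assumes a GPU exists):

    import torch
    import torch.nn as nn

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = nn.Linear(4, 2).to(device)    # fine on CPU and GPU alike
    x = torch.zeros(1, 4, device=device)  # keep inputs on the same device
    print(model(x).shape)                 # torch.Size([1, 2])
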
- # torch.cuda.set_device(args.gpu) - # torch.cuda.manual_seed(args.seed) - # if torch.cuda.is_available() and not args.cuda: - # print('Warning: Using CPU for training') args = get_args() if args.local_rank == -1 or not args.cuda: diff --git a/models/xml_cnn/__main__.py b/models/xml_cnn/__main__.py index 5e30273..67b07c4 100644 --- a/models/xml_cnn/__main__.py +++ b/models/xml_cnn/__main__.py @@ -58,22 +58,29 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si if __name__ == '__main__': # Set default configuration in args.py + logger = get_logger() args = get_args() + if args.local_rank == -1 or not args.cuda: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of synchronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + + print('Device:', str(device).upper()) + print('Number of GPUs:', n_gpu) + print('Distributed training:', bool(args.local_rank != -1)) + # Set random seed for reproducibility - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = True - if not args.cuda: - args.gpu = -1 - if torch.cuda.is_available() and args.cuda: - print('Note: You are using GPU for training') - torch.cuda.set_device(args.gpu) - torch.cuda.manual_seed(args.seed) - if torch.cuda.is_available() and not args.cuda: - print('Warning: Using CPU for training') - np.random.seed(args.seed) random.seed(args.seed) - logger = get_logger() + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) dataset_map = { 'Reuters': Reuters, @@ -82,13 +89,16 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Yelp2014': Yelp2014 } + args.device = device + if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') else: dataset_class = dataset_map[args.dataset] train_iter, dev_iter, test_iter = dataset_map[args.dataset].iters(args.data_dir, args.word_vectors_file, args.word_vectors_dir, - batch_size=args.batch_size, device=args.gpu, + batch_size=args.batch_size, + device=args.device, unk_init=UnknownWordVecCache.unk) config = deepcopy(args) @@ -110,7 +120,7 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si else: model = XmlCNN(config) if args.cuda: - model.cuda() + model.to(args.device) if not args.trained_model: save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME) From c188fb5d665c607c99f7e3c4b22119bd2fc817b0 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 31 Jul 2019 22:11:59 -0400 Subject: [PATCH 17/42] Fix for weight_drop error --- models/reg_lstm/weight_drop.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/reg_lstm/weight_drop.py b/models/reg_lstm/weight_drop.py index 5a08e85..0ed484b 100644 --- a/models/reg_lstm/weight_drop.py +++ b/models/reg_lstm/weight_drop.py @@ -67,9 +67,9 @@ def _setweights(self): mask = torch.autograd.Variable(torch.ones(raw_w.size(0), 1)) if raw_w.is_cuda: mask = mask.cuda() mask = torch.nn.functional.dropout(mask, p=self.dropout, training=True) - w = mask.expand_as(raw_w) * raw_w + w = torch.nn.Parameter(mask.expand_as(raw_w) * raw_w) else: - w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training) + w = torch.nn.Parameter(torch.nn.functional.dropout(raw_w, p=self.dropout, 
training=self.training)) setattr(self.module, name_w, w) def forward(self, *args): From 2b5b60994d33e8210a9eff43d876fc3319e3f058 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 31 Jul 2019 22:49:23 -0400 Subject: [PATCH 18/42] added local-rank arg --- models/args.py | 1 + models/char_cnn/args.py | 1 + models/kim_cnn/args.py | 1 + 3 files changed, 3 insertions(+) diff --git a/models/args.py b/models/args.py index e80ae2d..ebc1adc 100644 --- a/models/args.py +++ b/models/args.py @@ -13,6 +13,7 @@ def get_args(): parser.add_argument('--seed', type=int, default=3435) parser.add_argument('--patience', type=int, default=5) parser.add_argument('--log-every', type=int, default=10) + parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--data-dir', default=os.path.join(os.pardir, 'hedwig-data', 'datasets')) return parser diff --git a/models/char_cnn/args.py b/models/char_cnn/args.py index 62c517f..9d077a9 100644 --- a/models/char_cnn/args.py +++ b/models/char_cnn/args.py @@ -13,6 +13,7 @@ def get_args(): parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--epoch-decay', type=int, default=15) parser.add_argument('--weight-decay', type=float, default=0) + parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'hedwig-data', 'embeddings', 'word2vec')) parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') diff --git a/models/kim_cnn/args.py b/models/kim_cnn/args.py index 10094f8..0d5a7ae 100644 --- a/models/kim_cnn/args.py +++ b/models/kim_cnn/args.py @@ -14,6 +14,7 @@ def get_args(): parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--epoch-decay', type=int, default=15) parser.add_argument('--weight-decay', type=float, default=0) + parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'hedwig-data', 'embeddings', 'word2vec')) parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') From ccd8d8a3cd21ac4710be2401a5c054016414128a Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 31 Jul 2019 22:54:22 -0400 Subject: [PATCH 19/42] fixed local rank to only be in models/args --- models/char_cnn/args.py | 1 - models/han/args.py | 1 - models/kim_cnn/args.py | 1 - models/reg_lstm/args.py | 1 - 4 files changed, 4 deletions(-) diff --git a/models/char_cnn/args.py b/models/char_cnn/args.py index 9d077a9..62c517f 100644 --- a/models/char_cnn/args.py +++ b/models/char_cnn/args.py @@ -13,7 +13,6 @@ def get_args(): parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--epoch-decay', type=int, default=15) parser.add_argument('--weight-decay', type=float, default=0) - parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'hedwig-data', 'embeddings', 'word2vec')) parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') diff --git a/models/han/args.py b/models/han/args.py index 5e6c2b2..c803f71 100644 --- a/models/han/args.py +++ b/models/han/args.py @@ -16,7 +16,6 @@ def get_args(): parser.add_argument('--weight-decay', type=float, default=0) parser.add_argument('--word-num-hidden', type=int, default=50) 
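
The motivation behind patches 18-19: --local-rank moves into the shared parser in models/args.py, and each per-model args.py builds on that parser via models.args.get_args(), so the per-model copies have to go; argparse refuses to register the same option string twice on one parser. A short demonstration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--local-rank', type=int, default=-1)
    try:
        parser.add_argument('--local-rank', type=int, default=-1)  # the duplicate patch 19 removes
    except argparse.ArgumentError as err:
        print(err)  # conflicting option string: --local-rank
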
parser.add_argument('--sentence-num-hidden', type=int, default=50) - parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'hedwig-data', 'embeddings', 'word2vec')) parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') diff --git a/models/kim_cnn/args.py b/models/kim_cnn/args.py index 0d5a7ae..10094f8 100644 --- a/models/kim_cnn/args.py +++ b/models/kim_cnn/args.py @@ -14,7 +14,6 @@ def get_args(): parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--epoch-decay', type=int, default=15) parser.add_argument('--weight-decay', type=float, default=0) - parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--word-vectors-dir', default=os.path.join(os.pardir, 'hedwig-data', 'embeddings', 'word2vec')) parser.add_argument('--word-vectors-file', default='GoogleNews-vectors-negative300.txt') diff --git a/models/reg_lstm/args.py b/models/reg_lstm/args.py index d0302b8..30761ae 100644 --- a/models/reg_lstm/args.py +++ b/models/reg_lstm/args.py @@ -16,7 +16,6 @@ def get_args(): parser.add_argument('--embed-dim', type=int, default=300) parser.add_argument('--epoch-decay', type=int, default=15) parser.add_argument('--weight-decay', type=float, default=0) - parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--wdrop', type=float, default=0.0, help="weight drop") From 11e6d00dcc4ddc38dc2123f0dd7d903dd2fedb77 Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 31 Jul 2019 23:16:19 -0400 Subject: [PATCH 20/42] attempt at char_cnn file not found fix --- models/char_cnn/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py index 928ff85..26d2f06 100644 --- a/models/char_cnn/__main__.py +++ b/models/char_cnn/__main__.py @@ -168,8 +168,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si model = torch.load(args.trained_model, map_location=lambda storage, location: storage) # Calculate dev and test metrics - if hasattr(trainer, 'snapshot_path'): - model = torch.load(trainer.snapshot_path) + #if hasattr(trainer, 'snapshot_path'): + # model = torch.load(trainer.snapshot_path) evaluate_dataset('dev', dataset_map[args.dataset], model, None, dev_iter, args.batch_size, is_multilabel=dataset_class.IS_MULTILABEL, From 2f34d5ccebc313673e44b195d55d07a8e88932ac Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 20:35:55 -0400 Subject: [PATCH 21/42] char-cnn fix --- models/char_cnn/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py index 26d2f06..928ff85 100644 --- a/models/char_cnn/__main__.py +++ b/models/char_cnn/__main__.py @@ -168,8 +168,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si model = torch.load(args.trained_model, map_location=lambda storage, location: storage) # Calculate dev and test metrics - #if hasattr(trainer, 'snapshot_path'): - # model = torch.load(trainer.snapshot_path) + if hasattr(trainer, 'snapshot_path'): + model = torch.load(trainer.snapshot_path) evaluate_dataset('dev', dataset_map[args.dataset], model, None, dev_iter, args.batch_size, 
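
Looking back at patch 17: WeightDrop applies DropConnect to the recurrent weights on every forward pass, and the fix re-wraps the dropped tensor in nn.Parameter so PyTorch accepts it as an RNN weight again. A minimal sketch of the idea outside the WeightDrop class (this mirrors the patch, not an upstream API):

    import torch.nn as nn
    import torch.nn.functional as F

    lstm = nn.LSTM(8, 8)
    raw_w = lstm.weight_hh_l0.data.clone()     # keep the pristine weights around

    # DropConnect: zero random hidden-to-hidden connections, then re-register
    dropped = F.dropout(raw_w, p=0.5, training=True)
    lstm.weight_hh_l0 = nn.Parameter(dropped)  # the nn.Parameter wrapper is the patch-17 fix
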
is_multilabel=dataset_class.IS_MULTILABEL, From eb51e2bc8527254b63faa41939bb060aeef05c3c Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 23:01:04 -0400 Subject: [PATCH 22/42] char_cnn fix --- common/trainers/classification_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/trainers/classification_trainer.py b/common/trainers/classification_trainer.py index 0de3d51..ea08f42 100644 --- a/common/trainers/classification_trainer.py +++ b/common/trainers/classification_trainer.py @@ -101,6 +101,7 @@ def train(self, epochs): self.iters_not_improved = 0 self.best_dev_f1 = dev_f1 torch.save(self.model, self.snapshot_path) + print(self.snapshot_path) else: self.iters_not_improved += 1 if self.iters_not_improved >= self.patience: From 42c596bfe15ce554afee686f8c1092b87432118a Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 23:11:53 -0400 Subject: [PATCH 23/42] char_cnn fix --- common/trainers/classification_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/trainers/classification_trainer.py b/common/trainers/classification_trainer.py index ea08f42..e4b7941 100644 --- a/common/trainers/classification_trainer.py +++ b/common/trainers/classification_trainer.py @@ -104,6 +104,7 @@ def train(self, epochs): print(self.snapshot_path) else: self.iters_not_improved += 1 + print('not improved') if self.iters_not_improved >= self.patience: self.early_stop = True print("Early Stopping. Epoch: {}, Best Dev F1: {}".format(epoch, self.best_dev_f1)) From b8215c7bb58c5c60f02bc1a6d4ff63240f819c89 Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 23:16:34 -0400 Subject: [PATCH 24/42] char_cnn fix --- common/trainers/classification_trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/trainers/classification_trainer.py b/common/trainers/classification_trainer.py index e4b7941..55b3ca5 100644 --- a/common/trainers/classification_trainer.py +++ b/common/trainers/classification_trainer.py @@ -101,10 +101,9 @@ def train(self, epochs): self.iters_not_improved = 0 self.best_dev_f1 = dev_f1 torch.save(self.model, self.snapshot_path) - print(self.snapshot_path) else: self.iters_not_improved += 1 - print('not improved') + torch.save(self.model, self.snapshot_path) if self.iters_not_improved >= self.patience: self.early_stop = True print("Early Stopping. 
Epoch: {}, Best Dev F1: {}".format(epoch, self.best_dev_f1)) From aac329ac37f2c3c3d199aa137a93514741e5bf3f Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 23:47:15 -0400 Subject: [PATCH 25/42] LSTM fix --- models/reg_lstm/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/reg_lstm/model.py b/models/reg_lstm/model.py index ac955f6..197992e 100644 --- a/models/reg_lstm/model.py +++ b/models/reg_lstm/model.py @@ -84,7 +84,7 @@ def forward(self, x, lengths=None): x = self.dropout(x) if self.has_bottleneck_layer: x = F.relu(self.fc1(x)) - # x = self.dropout(x) + x = self.dropout(x) if self.tar or self.ar: return self.fc2(x), rnn_outs.permute(1,0,2) return self.fc2(x) From ad2b8c9c422884d63d65e9d55ebe3d63d0b6c26d Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 23:51:22 -0400 Subject: [PATCH 26/42] LSTM fix --- models/reg_lstm/model.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/models/reg_lstm/model.py b/models/reg_lstm/model.py index 197992e..8752374 100644 --- a/models/reg_lstm/model.py +++ b/models/reg_lstm/model.py @@ -84,7 +84,7 @@ def forward(self, x, lengths=None): x = self.dropout(x) if self.has_bottleneck_layer: x = F.relu(self.fc1(x)) - x = self.dropout(x) + # x = self.dropout(x) if self.tar or self.ar: return self.fc2(x), rnn_outs.permute(1,0,2) return self.fc2(x) @@ -94,9 +94,10 @@ def forward(self, x, lengths=None): return self.fc1(x) def update_ema(self): - self.steps_ema += 1 - for p, avg_p in zip(self.parameters(), self.avg_param): - avg_p.mul_(self.beta_ema).add_((1-self.beta_ema)*p.data) + # self.steps_ema += 1 + # for p, avg_p in zip(self.parameters(), self.avg_param): + # avg_p.mul_(self.beta_ema).add_((1-self.beta_ema)*p.data) + pass def load_ema_params(self): for p, avg_p in zip(self.parameters(), self.avg_param): From 9dba709f446baac2d1c1a3cc78d44ac19b6f8d9d Mon Sep 17 00:00:00 2001 From: Jesse Date: Thu, 1 Aug 2019 23:53:16 -0400 Subject: [PATCH 27/42] LSTM fix --- models/reg_lstm/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/models/reg_lstm/model.py b/models/reg_lstm/model.py index 8752374..a9c4ce0 100644 --- a/models/reg_lstm/model.py +++ b/models/reg_lstm/model.py @@ -100,8 +100,9 @@ def update_ema(self): pass def load_ema_params(self): - for p, avg_p in zip(self.parameters(), self.avg_param): - p.data.copy_(avg_p/(1-self.beta_ema**self.steps_ema)) + # for p, avg_p in zip(self.parameters(), self.avg_param): + # p.data.copy_(avg_p/(1-self.beta_ema**self.steps_ema)) + pass def load_params(self, params): for p,avg_p in zip(self.parameters(), params): From fd8276f6da0823347fe369cb4e910a5987fd6af3 Mon Sep 17 00:00:00 2001 From: Jesse Date: Fri, 2 Aug 2019 00:01:58 -0400 Subject: [PATCH 28/42] added Lyrics arg --- models/char_cnn/args.py | 3 ++- models/han/args.py | 3 ++- models/kim_cnn/args.py | 3 ++- models/reg_lstm/args.py | 3 ++- models/xml_cnn/args.py | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/models/char_cnn/args.py b/models/char_cnn/args.py index 62c517f..a2f244a 100644 --- a/models/char_cnn/args.py +++ b/models/char_cnn/args.py @@ -6,7 +6,8 @@ def get_args(): parser = models.args.get_args() - parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014', + 'Lyrics']) parser.add_argument('--num-conv-filters', type=int, default=256) 
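
Patches 28-31 register the new dataset everywhere hedwig looks for it: the same 'Lyrics' key has to appear in each model's argparse choices, in the per-model dataset_map, and in the trainer and evaluator factories. A compressed sketch of that registry pattern (the classes here are stand-ins for the real ones):

    class Lyrics: pass                  # stand-in for datasets.lyrics.Lyrics
    class ClassificationTrainer: pass   # stand-in for the real trainer

    DATASET_MAP = {'Lyrics': Lyrics}
    TRAINER_MAP = {'Lyrics': ClassificationTrainer}

    def lookup(name):
        if name not in DATASET_MAP:
            raise ValueError('Unrecognized dataset')  # mirrors the __main__ guard
        return DATASET_MAP[name], TRAINER_MAP[name]

    print(lookup('Lyrics'))
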
parser.add_argument('--num-affine-neurons', type=int, default=1024) parser.add_argument('--output-channel', type=int, default=256) diff --git a/models/han/args.py b/models/han/args.py index c803f71..7749eff 100644 --- a/models/han/args.py +++ b/models/han/args.py @@ -7,7 +7,8 @@ def get_args(): parser = models.args.get_args() parser.add_argument('--mode', type=str, default='static', choices=['rand', 'static', 'non-static']) - parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014', + 'Lyrics']) parser.add_argument('--output-channel', type=int, default=100) parser.add_argument('--words-dim', type=int, default=300) parser.add_argument('--embed-dim', type=int, default=300) diff --git a/models/kim_cnn/args.py b/models/kim_cnn/args.py index 10094f8..be9c9a9 100644 --- a/models/kim_cnn/args.py +++ b/models/kim_cnn/args.py @@ -6,7 +6,8 @@ def get_args(): parser = models.args.get_args() - parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014', + 'Lyrics']) parser.add_argument('--mode', type=str, default='multichannel', choices=['rand', 'static', 'non-static', 'multichannel']) parser.add_argument('--output-channel', type=int, default=100) parser.add_argument('--words-dim', type=int, default=300) diff --git a/models/reg_lstm/args.py b/models/reg_lstm/args.py index 30761ae..4eb7bde 100644 --- a/models/reg_lstm/args.py +++ b/models/reg_lstm/args.py @@ -11,7 +11,8 @@ def get_args(): parser.add_argument('--num-layers', type=int, default=2) parser.add_argument('--hidden-dim', type=int, default=256) parser.add_argument('--mode', type=str, default='static', choices=['rand', 'static', 'non-static']) - parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014', + 'Lyrics']) parser.add_argument('--words-dim', type=int, default=300) parser.add_argument('--embed-dim', type=int, default=300) parser.add_argument('--epoch-decay', type=int, default=15) diff --git a/models/xml_cnn/args.py b/models/xml_cnn/args.py index 5d269ed..4745d39 100644 --- a/models/xml_cnn/args.py +++ b/models/xml_cnn/args.py @@ -6,7 +6,8 @@ def get_args(): parser = models.args.get_args() parser.add_argument('--mode', type=str, default='multichannel', choices=['rand', 'static', 'non-static', 'multichannel']) - parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014']) + parser.add_argument('--dataset', type=str, default='Reuters', choices=['Reuters', 'AAPD', 'IMDB', 'Yelp2014', + 'Lyrics']) parser.add_argument('--dev-every', type=int, default=30) parser.add_argument('--output-channel', type=int, default=100) parser.add_argument('--words-dim', type=int, default=300) From 51cd6a2a0edfe6a83a5e047dfe2089f0eeceb377 Mon Sep 17 00:00:00 2001 From: Jesse Date: Fri, 2 Aug 2019 00:06:59 -0400 Subject: [PATCH 29/42] added lyrics dataset to all __main__ files --- models/char_cnn/__main__.py | 4 +++- models/han/__main__.py | 4 +++- models/kim_cnn/__main__.py | 4 +++- models/reg_lstm/__main__.py | 4 +++- models/xml_cnn/__main__.py | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git 
a/models/char_cnn/__main__.py b/models/char_cnn/__main__.py index 928ff85..e9fad4d 100644 --- a/models/char_cnn/__main__.py +++ b/models/char_cnn/__main__.py @@ -12,6 +12,7 @@ from datasets.imdb import IMDBCharQuantized as IMDB from datasets.reuters import ReutersCharQuantized as Reuters from datasets.yelp2014 import Yelp2014CharQuantized as Yelp2014 +from datasets.lyrics import LyricsCharQuantized as Lyrics from models.char_cnn.args import get_args from models.char_cnn.model import CharCNN @@ -87,7 +88,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, - 'Yelp2014': Yelp2014 + 'Yelp2014': Yelp2014, + 'Lyrics': Lyrics } args.device = device diff --git a/models/han/__main__.py b/models/han/__main__.py index 42bb48b..78f5519 100644 --- a/models/han/__main__.py +++ b/models/han/__main__.py @@ -13,6 +13,7 @@ from datasets.imdb import IMDBHierarchical as IMDB from datasets.reuters import ReutersHierarchical as Reuters from datasets.yelp2014 import Yelp2014Hierarchical as Yelp2014 +from datasets.lyrics import LyricsHierarchical as Lyrics from models.han.args import get_args from models.han.model import HAN @@ -105,7 +106,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, - 'Yelp2014': Yelp2014 + 'Yelp2014': Yelp2014, + 'Lyrics': Lyrics } args.device = device diff --git a/models/kim_cnn/__main__.py b/models/kim_cnn/__main__.py index 5a69dcf..606246f 100644 --- a/models/kim_cnn/__main__.py +++ b/models/kim_cnn/__main__.py @@ -13,6 +13,7 @@ from datasets.imdb import IMDB from datasets.reuters import Reuters from datasets.yelp2014 import Yelp2014 +from datasets.lyrics import Lyrics from models.kim_cnn.args import get_args from models.kim_cnn.model import KimCNN @@ -86,7 +87,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, - 'Yelp2014': Yelp2014 + 'Yelp2014': Yelp2014, + 'Lyrics': Lyrics } args.device = device diff --git a/models/reg_lstm/__main__.py b/models/reg_lstm/__main__.py index e81cfbb..af8607e 100644 --- a/models/reg_lstm/__main__.py +++ b/models/reg_lstm/__main__.py @@ -12,6 +12,7 @@ from datasets.imdb import IMDB from datasets.reuters import Reuters from datasets.yelp2014 import Yelp2014 +from datasets.lyrics import Lyrics from models.reg_lstm.args import get_args from models.reg_lstm.model import RegLSTM @@ -85,7 +86,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, - 'Yelp2014': Yelp2014 + 'Yelp2014': Yelp2014, + 'Lyrics': Lyrics } args.device = device diff --git a/models/xml_cnn/__main__.py b/models/xml_cnn/__main__.py index 67b07c4..7735819 100644 --- a/models/xml_cnn/__main__.py +++ b/models/xml_cnn/__main__.py @@ -13,6 +13,7 @@ from datasets.imdb import IMDB from datasets.reuters import Reuters from datasets.yelp2014 import Yelp2014 +from datasets.lyrics import Lyrics from models.xml_cnn.args import get_args from models.xml_cnn.model import XmlCNN @@ -86,7 +87,8 @@ def evaluate_dataset(split_name, dataset_cls, model, embedding, loader, batch_si 'Reuters': Reuters, 'AAPD': AAPD, 'IMDB': IMDB, - 'Yelp2014': Yelp2014 + 'Yelp2014': Yelp2014, + 'Lyrics': Lyrics } args.device = device From ed7882a0e37c575180948529d176af1b1f68c64f Mon Sep 17 00:00:00 2001 From: Jesse Date: Fri, 2 Aug 2019 00:26:22 -0400 Subject: [PATCH 30/42] added lyrics to evaluators --- 
common/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/evaluate.py b/common/evaluate.py index a4ad6d8..0f4f3c5 100644 --- a/common/evaluate.py +++ b/common/evaluate.py @@ -11,6 +11,7 @@ class EvaluatorFactory(object): 'AAPD': ClassificationEvaluator, 'IMDB': ClassificationEvaluator, 'Yelp2014': ClassificationEvaluator, + 'Lyrics': ClassificationEvaluator, 'Robust04': RelevanceTransferEvaluator, 'Robust05': RelevanceTransferEvaluator, 'Robust45': RelevanceTransferEvaluator From 822178879742d988c9129fbd3131614b1ea077d2 Mon Sep 17 00:00:00 2001 From: Jesse Date: Fri, 2 Aug 2019 00:31:47 -0400 Subject: [PATCH 31/42] added lyrics to train --- common/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/train.py b/common/train.py index b53c8f7..a3ed2be 100644 --- a/common/train.py +++ b/common/train.py @@ -11,6 +11,7 @@ class TrainerFactory(object): 'AAPD': ClassificationTrainer, 'IMDB': ClassificationTrainer, 'Yelp2014': ClassificationTrainer, + 'Lyrics': ClassificationTrainer, 'Robust04': RelevanceTransferTrainer, 'Robust05': RelevanceTransferTrainer, 'Robust45': RelevanceTransferTrainer, From 587f7050d8d1153dacf0baad6018b064ad09270a Mon Sep 17 00:00:00 2001 From: Jesse Date: Sat, 3 Aug 2019 13:52:03 -0400 Subject: [PATCH 32/42] Update args.py removed local rank argument --- models/bert/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/bert/args.py b/models/bert/args.py index 35df3c7..f405728 100644 --- a/models/bert/args.py +++ b/models/bert/args.py @@ -12,7 +12,7 @@ def get_args(): parser.add_argument('--save-path', type=str, default=os.path.join('model_checkpoints', 'bert')) parser.add_argument('--cache-dir', default='cache', type=str) parser.add_argument('--trained-model', default=None, type=str) - parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') + #parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training') parser.add_argument('--fp16', action='store_true', help='use 16-bit floating point precision') parser.add_argument('--max-seq-length', From d41615fb282fd94cb2bd28bb4639f9dd5b7d08fe Mon Sep 17 00:00:00 2001 From: Jesse Date: Sat, 3 Aug 2019 15:28:56 -0400 Subject: [PATCH 33/42] fix for num_classes for bert --- datasets/bert_processors/lyrics_processor.py | 13 ++++++++++--- models/bert/__main__.py | 10 ++++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py index af4bc4b..65d944d 100644 --- a/datasets/bert_processors/lyrics_processor.py +++ b/datasets/bert_processors/lyrics_processor.py @@ -4,9 +4,16 @@ class LyricsProcessor(BertProcessor): - NAME = 'Lyrics' - NUM_CLASSES = 10 # Number of genre; len(df['genre'].unique()) = 10 - IS_MULTILABEL = True + def __init__(self): + self.NAME = 'Lyrics' + + def set_num_classes_(self, data_dir): + with open(os.path.join(data_dir, 'Lyrics', 'train.tsv'), 'r') as f: + l1 = f.readline().split('\t') + + # from one-hot class vector + self.NUM_CLASSES = len(l1[0]) + self.IS_MULTILABEL = self.NUM_CLASSES > 2 def get_train_examples(self, data_dir): return self._create_examples( diff --git a/models/bert/__main__.py b/models/bert/__main__.py index dd74edd..75f9271 100644 --- a/models/bert/__main__.py +++ b/models/bert/__main__.py @@ -80,17 +80,19 @@ def evaluate_split(model, processor, args, split='dev'): if args.dataset not in dataset_map: raise ValueError('Unrecognized dataset') + processor = 
From bdaec319348970b5570e38c2b0696b88a5ff542e Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 15:31:22 -0400
Subject: [PATCH 34/42] removed local-rank argument from bert.args

---
 models/bert/args.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/models/bert/args.py b/models/bert/args.py
index 35df3c7..d159d64 100644
--- a/models/bert/args.py
+++ b/models/bert/args.py
@@ -12,7 +12,6 @@ def get_args():
     parser.add_argument('--save-path', type=str, default=os.path.join('model_checkpoints', 'bert'))
     parser.add_argument('--cache-dir', default='cache', type=str)
     parser.add_argument('--trained-model', default=None, type=str)
-    parser.add_argument('--local-rank', type=int, default=-1, help='local rank for distributed training')
     parser.add_argument('--fp16', action='store_true', help='use 16-bit floating point precision')
     parser.add_argument('--max-seq-length',

From ef4d618b19dedf4052a7a13c246e5328cedd08d4 Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 15:42:03 -0400
Subject: [PATCH 35/42] monitoring class number and multilabel

---
 models/bert/__main__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/models/bert/__main__.py b/models/bert/__main__.py
index 75f9271..09b65b8 100644
--- a/models/bert/__main__.py
+++ b/models/bert/__main__.py
@@ -87,7 +87,9 @@ def evaluate_split(model, processor, args, split='dev'):
     args.device = device
     args.n_gpu = n_gpu
     args.num_labels = processor.NUM_CLASSES
+    print(args.num_labels)
     args.is_multilabel = processor.IS_MULTILABEL
+    print(args.is_multilabel)

     if not args.trained_model:
         save_path = os.path.join(args.save_path, processor.NAME)
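[Editor's note] The two print() calls added above are throwaway instrumentation; [PATCH 41/42] below deletes them again. A sketch of a less disposable alternative using Python's standard logging module — hypothetical here, since the repo's own get_logger() helper may configure handlers differently:

    import logging

    logging.basicConfig(level=logging.INFO, format='%(name)s: %(message)s')
    logger = logging.getLogger('bert')

    num_labels, is_multilabel = 10, True  # stand-ins for args.num_labels / args.is_multilabel
    logger.info('num_labels=%d, is_multilabel=%s', num_labels, is_multilabel)
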
From 8ee8bb250363174d135090b5689deefdc1a5d644 Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 15:44:03 -0400
Subject: [PATCH 36/42] test

---
 datasets/lyrics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/lyrics.py b/datasets/lyrics.py
index 02987c8..c17fc22 100644
--- a/datasets/lyrics.py
+++ b/datasets/lyrics.py
@@ -42,8 +42,8 @@ def process_labels(string):


 class Lyrics(TabularDataset):
     NAME = 'Lyrics'
-    NUM_CLASSES = 10
-    IS_MULTILABEL = True
+    NUM_CLASSES = 2 #10
+    IS_MULTILABEL = False #True
     TEXT_FIELD = Field(batch_first=True, tokenize=clean_string, include_lengths=True)
     LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True, preprocessing=process_labels)

From b3837c6276273a87549728dda959dc87ed62b42e Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 16:03:50 -0400
Subject: [PATCH 37/42] fixed sst_processor code

---
 datasets/bert_processors/sst_processor.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/datasets/bert_processors/sst_processor.py b/datasets/bert_processors/sst_processor.py
index 01c6079..35231c9 100644
--- a/datasets/bert_processors/sst_processor.py
+++ b/datasets/bert_processors/sst_processor.py
@@ -4,9 +4,16 @@


 class SST2Processor(BertProcessor):
-    NAME = 'SST-2'
-    NUM_CLASSES = 2
-    IS_MULTILABEL = False
+    def __init__(self):
+        self.NAME = 'SST-2'
+
+    def set_num_classes_(self, data_dir):
+        with open(os.path.join(data_dir, 'SST-2', 'train.tsv'), 'r') as f:
+            l1 = f.readline().split('\t')
+
+        # from one-hot class vector
+        self.NUM_CLASSES = len(l1[0])
+        self.IS_MULTILABEL = self.NUM_CLASSES > 2

     def get_train_examples(self, data_dir):
         return self._create_examples(

From 3b53a5bd0405efa1d1538fb1645062d58acd0a87 Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 16:16:58 -0400
Subject: [PATCH 38/42] multilabel true for testing

---
 datasets/bert_processors/lyrics_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py
index 65d944d..f25439d 100644
--- a/datasets/bert_processors/lyrics_processor.py
+++ b/datasets/bert_processors/lyrics_processor.py
@@ -13,7 +13,7 @@ def set_num_classes_(self, data_dir):

         # from one-hot class vector
         self.NUM_CLASSES = len(l1[0])
-        self.IS_MULTILABEL = self.NUM_CLASSES > 2
+        self.IS_MULTILABEL = True #self.NUM_CLASSES > 2

     def get_train_examples(self, data_dir):
         return self._create_examples(

From 46f60d82d52e69e8ccf0ce6071b65e4f673292e0 Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 16:35:21 -0400
Subject: [PATCH 39/42] fixed evaluation metrics for 2 class problem

---
 common/evaluators/bert_evaluator.py          | 13 ++++++++++---
 datasets/bert_processors/lyrics_processor.py |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/common/evaluators/bert_evaluator.py b/common/evaluators/bert_evaluator.py
index 88c4660..7947731 100644
--- a/common/evaluators/bert_evaluator.py
+++ b/common/evaluators/bert_evaluator.py
@@ -86,11 +86,18 @@ def get_scores(self, silent=False):
             nb_eval_examples += input_ids.size(0)
             nb_eval_steps += 1

+        if self.args.is_multilabel:
+            score_method = 'micro'
+            pos_label = None
+        else:
+            score_method = 'binary'
+            pos_label = '01'
+
         predicted_labels, target_labels = np.array(predicted_labels), np.array(target_labels)
         accuracy = metrics.accuracy_score(target_labels, predicted_labels)
-        precision = metrics.precision_score(target_labels, predicted_labels, average='micro')
-        recall = metrics.recall_score(target_labels, predicted_labels, average='micro')
-        f1 = metrics.f1_score(target_labels, predicted_labels, average='micro')
+        precision = metrics.precision_score(target_labels, predicted_labels, average=score_method, pos_label=pos_label)
+        recall = metrics.recall_score(target_labels, predicted_labels, average=score_method, pos_label=pos_label)
+        f1 = metrics.f1_score(target_labels, predicted_labels, average=score_method, pos_label=pos_label)
         avg_loss = total_loss / nb_eval_steps

         return [accuracy, precision, recall, f1, avg_loss], ['accuracy', 'precision', 'recall', 'f1', 'avg_loss']

diff --git a/datasets/bert_processors/lyrics_processor.py b/datasets/bert_processors/lyrics_processor.py
index f25439d..65d944d 100644
--- a/datasets/bert_processors/lyrics_processor.py
+++ b/datasets/bert_processors/lyrics_processor.py
@@ -13,7 +13,7 @@ def set_num_classes_(self, data_dir):

         # from one-hot class vector
         self.NUM_CLASSES = len(l1[0])
-        self.IS_MULTILABEL = True #self.NUM_CLASSES > 2
+        self.IS_MULTILABEL = self.NUM_CLASSES > 2

     def get_train_examples(self, data_dir):
         return self._create_examples(
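[Editor's note] The averaging switch above matters because scikit-learn's micro averaging aggregates over every (sample, class) pair, so on a single-label problem micro precision, recall, and F1 all collapse to plain accuracy; average='binary' instead scores only the positive class selected by pos_label. A runnable sketch of the difference. Note that pos_label='01' as written in this patch names a label that never occurs in the integer targets — depending on the scikit-learn version this either raises a ValueError or scores the positive class as 0, which is what [PATCH 40/42] below corrects:

    import numpy as np
    from sklearn import metrics

    target = np.array([1, 0, 0, 0])
    predicted = np.array([1, 1, 0, 0])

    print(metrics.f1_score(target, predicted, average='micro'))               # 0.75 (== accuracy)
    print(metrics.f1_score(target, predicted, average='binary', pos_label=1)) # 0.666... (positive class only)
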
From 37e7ad25dab06548d782899d9861ebdbc5689a9f Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 16:38:16 -0400
Subject: [PATCH 40/42] changed pos_label to 1

---
 common/evaluators/bert_evaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/evaluators/bert_evaluator.py b/common/evaluators/bert_evaluator.py
index 7947731..fab803a 100644
--- a/common/evaluators/bert_evaluator.py
+++ b/common/evaluators/bert_evaluator.py
@@ -91,7 +91,7 @@ def get_scores(self, silent=False):
             pos_label = None
         else:
             score_method = 'binary'
-            pos_label = '01'
+            pos_label = 1

         predicted_labels, target_labels = np.array(predicted_labels), np.array(target_labels)
         accuracy = metrics.accuracy_score(target_labels, predicted_labels)

From 1ce9b9578f0bf951eeee8a7ca7fd37eaa041c5cd Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sat, 3 Aug 2019 16:43:56 -0400
Subject: [PATCH 41/42] removed testing print statements

---
 models/bert/__main__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/models/bert/__main__.py b/models/bert/__main__.py
index 09b65b8..75f9271 100644
--- a/models/bert/__main__.py
+++ b/models/bert/__main__.py
@@ -87,9 +87,7 @@ def evaluate_split(model, processor, args, split='dev'):
     args.device = device
     args.n_gpu = n_gpu
     args.num_labels = processor.NUM_CLASSES
-    print(args.num_labels)
     args.is_multilabel = processor.IS_MULTILABEL
-    print(args.is_multilabel)

     if not args.trained_model:
         save_path = os.path.join(args.save_path, processor.NAME)

From 8883909c7e76c9d0c84077beac728b992a977e7a Mon Sep 17 00:00:00 2001
From: Jesse
Date: Sun, 4 Aug 2019 16:27:07 -0400
Subject: [PATCH 42/42] infer class number from actual dataset

---
 datasets/bert_processors/reuters_processor.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/datasets/bert_processors/reuters_processor.py b/datasets/bert_processors/reuters_processor.py
index 8ad9b33..3711210 100644
--- a/datasets/bert_processors/reuters_processor.py
+++ b/datasets/bert_processors/reuters_processor.py
@@ -4,10 +4,17 @@


 class ReutersProcessor(BertProcessor):
-    NAME = 'Reuters'
-    NUM_CLASSES = 90
-    IS_MULTILABEL = True
-
+    def __init__(self):
+        self.NAME = 'Reuters'
+
+    def set_num_classes_(self, data_dir):
+        with open(os.path.join(data_dir, 'Reuters', 'train.tsv'), 'r') as f:
+            l1 = f.readline().split('\t')
+
+        # from one-hot class vector
+        self.NUM_CLASSES = len(l1[0])
+        self.IS_MULTILABEL = self.NUM_CLASSES > 2
+
     def get_train_examples(self, data_dir):
         return self._create_examples(
             self._read_tsv(os.path.join(data_dir, 'Reuters', 'train.tsv')), 'train')
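[Editor's note] After this final patch, the same set_num_classes_ body appears verbatim in LyricsProcessor, SST2Processor, and ReutersProcessor, differing only in the dataset directory name. A hypothetical follow-up refactor (not part of this series) would hoist it into the shared base class and key the path on each processor's NAME:

    import os

    class BertProcessor:  # simplified stand-in for the repo's real base class
        NAME = None

        def set_num_classes_(self, data_dir):
            with open(os.path.join(data_dir, self.NAME, 'train.tsv'), 'r') as f:
                first_row = f.readline().split('\t')
            self.NUM_CLASSES = len(first_row[0])      # width of the one-hot label string
            self.IS_MULTILABEL = self.NUM_CLASSES > 2

    class ReutersProcessor(BertProcessor):
        def __init__(self):
            self.NAME = 'Reuters'

Each subclass would then inherit the inference instead of duplicating it, and a dataset with a different label format could simply override the method.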