From 7fd54e87c6b4f607b0331ce7d85ffd12eb00b599 Mon Sep 17 00:00:00 2001
From: mrkarezina
Date: Mon, 18 May 2020 00:11:26 -0400
Subject: [PATCH 1/4] Update Reuters vocab size

---
 datasets/bow_processors/reuters_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/bow_processors/reuters_processor.py b/datasets/bow_processors/reuters_processor.py
index 59c901b..678deb7 100644
--- a/datasets/bow_processors/reuters_processor.py
+++ b/datasets/bow_processors/reuters_processor.py
@@ -6,7 +6,7 @@
 class ReutersProcessor(BagOfWordsProcessor):
     NAME = 'Reuters'
     NUM_CLASSES = 90
-    VOCAB_SIZE = 36308
+    VOCAB_SIZE = 36311
     IS_MULTILABEL = True
 
     def get_train_examples(self, data_dir):

From 22131e21334528ac96da87708d749577dd7fd769 Mon Sep 17 00:00:00 2001
From: mrkarezina
Date: Mon, 18 May 2020 00:11:55 -0400
Subject: [PATCH 2/4] Add hf-checkpoint args

---
 models/bert/__main__.py | 7 ++++---
 models/bert/args.py     | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/models/bert/__main__.py b/models/bert/__main__.py
index 144010c..bc35f7b 100644
--- a/models/bert/__main__.py
+++ b/models/bert/__main__.py
@@ -71,8 +71,9 @@ def evaluate_split(model, processor, tokenizer, args, split='dev'):
     args.is_hierarchical = False
     processor = dataset_map[args.dataset]()
 
-    pretrained_vocab_path = PRETRAINED_VOCAB_ARCHIVE_MAP[args.model]
-    tokenizer = BertTokenizer.from_pretrained(pretrained_vocab_path)
+    if not args.hf_checkpoint:
+        pretrained_vocab_path = PRETRAINED_VOCAB_ARCHIVE_MAP[args.model]
+    tokenizer = BertTokenizer.from_pretrained(args.model)
 
     train_examples = None
     num_train_optimization_steps = None
@@ -81,7 +82,7 @@ def evaluate_split(model, processor, tokenizer, args, split='dev'):
         num_train_optimization_steps = int(
             len(train_examples) / args.batch_size / args.gradient_accumulation_steps) * args.epochs
 
-    pretrained_model_path = args.model if os.path.isfile(args.model) else PRETRAINED_MODEL_ARCHIVE_MAP[args.model]
+    pretrained_model_path = args.model if os.path.isfile(args.model) or args.hf_checkpoint else PRETRAINED_MODEL_ARCHIVE_MAP[args.model]
     model = BertForSequenceClassification.from_pretrained(pretrained_model_path, num_labels=args.num_labels)
 
     if args.fp16:
diff --git a/models/bert/args.py b/models/bert/args.py
index 477249e..38f22aa 100644
--- a/models/bert/args.py
+++ b/models/bert/args.py
@@ -7,6 +7,7 @@ def get_args():
     parser = models.args.get_args()
     parser.add_argument('--model', default=None, type=str, required=True)
+    parser.add_argument('--hf-checkpoint', default=False, type=bool)
     parser.add_argument('--dataset', type=str, default='SST-2', choices=['SST-2', 'AGNews', 'Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
     parser.add_argument('--save-path', type=str, default=os.path.join('model_checkpoints', 'bert'))
     parser.add_argument('--cache-dir', default='cache', type=str)
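A note on the --hf-checkpoint flag added in patch 2/4 (and removed again in
patch 4/4): argparse's type=bool is a well-known pitfall, since bool() of any
non-empty string, including 'False', is True, so passing --hf-checkpoint False
would still enable the flag. Below is a minimal sketch of the more idiomatic
store_true form; the parser here is hypothetical and not part of this series:

    import argparse

    parser = argparse.ArgumentParser()
    # store_true makes --hf-checkpoint a real on/off switch: the attribute is
    # False when the flag is absent and True when it is present, with no
    # string-to-bool parsing involved.
    parser.add_argument('--hf-checkpoint', action='store_true')

    args = parser.parse_args(['--hf-checkpoint'])
    assert args.hf_checkpoint is True
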
From f3f665cac0f57de2922105d471bc453f99cac42a Mon Sep 17 00:00:00 2001
From: mrkarezina
Date: Sat, 23 May 2020 15:37:47 -0400
Subject: [PATCH 3/4] Calculate vocab size

---
 models/lr/__main__.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/models/lr/__main__.py b/models/lr/__main__.py
index eb46ff0..af133a8 100644
--- a/models/lr/__main__.py
+++ b/models/lr/__main__.py
@@ -58,7 +58,6 @@ def evaluate_split(model, vectorizer, processor, args, split='dev'):
     args.n_gpu = n_gpu
     args.num_labels = dataset_map[args.dataset].NUM_CLASSES
     args.is_multilabel = dataset_map[args.dataset].IS_MULTILABEL
-    args.vocab_size = min(args.max_vocab_size, dataset_map[args.dataset].VOCAB_SIZE)
 
     train_examples = None
     processor = dataset_map[args.dataset]()
@@ -71,6 +70,12 @@ def evaluate_split(model, vectorizer, processor, args, split='dev'):
     save_path = os.path.join(args.save_path, dataset_map[args.dataset].NAME)
     os.makedirs(save_path, exist_ok=True)
 
+    if train_examples:
+        train_features = vectorizer.fit_transform([x.text for x in train_examples])
+        dataset_map[args.dataset].VOCAB_SIZE = train_features.shape[1]
+
+    args.vocab_size = min(args.max_vocab_size, dataset_map[args.dataset].VOCAB_SIZE)
+
     model = LogisticRegression(args)
     model.to(device)
 

From 7a75012ab32a94cfbce67f1ee12ec0dbdadab9a1 Mon Sep 17 00:00:00 2001
From: mrkarezina
Date: Sat, 23 May 2020 16:29:49 -0400
Subject: [PATCH 4/4] Remove hf model zoo flag

---
 models/bert/__main__.py | 10 ++++------
 models/bert/args.py     |  1 -
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/models/bert/__main__.py b/models/bert/__main__.py
index bc35f7b..2414b49 100644
--- a/models/bert/__main__.py
+++ b/models/bert/__main__.py
@@ -71,9 +71,8 @@ def evaluate_split(model, processor, tokenizer, args, split='dev'):
     args.is_hierarchical = False
     processor = dataset_map[args.dataset]()
 
-    if not args.hf_checkpoint:
-        pretrained_vocab_path = PRETRAINED_VOCAB_ARCHIVE_MAP[args.model]
-    tokenizer = BertTokenizer.from_pretrained(args.model)
+    pretrained_vocab_path = PRETRAINED_VOCAB_ARCHIVE_MAP[args.model]
+    tokenizer = BertTokenizer.from_pretrained(pretrained_vocab_path)
 
     train_examples = None
     num_train_optimization_steps = None
@@ -82,7 +81,7 @@ def evaluate_split(model, processor, tokenizer, args, split='dev'):
         num_train_optimization_steps = int(
             len(train_examples) / args.batch_size / args.gradient_accumulation_steps) * args.epochs
 
-    pretrained_model_path = args.model if os.path.isfile(args.model) or args.hf_checkpoint else PRETRAINED_MODEL_ARCHIVE_MAP[args.model]
+    pretrained_model_path = args.model if os.path.isfile(args.model) else PRETRAINED_MODEL_ARCHIVE_MAP[args.model]
     model = BertForSequenceClassification.from_pretrained(pretrained_model_path, num_labels=args.num_labels)
 
     if args.fp16:
@@ -137,5 +136,4 @@ def evaluate_split(model, processor, tokenizer, args, split='dev'):
     model = model.to(device)
 
     evaluate_split(model, processor, tokenizer, args, split='dev')
-    evaluate_split(model, processor, tokenizer, args, split='test')
-
+    evaluate_split(model, processor, tokenizer, args, split='test')
\ No newline at end of file
diff --git a/models/bert/args.py b/models/bert/args.py
index 38f22aa..477249e 100644
--- a/models/bert/args.py
+++ b/models/bert/args.py
@@ -7,7 +7,6 @@ def get_args():
     parser = models.args.get_args()
     parser.add_argument('--model', default=None, type=str, required=True)
-    parser.add_argument('--hf-checkpoint', default=False, type=bool)
    parser.add_argument('--dataset', type=str, default='SST-2', choices=['SST-2', 'AGNews', 'Reuters', 'AAPD', 'IMDB', 'Yelp2014'])
     parser.add_argument('--save-path', type=str, default=os.path.join('model_checkpoints', 'bert'))
     parser.add_argument('--cache-dir', default='cache', type=str)
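Patch 3/4 derives the vocabulary size from the fitted bag-of-words matrix
(train_features.shape[1]) and keeps the hard-coded VOCAB_SIZE, updated in
patch 1/4, only as a fallback for runs that load no training examples. A
minimal sketch of the same idea, assuming a scikit-learn CountVectorizer and
using illustrative documents:

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ['grain exports rose sharply', 'crude oil prices fell']
    vectorizer = CountVectorizer()

    # fit_transform returns an (n_docs, n_terms) sparse matrix, so the column
    # count is the number of distinct terms learned from the corpus.
    train_features = vectorizer.fit_transform(docs)
    vocab_size = train_features.shape[1]

    assert vocab_size == len(vectorizer.vocabulary_)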