From 07e291b4ddada5bf635939ae338689c30d020eca Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Mon, 8 May 2023 17:33:37 +0900 Subject: [PATCH 1/9] add learning rate as parameter --- delft/applications/datasetTagger.py | 8 +++-- delft/applications/grobidTagger.py | 19 +++++++---- delft/applications/insultTagger.py | 10 +++--- delft/applications/nerTagger.py | 53 ++++++++++++++++++++--------- 4 files changed, 61 insertions(+), 29 deletions(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index 3b32177e..e7d86d09 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -280,6 +280,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") + parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") args = parser.parse_args() @@ -293,6 +294,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non transformer = args.transformer use_ELMo = args.use_ELMo patience = args.patience + learning_rate = args.learning_rate if transformer is None and embeddings_name is None: # default word embeddings @@ -307,7 +309,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non max_sequence_length=max_sequence_length, batch_size=batch_size, use_ELMo=use_ELMo, - patience=patience) + patience=patience, + learning_rate=learning_rate) if action == "eval": if args.fold_count is not None and args.fold_count > 1: @@ -329,7 +332,8 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non max_sequence_length=max_sequence_length, batch_size=batch_size, use_ELMo=use_ELMo, - patience=patience) + patience=patience, + learning_rate=learning_rate) if action == "tag": someTexts = [] diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 8fed17b9..6a221e6f 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -137,7 +137,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat # train a GROBID model with all available data def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1): + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=0.0001): print('Loading data...') if input_path == None: @@ -176,7 +176,8 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu use_ELMo=use_ELMo, multiprocessing=multiprocessing, early_stop=early_stop, - patience=patience) + patience=patience, + learning_rate=learning_rate) if incremental: if input_model_path != None: @@ -202,7 +203,7 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1): + use_ELMo=False, incremental=False, input_model_path=None, 
patience=-1, learning_rate=0.0001): print('Loading data...') if input_path is None: x_all, y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train') @@ -242,7 +243,8 @@ def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transfor use_ELMo=use_ELMo, multiprocessing=multiprocessing, early_stop=early_stop, - patience=patience) + patience=patience, + learning_rate=learning_rate) if incremental: if input_model_path != None: @@ -336,6 +338,7 @@ class Tasks: EVAL = 'eval' TAG = 'tag' + if __name__ == "__main__": parser = argparse.ArgumentParser(description = "Trainer for GROBID models using the DeLFT library") @@ -392,6 +395,7 @@ class Tasks: parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") + parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") @@ -410,6 +414,7 @@ class Tasks: use_ELMo = args.use_ELMo incremental = args.incremental patience = args.patience + learning_rate = args.learning_rate if transformer is None and embeddings_name is None: # default word embeddings @@ -427,7 +432,8 @@ class Tasks: use_ELMo=use_ELMo, incremental=incremental, input_model_path=input_model_path, - patience=patience) + patience=patience, + learning_rate=learning_rate) if action == Tasks.EVAL: if args.fold_count is not None and args.fold_count > 1: @@ -451,7 +457,8 @@ class Tasks: batch_size=batch_size, use_ELMo=use_ELMo, incremental=incremental, - input_model_path=input_model_path) + input_model_path=input_model_path, + learning_rate=learning_rate) if action == Tasks.TAG: someTexts = [] diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index 14691bc2..019b8265 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -21,7 +21,7 @@ def configure(architecture, embeddings_name): return batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name -def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False): +def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=0.0001): batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name) root = 'data/sequenceLabelling/toxic/' @@ -41,7 +41,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, us model = Sequence(model_name, max_epoch=max_epoch, batch_size=batch_size, max_sequence_length=maxlen, embeddings_name=embeddings_name, architecture=architecture, patience=patience, early_stop=early_stop, - transformer_name=transformer, use_ELMo=use_ELMo) + transformer_name=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate) model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid) print('training done') @@ -113,7 +113,8 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, "HuggingFace transformers hub will be used otherwise to fetch the model, see https://huggingface.co/models " + \ "for model names" ) - parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") + parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") + parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial 
learning rate") args = parser.parse_args() @@ -124,13 +125,14 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, architecture = args.architecture transformer = args.transformer use_ELMo = args.use_ELMo + learning_rate = args.learning_rate if transformer == None and embeddings_name == None: # default word embeddings embeddings_name = "glove-840B" if args.action == 'train': - train(embeddings_name=embeddings_name, architecture=architecture, transformer=transformer, use_ELMo=use_ELMo) + train(embeddings_name=embeddings_name, architecture=architecture, transformer=transformer, use_ELMo=use_ELMo, learning_rate=learning_rate) if args.action == 'tag': someTexts = ['This is a gentle test.', diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index f76d94b8..01774efa 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -67,7 +67,7 @@ def configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_s # train a model with all available for a given dataset def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF', - transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, batch_size=-1, patience=-1): + transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, batch_size=-1, patience=-1, learning_rate=0.0001): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience) @@ -102,7 +102,9 @@ def train(dataset_type='conll2003', lang='en', embeddings_name=None, architectur patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) + elif (dataset_type == 'conll2012') and (lang == 'en'): print('Loading Ontonotes 5.0 CoNLL-2012 NER data...') @@ -134,7 +136,8 @@ def train(dataset_type='conll2003', lang='en', embeddings_name=None, architectur patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) elif (lang == 'fr'): print('Loading data...') dataset_type = 'lemonde' @@ -159,7 +162,8 @@ def train(dataset_type='conll2003', lang='en', embeddings_name=None, architectur patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) else: print("dataset/language combination is not supported:", dataset_type, lang) return @@ -190,7 +194,8 @@ def train_eval(embeddings_name=None, use_ELMo=False, patience=-1, batch_size=-1, - max_sequence_length=-1): + max_sequence_length=-1, + learning_rate=0.0001): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, @@ -222,7 +227,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) else: # also use validation set to train (no early stop, hyperparmeters must be set preliminarly), # as (Chui & Nochols, 2016) and (Peters and al., 2017) @@ -240,7 
+246,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) elif (dataset_type == 'ontonotes-all') and (lang == 'en'): print("Loading all Ontonotes 5.0 XML data, evaluation will be on 10\% random partition") @@ -266,7 +273,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) elif (dataset_type == 'conll2012') and (lang == 'en'): print('Loading Ontonotes 5.0 CoNLL-2012 NER data...') @@ -294,7 +302,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) else: # also use validation set to train (no early stop, hyperparameters must be set preliminarly), # as (Chui & Nochols, 2016) and (Peters and al., 2017) @@ -312,7 +321,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) elif (lang == 'fr') and (dataset_type == 'ftb' or dataset_type is None): print('Loading data for ftb...') @@ -339,7 +349,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) elif (lang == 'fr') and (dataset_type == 'ftb_force_split'): print('Loading data for ftb_force_split...') x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/leMonde/ftb6_train.conll') @@ -367,7 +378,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) else: # also use validation set to train (no early stop, hyperparmeters must be set preliminarly), # as (Chui & Nochols, 2016) and (Peters and al., 2017) @@ -385,7 +397,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) elif (lang == 'fr') and (dataset_type == 'ftb_force_split_xml'): print('Loading data for ftb_force_split_xml...') x_train, y_train = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.train.xml') @@ -413,7 +426,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) else: # also use validation set to train (no early stop, hyperparmeters must be set preliminarly), # as (Chui & Nochols, 2016) and (Peters and al., 2017) @@ -431,7 +445,8 @@ def train_eval(embeddings_name=None, patience=patience, max_sequence_length=max_sequence_length, use_ELMo=use_ELMo, - multiprocessing=multiprocessing) + multiprocessing=multiprocessing, + learning_rate=learning_rate) else: print("dataset/language combination is not supported:", dataset_type, lang) return @@ -597,6 +612,7 @@ def annotate(output_format, 
parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") + parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") args = parser.parse_args() @@ -617,6 +633,7 @@ def annotate(output_format, patience = args.patience max_sequence_length = args.max_sequence_length batch_size = args.batch_size + learning_rate = args.learning_rate # name of embeddings refers to the file delft/resources-registry.json # be sure to use here the same name as in the registry ('glove-840B', 'fasttext-crawl', 'word2vec'), @@ -635,7 +652,8 @@ def annotate(output_format, use_ELMo=use_ELMo, max_sequence_length=max_sequence_length, batch_size=batch_size, - patience=patience + patience=patience, + learning_rate=learning_rate ) if action == 'train_eval': @@ -653,7 +671,8 @@ def annotate(output_format, use_ELMo=use_ELMo, max_sequence_length=max_sequence_length, batch_size=batch_size, - patience=patience + patience=patience, + learning_rate=learning_rate ) if action == 'eval': From 33d6436fe49c1581aed5d86ca5211d706b332bfd Mon Sep 17 00:00:00 2001 From: lfoppiano Date: Sat, 13 May 2023 08:38:30 +0900 Subject: [PATCH 2/9] print correct learning_rate and improve default value based on architecture --- delft/sequenceLabelling/trainer.py | 2 +- delft/sequenceLabelling/wrapper.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py index 88174be7..6c7b7f9e 100644 --- a/delft/sequenceLabelling/trainer.py +++ b/delft/sequenceLabelling/trainer.py @@ -67,7 +67,7 @@ def compile_model(self, local_model, train_size): if self.model_config.transformer_name is not None: # we use a transformer layer in the architecture optimizer, lr_schedule = create_optimizer( - init_lr=2e-5, + init_lr=self.training_config.learning_rate, num_train_steps=nb_train_steps, weight_decay_rate=0.01, num_warmup_steps=0.1*nb_train_steps, diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index f3f92264..2976b1d3 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -72,7 +72,7 @@ def __init__(self, recurrent_dropout=0.25, batch_size=20, optimizer='adam', - learning_rate=0.001, + learning_rate=None, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, @@ -113,6 +113,12 @@ def __init__(self, self.embeddings = None word_emb_size = 0 + if learning_rate is None: + if transformer_name is None: + learning_rate = 0.0001 + else: + learning_rate = 2e-5 + self.model_config = ModelConfig(model_name=model_name, architecture=architecture, embeddings_name=embeddings_name, From 185428b87cdaa8a36ea90ccbffba15e6b455a419 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 15 Jun 2023 08:25:42 +0900 Subject: [PATCH 3/9] Fix default learning rate from applications scripts --- delft/applications/datasetTagger.py | 14 +++++++++----- delft/applications/grobidTagger.py | 6 +++--- delft/applications/insultTagger.py | 4 ++-- delft/applications/nerTagger.py | 6 +++--- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/delft/applications/datasetTagger.py b/delft/applications/datasetTagger.py index e7d86d09..c414b62a 100644 --- a/delft/applications/datasetTagger.py +++ b/delft/applications/datasetTagger.py @@ -66,7 +66,9 @@ def configure(architecture, output_path=None, 
max_sequence_length=-1, batch_size # train a model with all available data def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, - features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1): + features_indices=None, max_sequence_length=-1, + batch_size=-1, max_epoch=-1, use_ELMo=False, patience=-1, + learning_rate=None): print('Loading data...') if input_path is None: x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = [] @@ -110,7 +112,8 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=use_ELMo, multiprocessing=multiprocessing, early_stop=early_stop, - patience=patience) + patience=patience, + learning_rate=learning_rate) start_time = time.time() model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid) @@ -129,7 +132,7 @@ def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, use_ELMo=False, - patience=-1): + patience=-1, learning_rate=None): print('Loading data...') if input_path is None: x_all1 = y_all1 = x_all2 = y_all2 = x_all3 = y_all3 = [] @@ -175,7 +178,8 @@ def train_eval(embeddings_name=None, architecture='BidLSTM_CRF', transformer=Non use_ELMo=use_ELMo, multiprocessing=multiprocessing, early_stop=early_stop, - patience=patience) + patience=patience, + learning_rate=learning_rate) start_time = time.time() @@ -280,7 +284,7 @@ def annotate_text(texts, output_format, architecture='BidLSTM_CRF', features=Non parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") - parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") + parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") args = parser.parse_args() diff --git a/delft/applications/grobidTagger.py b/delft/applications/grobidTagger.py index 6a221e6f..651a6bd7 100644 --- a/delft/applications/grobidTagger.py +++ b/delft/applications/grobidTagger.py @@ -137,7 +137,7 @@ def configure(model, architecture, output_path=None, max_sequence_length=-1, bat # train a GROBID model with all available data def train(model, embeddings_name=None, architecture=None, transformer=None, input_path=None, output_path=None, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=0.0001): + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None): print('Loading data...') if input_path == None: @@ -203,7 +203,7 @@ def train(model, embeddings_name=None, architecture=None, transformer=None, inpu def train_eval(model, embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, input_path=None, output_path=None, fold_count=1, features_indices=None, max_sequence_length=-1, batch_size=-1, max_epoch=-1, - use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=0.0001): + use_ELMo=False, incremental=False, input_model_path=None, patience=-1, learning_rate=None): print('Loading data...') if input_path is None: x_all, 
y_all, f_all = load_data_and_labels_crf_file('data/sequenceLabelling/grobid/'+model+'/'+model+'-060518.train') @@ -395,7 +395,7 @@ class Tasks: parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") - parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") + parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") diff --git a/delft/applications/insultTagger.py b/delft/applications/insultTagger.py index 019b8265..0d00a1b2 100644 --- a/delft/applications/insultTagger.py +++ b/delft/applications/insultTagger.py @@ -21,7 +21,7 @@ def configure(architecture, embeddings_name): return batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name -def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=0.0001): +def train(embeddings_name=None, architecture='BidLSTM_CRF', transformer=None, use_ELMo=False, learning_rate=None): batch_size, maxlen, patience, early_stop, max_epoch, embeddings_name = configure(architecture, embeddings_name) root = 'data/sequenceLabelling/toxic/' @@ -114,7 +114,7 @@ def annotate(texts, output_format, architecture='BidLSTM_CRF', transformer=None, "for model names" ) parser.add_argument("--use-ELMo", action="store_true", help="Use ELMo contextual embeddings") - parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") + parser.add_argument("--learning-rate", type=float, default=None, help="Initial learning rate") args = parser.parse_args() diff --git a/delft/applications/nerTagger.py b/delft/applications/nerTagger.py index 01774efa..36a4deab 100644 --- a/delft/applications/nerTagger.py +++ b/delft/applications/nerTagger.py @@ -67,7 +67,7 @@ def configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_s # train a model with all available for a given dataset def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF', - transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, batch_size=-1, patience=-1, learning_rate=0.0001): + transformer=None, data_path=None, use_ELMo=False, max_sequence_length=-1, batch_size=-1, patience=-1, learning_rate=None): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, max_sequence_length, batch_size, patience) @@ -195,7 +195,7 @@ def train_eval(embeddings_name=None, patience=-1, batch_size=-1, max_sequence_length=-1, - learning_rate=0.0001): + learning_rate=None): batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch, embeddings_name, word_lstm_units, multiprocessing = \ configure(architecture, dataset_type, lang, embeddings_name, use_ELMo, @@ -612,7 +612,7 @@ def annotate(output_format, parser.add_argument("--batch-size", type=int, default=-1, help="batch-size parameter to be used.") parser.add_argument("--patience", type=int, default=-1, help="patience, number of extra epochs to perform after " "the best epoch before stopping a training.") - parser.add_argument("--learning-rate", type=float, default=0.0001, help="Initial learning rate") + parser.add_argument("--learning-rate", type=float, default=None, 
help="Initial learning rate") args = parser.parse_args() From 22c2ca531a859b4a294a469e3e357d3ce78ee866 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 15 Jun 2023 08:28:45 +0900 Subject: [PATCH 4/9] Remove hardcoded learning rates --- delft/sequenceLabelling/config.py | 2 +- delft/textClassification/models.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/delft/sequenceLabelling/config.py b/delft/sequenceLabelling/config.py index ec84e8fd..92a28e6f 100644 --- a/delft/sequenceLabelling/config.py +++ b/delft/sequenceLabelling/config.py @@ -84,7 +84,7 @@ class TrainingConfig(object): def __init__(self, batch_size=20, optimizer='adam', - learning_rate=0.001, + learning_rate=None, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, diff --git a/delft/textClassification/models.py b/delft/textClassification/models.py index e5a4c308..09be91f8 100644 --- a/delft/textClassification/models.py +++ b/delft/textClassification/models.py @@ -847,9 +847,9 @@ def __init__(self, model_config, training_config, load_pretrained_weights=True, def compile(self, train_size): #optimizer = Adam(learning_rate=2e-5, clipnorm=1) optimizer, lr_schedule = create_optimizer( - init_lr=2e-5, + init_lr=self.training_config.learning_rate, num_train_steps=train_size, weight_decay_rate=0.01, - num_warmup_steps=0.1*train_size, + num_warmup_steps=0.1 * train_size, ) self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=["accuracy"]) From 42207e2ed79af51cab2c97008d6643c37c8f3f43 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 15 Jun 2023 08:32:39 +0900 Subject: [PATCH 5/9] Fix typo --- delft/sequenceLabelling/trainer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py index 6c7b7f9e..b12164ba 100644 --- a/delft/sequenceLabelling/trainer.py +++ b/delft/sequenceLabelling/trainer.py @@ -70,7 +70,7 @@ def compile_model(self, local_model, train_size): init_lr=self.training_config.learning_rate, num_train_steps=nb_train_steps, weight_decay_rate=0.01, - num_warmup_steps=0.1*nb_train_steps, + num_warmup_steps=0.1 * nb_train_steps, ) if local_model.config.use_chain_crf: @@ -139,7 +139,7 @@ def train_model(self, local_model, x_train, y_train, f_train=None, output_input_offsets=True, use_chain_crf=self.model_config.use_chain_crf) _callbacks = get_callbacks(log_dir=self.checkpoint_path, - eary_stopping=True, + early_stopping=True, patience=self.training_config.patience, valid=(validation_generator, self.preprocessor), use_crf=self.model_config.use_crf, use_chain_crf=self.model_config.use_chain_crf) @@ -159,7 +159,7 @@ def train_model(self, local_model, x_train, y_train, f_train=None, features=feature_all, use_chain_crf=self.model_config.use_chain_crf) _callbacks = get_callbacks(log_dir=self.checkpoint_path, - eary_stopping=False, + early_stopping=False, use_crf=self.model_config.use_crf, use_chain_crf=self.model_config.use_chain_crf) _callbacks += (callbacks or []) @@ -268,14 +268,14 @@ def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None transformer_preprocessor.tokenizer.save_pretrained(os.path.join(output_directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR)) -def get_callbacks(log_dir=None, valid=(), eary_stopping=True, patience=5, use_crf=True, use_chain_crf=False): +def get_callbacks(log_dir=None, valid=(), early_stopping=True, patience=5, use_crf=True, use_chain_crf=False): """ Get callbacks. 
Args: log_dir (str): the destination to save logs valid (tuple): data for validation. - eary_stopping (bool): whether to use early stopping. + early_stopping (bool): whether to use early stopping. Returns: list: list of callbacks @@ -296,7 +296,7 @@ def get_callbacks(log_dir=None, valid=(), eary_stopping=True, patience=5, use_cr save_weights_only=True) callbacks.append(save_callback) - if eary_stopping: + if early_stopping: callbacks.append(EarlyStopping(monitor='f1', patience=patience, mode='max')) return callbacks From a405486dce041e19b40eea6255989fcfc053b30a Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 15 Jun 2023 08:34:03 +0900 Subject: [PATCH 6/9] Remove default value from learning rate in TrainingConfig and set different default for transformers-based and non-transformers-based models --- delft/sequenceLabelling/config.py | 6 +++--- delft/sequenceLabelling/wrapper.py | 2 +- delft/textClassification/config.py | 6 +++--- delft/textClassification/wrapper.py | 15 +++++++++++---- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/delft/sequenceLabelling/config.py b/delft/sequenceLabelling/config.py index 92a28e6f..a5515ef4 100644 --- a/delft/sequenceLabelling/config.py +++ b/delft/sequenceLabelling/config.py @@ -81,10 +81,10 @@ def load(cls, file): # Training parameters class TrainingConfig(object): - def __init__(self, + def __init__(self, + learning_rate, batch_size=20, - optimizer='adam', - learning_rate=None, + optimizer='adam', lr_decay=0.9, clip_gradients=5.0, max_epoch=50, diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index 2976b1d3..b9fda70f 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -136,7 +136,7 @@ def __init__(self, features_indices=features_indices, transformer_name=transformer_name) - self.training_config = TrainingConfig(batch_size, optimizer, learning_rate, + self.training_config = TrainingConfig(learning_rate, batch_size, optimizer, lr_decay, clip_gradients, max_epoch, early_stop, patience, max_checkpoints_to_keep, multiprocessing) diff --git a/delft/textClassification/config.py b/delft/textClassification/config.py index 144e776f..b06286a3 100644 --- a/delft/textClassification/config.py +++ b/delft/textClassification/config.py @@ -58,10 +58,10 @@ def load(cls, file): # Training parameter class TrainingConfig(object): - def __init__(self, + def __init__(self, + learning_rate, batch_size=256, - optimizer='adam', - learning_rate=0.001, + optimizer='adam', lr_decay=0.9, clip_gradients=5.0, max_epoch=50, diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index 4026ba67..015e2c5a 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -62,7 +62,7 @@ def __init__(self, use_char_feature=False, batch_size=256, optimizer='adam', - learning_rate=0.001, + learning_rate=None, lr_decay=0.9, clip_gradients=5.0, max_epoch=50, @@ -84,6 +84,13 @@ def __init__(self, if transformer_name is not None: model_name += "_" + transformer_name + if learning_rate is None: + if transformer_name is None: + learning_rate = 0.0001 + else: + learning_rate = 2e-5 + + self.model = None self.models = None self.log_dir = log_dir @@ -118,9 +125,9 @@ def __init__(self, batch_size=batch_size, transformer_name=self.transformer_name) - self.training_config = TrainingConfig(batch_size=batch_size, - optimizer=optimizer, - learning_rate=learning_rate, + self.training_config = TrainingConfig(learning_rate, + batch_size=batch_size, + 
optimizer=optimizer, lr_decay=lr_decay, clip_gradients=clip_gradients, max_epoch=max_epoch, From edd1898f4291f3de1abf5b81b5bd21105570956b Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 15 Jun 2023 10:43:11 +0900 Subject: [PATCH 7/9] Log learning rate at each epoch --- delft/sequenceLabelling/trainer.py | 20 ++++++++++++++++---- delft/textClassification/wrapper.py | 5 +++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py index b12164ba..dd8a46cb 100644 --- a/delft/sequenceLabelling/trainer.py +++ b/delft/sequenceLabelling/trainer.py @@ -141,8 +141,7 @@ def train_model(self, local_model, x_train, y_train, f_train=None, _callbacks = get_callbacks(log_dir=self.checkpoint_path, early_stopping=True, patience=self.training_config.patience, - valid=(validation_generator, self.preprocessor), use_crf=self.model_config.use_crf, - use_chain_crf=self.model_config.use_chain_crf) + valid=(validation_generator, self.preprocessor), use_crf=self.model_config.use_crf, use_chain_crf=self.model_config.use_chain_crf, model=local_model) else: x_train = np.concatenate((x_train, x_valid), axis=0) y_train = np.concatenate((y_train, y_valid), axis=0) @@ -161,7 +160,8 @@ def train_model(self, local_model, x_train, y_train, f_train=None, _callbacks = get_callbacks(log_dir=self.checkpoint_path, early_stopping=False, use_crf=self.model_config.use_crf, - use_chain_crf=self.model_config.use_chain_crf) + use_chain_crf=self.model_config.use_chain_crf, + model=local_model) _callbacks += (callbacks or []) nb_workers = 6 multiprocessing = self.training_config.multiprocessing @@ -268,7 +268,17 @@ def train_nfold(self, x_train, y_train, x_valid=None, y_valid=None, f_train=None transformer_preprocessor.tokenizer.save_pretrained(os.path.join(output_directory, DEFAULT_TRANSFORMER_TOKENIZER_DIR)) -def get_callbacks(log_dir=None, valid=(), early_stopping=True, patience=5, use_crf=True, use_chain_crf=False): +class LogLearningRateCallback(Callback): + + def __init__(self, model=None): + super().__init__() + self.model = model + + def on_epoch_end(self, epoch, logs): + if self.model is not None: + logs.update({"learning_rate": self.model.optimizer._decayed_lr(tf.float32)}) + +def get_callbacks(log_dir=None, valid=(), early_stopping=True, patience=5, use_crf=True, use_chain_crf=False, model=None): """ Get callbacks.
@@ -299,6 +309,8 @@ def get_callbacks(log_dir=None, valid=(), early_stopping=True, patience=5, use_c if early_stopping: callbacks.append(EarlyStopping(monitor='f1', patience=patience, mode='max')) + callbacks.append(LogLearningRateCallback(model)) + return callbacks diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index 015e2c5a..24e8d255 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -1,5 +1,6 @@ import os +from delft.sequenceLabelling.trainer import LogLearningRateCallback # ask tensorflow to be quiet and not print hundred lines of logs from delft.utilities.misc import print_parameters @@ -172,6 +173,10 @@ def train(self, x_train, y_train, vocab_init=None, incremental=False, callbacks= embeddings=self.embeddings, shuffle=True, bert_data=bert_data, transformer_tokenizer=self.model.transformer_tokenizer) validation_generator = None + + callbacks_ = callbacks if callbacks else [] + callbacks_.append(LogLearningRateCallback(self.model)) + # uncomment to plot graph #plot_model(self.model, # to_file='data/models/textClassification/'+self.model_config.model_name+'_'+self.model_config.architecture+'.png') From 64fa67ce30a2027f2c34d316f382727b8e17e94e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 15 Jun 2023 11:10:11 +0900 Subject: [PATCH 8/9] cosmetics --- delft/sequenceLabelling/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/delft/sequenceLabelling/trainer.py b/delft/sequenceLabelling/trainer.py index dd8a46cb..e1e83ca2 100644 --- a/delft/sequenceLabelling/trainer.py +++ b/delft/sequenceLabelling/trainer.py @@ -276,7 +276,7 @@ def __init__(self, model=None): def on_epoch_end(self, epoch, logs): if self.model is not None: - logs.update({"learning_rate": self.model.optimizer._decayed_lr(tf.float32)}) + logs.update({"lr": self.model.optimizer._decayed_lr(tf.float32)}) def get_callbacks(log_dir=None, valid=(), early_stopping=True, patience=5, use_crf=True, use_chain_crf=False, model=None): """ From 5b3ea9361b136f4df032a05f14a279f62cfd4ec1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Jul 2023 10:24:17 +0900 Subject: [PATCH 9/9] fix default learning rate for RNN architectures --- delft/sequenceLabelling/wrapper.py | 2 +- delft/textClassification/wrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/delft/sequenceLabelling/wrapper.py b/delft/sequenceLabelling/wrapper.py index b9fda70f..29e09111 100644 --- a/delft/sequenceLabelling/wrapper.py +++ b/delft/sequenceLabelling/wrapper.py @@ -115,7 +115,7 @@ def __init__(self, if learning_rate is None: if transformer_name is None: - learning_rate = 0.0001 + learning_rate = 0.001 else: learning_rate = 2e-5 diff --git a/delft/textClassification/wrapper.py b/delft/textClassification/wrapper.py index 24e8d255..abcce915 100644 --- a/delft/textClassification/wrapper.py +++ b/delft/textClassification/wrapper.py @@ -87,7 +87,7 @@ def __init__(self, if learning_rate is None: if transformer_name is None: - learning_rate = 0.0001 + learning_rate = 0.001 else: learning_rate = 2e-5
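
Note for reviewers (illustration only, not part of the patch series): after these changes the application scripts expose a --learning-rate option that defaults to None, and the wrappers pick the initial learning rate from the architecture when nothing is passed: 2e-5 when a transformer is used, 0.001 for the RNN-based architectures (after PATCH 9/9). PATCH 7/9 additionally logs the decayed learning rate at the end of each epoch (renamed to "lr" in PATCH 8/9). The sketch below shows the expected behaviour of the sequence-labelling wrapper; the import path, the architecture names and the transformer model name are assumptions made for the example, not taken from the diffs.

    # Sketch only: expected learning-rate defaults after this series
    # (assumed import path, architecture names and transformer model name).
    from delft.sequenceLabelling import Sequence

    # RNN architecture, no learning_rate given -> wrapper falls back to 0.001 (PATCH 9/9)
    rnn_model = Sequence("test-rnn",
                         architecture="BidLSTM_CRF",
                         embeddings_name="glove-840B")

    # Transformer-based architecture, no learning_rate given -> wrapper falls back to 2e-5
    bert_model = Sequence("test-bert",
                          architecture="BERT_CRF",            # assumed architecture name
                          transformer_name="bert-base-cased") # assumed transformer model

    # An explicit value always wins, e.g. forwarded from the new --learning-rate flag
    custom_model = Sequence("test-bert",
                            architecture="BERT_CRF",
                            transformer_name="bert-base-cased",
                            learning_rate=5e-5)

Training then proceeds as before via model.train(...), with the per-epoch learning rate appearing in the training logs.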