From fe2f394fecaca62dbda8b01fec709f653e5a6336 Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Thu, 9 Jan 2025 20:53:20 -0800
Subject: [PATCH] Add the verbs conllu from Prof. Lapalme to the English
 lemmatizer

---
 stanza/utils/datasets/prepare_tokenizer_treebank.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 6160a0a7d..7f8c95211 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -816,7 +816,7 @@ def build_combined_english_dataset(paths, model_type, dataset):
     """
     en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
     """
-    udbase_dir = paths["UDBASE"]
+    udbase_dir = paths["UDBASE_GIT"]
     check_gum_ready(udbase_dir)
 
     if dataset == 'train':
@@ -911,6 +911,11 @@ def build_extra_combined_english_dataset(paths, model_type, dataset):
         handparsed_sentences = read_sentences_from_conllu(handparsed_path)
         print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
         sents.extend(handparsed_sentences)
+
+        handparsed_path = os.path.join(handparsed_dir, "english-lemmas-verbs", "irregularVerbs-noNnoAdj.conllu")
+        handparsed_sentences = read_sentences_from_conllu(handparsed_path)
+        print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
+        sents.extend(handparsed_sentences)
     return sents
 
 def build_extra_combined_italian_dataset(paths, model_type, dataset):
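
The new hunk repeats the load/print/extend pattern already used for the first
hand-parsed file. As a rough sketch only (the helper name below is
hypothetical, not part of this patch), the repeated block could be factored
into a small function around the read_sentences_from_conllu utility the patch
already calls:

    import os

    def load_handparsed_sentences(handparsed_dir, *path_pieces):
        # Hypothetical helper, not in the patch: join the path pieces under
        # the hand-parsed data directory, read the conllu file with the
        # existing read_sentences_from_conllu utility, and report how many
        # sentences were loaded before returning them.
        handparsed_path = os.path.join(handparsed_dir, *path_pieces)
        handparsed_sentences = read_sentences_from_conllu(handparsed_path)
        print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
        return handparsed_sentences

    # usage inside build_extra_combined_english_dataset:
    #   sents.extend(load_handparsed_sentences(handparsed_dir, "english-lemmas-verbs",
    #                                          "irregularVerbs-noNnoAdj.conllu"))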