From fe2f394fecaca62dbda8b01fec709f653e5a6336 Mon Sep 17 00:00:00 2001
From: John Bauer
Date: Thu, 9 Jan 2025 20:53:20 -0800
Subject: [PATCH] Add the verbs conllu from Prof. Lapalme to the English
 lemmatizer

---
 stanza/utils/datasets/prepare_tokenizer_treebank.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/stanza/utils/datasets/prepare_tokenizer_treebank.py b/stanza/utils/datasets/prepare_tokenizer_treebank.py
index 6160a0a7d..7f8c95211 100755
--- a/stanza/utils/datasets/prepare_tokenizer_treebank.py
+++ b/stanza/utils/datasets/prepare_tokenizer_treebank.py
@@ -816,7 +816,7 @@ def build_combined_english_dataset(paths, model_type, dataset):
     """
     en_combined is currently EWT, GUM, PUD, Pronouns, and handparsed
     """
-    udbase_dir = paths["UDBASE"]
+    udbase_dir = paths["UDBASE_GIT"]
     check_gum_ready(udbase_dir)
 
     if dataset == 'train':
@@ -911,6 +911,11 @@ def build_extra_combined_english_dataset(paths, model_type, dataset):
         handparsed_sentences = read_sentences_from_conllu(handparsed_path)
         print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
         sents.extend(handparsed_sentences)
+
+        handparsed_path = os.path.join(handparsed_dir, "english-lemmas-verbs", "irregularVerbs-noNnoAdj.conllu")
+        handparsed_sentences = read_sentences_from_conllu(handparsed_path)
+        print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
+        sents.extend(handparsed_sentences)
     return sents
 
 def build_extra_combined_italian_dataset(paths, model_type, dataset):
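
The new hunk repeats the load/print/extend pattern already used for the first
hand-parsed file. As a rough sketch only (the helper name below is
hypothetical, not part of this patch), the repeated block could be factored
into a small function around the read_sentences_from_conllu utility the patch
already calls:

    import os

    def load_handparsed_sentences(handparsed_dir, *path_pieces):
        # Hypothetical helper, not in the patch: join the path pieces under
        # the hand-parsed data directory, read the conllu file with the
        # existing read_sentences_from_conllu utility, and report how many
        # sentences were loaded before returning them.
        handparsed_path = os.path.join(handparsed_dir, *path_pieces)
        handparsed_sentences = read_sentences_from_conllu(handparsed_path)
        print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
        return handparsed_sentences

    # usage inside build_extra_combined_english_dataset:
    #   sents.extend(load_handparsed_sentences(handparsed_dir, "english-lemmas-verbs",
    #                                          "irregularVerbs-noNnoAdj.conllu"))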