Refactor spaces_after and spaces_before so they are actual members of the Token, not just stuck on the MISC field
stanfordnlp · Dec 16, 2023 · 2120a87 · 2120a87
1 parent 8da1ba1
commit 2120a87
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 41 deletions.
diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py
@@ -1022,6 +1022,8 @@ def __init__(self, sentence, token_entry, words=None):
         self._end_char = token_entry.get(END_CHAR, None)
         self._sent = sentence
         self._mexp = token_entry.get(MEXP, None)
+        self._spaces_before = ""
+        self._spaces_after = " "
 
         if self._misc is not None:
             init_from_misc(self)
@@ -1070,6 +1072,8 @@ def consolidate_whitespace(self):
         """
         Remove whitespace misc annotations from the Words and mark the whitespace on the Tokens
         """
+        found_after = False
+        found_before = False
         num_words = len(self.words)
         for word_idx, word in enumerate(self.words):
             misc = word.misc
@@ -1079,66 +1083,61 @@ def consolidate_whitespace(self):
             if word_idx == 0:
                 if any(piece.startswith("SpacesBefore=") for piece in pieces):
                     self.spaces_before = misc_to_space_before(misc)
+                    found_before = True
             else:
                 if any(piece.startswith("SpacesBefore=") for piece in pieces):
-                    raise ValueError("Found a SpacesBefore MISC annotation on a Word that was not the first Word in a Token")
+                    warnings.warn("Found a SpacesBefore MISC annotation on a Word that was not the first Word in a Token")
             if word_idx == num_words - 1:
                 if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
                     self.spaces_after = misc_to_space_after(misc)
+                    found_after = True
             else:
                 if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
                     unexpected_space_after = misc_to_space_after(misc)
                     if unexpected_space_after == "":
                         warnings.warn("Unexpected SpaceAfter=No annotation on a word in the middle of an MWT")
                     else:
-                        raise ValueError("Unexpected SpacesAfter on a word in the middle on an MWT")
+                        warnings.warn("Unexpected SpacesAfter on a word in the middle on an MWT")
             pieces = [x for x in pieces if not x.startswith("SpacesAfter=") and not x.startswith("SpaceAfter=") and not x.startswith("SpacesBefore=")]
             word.misc = "|".join(pieces)
 
+        misc = self.misc
+        if misc:
+            pieces = misc.split("|")
+            if any(piece.startswith("SpacesBefore=") for piece in pieces):
+                spaces_before = misc_to_space_before(misc)
+                if found_before:
+                    if spaces_before != self.spaces_before:
+                        warnings.warn("Found conflicting SpacesBefore on a token and its word!")
+                else:
+                    self.spaces_before = spaces_before
+            if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
+                spaces_after = misc_to_space_after(misc)
+                if found_after:
+                    if spaces_after != self.spaces_after:
+                        warnings.warn("Found conflicting SpaceAfter / SpacesAfter on a token and its word!")
+                else:
+                    self.spaces_after = spaces_after
+            pieces = [x for x in pieces if not x.startswith("SpacesAfter=") and not x.startswith("SpaceAfter=") and not x.startswith("SpacesBefore=")]
+            self.misc = "|".join(pieces)
+
     @property
     def spaces_before(self):
-        """ SpacesBefore for the token.  Attached to the MISC field, although the plan is to switch it to be on the token itself """
-        space_before = misc_to_space_before(self.misc)
-        return space_before
+        """ SpacesBefore for the token. Translated from the MISC fields """
+        return self._spaces_before
 
     @spaces_before.setter
     def spaces_before(self, value):
-        # TODO: instead of cramming this into the MISC field, make the spaces a separate field
-        misc = self.misc
-        if not misc:
-            misc = space_before_to_misc(value)
-            self.misc = misc
-        else:
-            pieces = misc.split("|")
-            pieces = [x for x in pieces if x and not x.lower().split("=", maxsplit=1)[0] == 'spacesbefore']
-            space_misc = space_before_to_misc(value)
-            if space_misc:
-                pieces.append(space_misc)
-            misc = "|".join(pieces)
-            self.misc = misc
-
+        self._spaces_before = value
 
     @property
     def spaces_after(self):
-        """ SpaceAfter or SpacesAfter for the token.  Currently uses the MISC field """
-        space_after = misc_to_space_after(self.misc)
-        return space_after
+        """ SpaceAfter or SpacesAfter for the token.  Translated from the MISC field """
+        return self._spaces_after
 
     @spaces_after.setter
     def spaces_after(self, value):
-        # TODO: instead of cramming this into the MISC field, make the spaces a separate field
-        misc = self.misc
-        if not misc:
-            misc = space_after_to_misc(value)
-            self.misc = misc
-        else:
-            pieces = misc.split("|")
-            pieces = [x for x in pieces if x and not x.lower().split("=", maxsplit=1)[0] in ("spaceafter", "spacesafter")]
-            space_misc = space_after_to_misc(value)
-            if space_misc:
-                pieces.append(space_misc)
-            misc = "|".join(pieces)
-            self.misc = misc
+        self._spaces_after = value
 
     @property
     def words(self):
@@ -1216,6 +1215,23 @@ def to_dict(self, fields=[ID, TEXT, MISC, START_CHAR, END_CHAR, NER, MULTI_NER,
             for field in fields:
                 if getattr(self, field) is not None:
                     token_dict[field] = getattr(self, field)
+            if MISC in fields:
+                spaces_after = self.spaces_after
+                if spaces_after is not None and spaces_after != ' ':
+                    space_misc = space_after_to_misc(spaces_after)
+                    if token_dict.get(MISC):
+                        token_dict[MISC] = token_dict[MISC] + "|" + space_misc
+                    else:
+                        token_dict[MISC] = space_misc
+
+                spaces_before = self.spaces_before
+                if spaces_before is not None and spaces_before != '':
+                    space_misc = space_before_to_misc(spaces_before)
+                    if token_dict.get(MISC):
+                        token_dict[MISC] = token_dict[MISC] + "|" + space_misc
+                    else:
+                        token_dict[MISC] = space_misc
+
             ret.append(token_dict)
         for word in self.words:
             word_dict = word.to_dict()

diff --git a/stanza/tests/server/test_java_protobuf_requests.py b/stanza/tests/server/test_java_protobuf_requests.py
@@ -76,7 +76,7 @@ def test_convert_networkx_graph():
 14	been	be	AUX	VBN	Tense=Past|VerbForm=Part	15	aux:pass	_	_
 15	verified	verify	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	2	ccomp	_	SpaceAfter=No
 16	.	.	PUNCT	.	_	2	punct	_	_
-""".lstrip()
+""".strip()
 
 def test_nbsp_doc():
     """
@@ -85,6 +85,9 @@ def test_nbsp_doc():
     doc = CoNLL.conll2doc(input_str=ENGLISH_NBSP_SAMPLE)
 
     assert doc.sentences[0].text == "Please note that neither the e-mail address nor name of the sender have been verified."
-    assert doc.sentences[0].tokens[12].misc == "SpacesAfter=\\u00A0"
-    assert misc_to_space_after(doc.sentences[0].tokens[12].misc) == ' '
+    assert doc.sentences[0].tokens[12].spaces_after == " "
+    assert misc_to_space_after("SpacesAfter=\\u00A0") == ' '
     assert space_after_to_misc(' ') == "SpacesAfter=\\u00A0"
+
+    conllu = "{:C}".format(doc)
+    assert conllu == ENGLISH_NBSP_SAMPLE
diff --git a/stanza/tests/server/test_ssurgeon.py b/stanza/tests/server/test_ssurgeon.py
@@ -358,7 +358,7 @@ def test_ssurgeon_mwt_space_after():
 3-4	farepace	_	_	_	_	_	_	_	Players=GonnaPlay|SpaceAfter=No
 3	fare	fare	VERB	V	VerbForm=Inf	0	root	_	_
 4	pace	pace	NOUN	S	Gender=Fem|Number=Sing	3	obj	_	_
-5-6	col	_	_	_	_	_	_	_	SpaceAfter=No|Haters=GonnaHate
+5-6	col	_	_	_	_	_	_	_	Haters=GonnaHate|SpaceAfter=No
 5	con	con	ADP	E	_	7	case	_	_
 6	il	il	DET	RD	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	7	det	_	_
 7	cervello	cervello	NOUN	S	Gender=Masc|Number=Sing	3	obl	_	RandomFeature=foo

diff --git a/stanza/tests/tokenization/test_spaces.py b/stanza/tests/tokenization/test_spaces.py
@@ -38,7 +38,7 @@ def test_spaces_no_mwt():
 EXPECTED_MWT = """
 # text = She's not a nice person.
 # sent_id = 0
-1-2	She's	_	_	_	_	_	_	_	SpacesBefore=\\s\\s|start_char=2|end_char=7
+1-2	She's	_	_	_	_	_	_	_	start_char=2|end_char=7|SpacesBefore=\\s\\s
 1	She	_	_	_	_	0	_	_	_
 2	's	_	_	_	_	1	_	_	_
 3	not	_	_	_	_	2	_	_	start_char=8|end_char=11
@@ -58,7 +58,7 @@ def test_spaces_no_mwt():
 7	the	_	_	_	_	6	_	_	start_char=58|end_char=61
 8	Cerritos	_	_	_	_	7	_	_	start_char=62|end_char=70
 9	are	_	_	_	_	8	_	_	start_char=71|end_char=74
-10-11	Jennifer's	_	_	_	_	_	_	_	SpaceAfter=No|start_char=75|end_char=85
+10-11	Jennifer's	_	_	_	_	_	_	_	start_char=75|end_char=85|SpaceAfter=No
 10	Jennifer	_	_	_	_	9	_	_	_
 11	's	_	_	_	_	10	_	_	_
 12	.	_	_	_	_	11	_	_	start_char=85|end_char=86|SpacesAfter=\\s\\s