Skip to content

Commit

Permalink
Refactor spaces_after and spaces_before so they are actual members of the Token, not just stuck on the MISC field
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Dec 16, 2023
1 parent 8da1ba1 commit 2120a87
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 41 deletions.
86 changes: 51 additions & 35 deletions stanza/models/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,8 @@ def __init__(self, sentence, token_entry, words=None):
self._end_char = token_entry.get(END_CHAR, None)
self._sent = sentence
self._mexp = token_entry.get(MEXP, None)
self._spaces_before = ""
self._spaces_after = " "

if self._misc is not None:
init_from_misc(self)
Expand Down Expand Up @@ -1070,6 +1072,8 @@ def consolidate_whitespace(self):
"""
Remove whitespace misc annotations from the Words and mark the whitespace on the Tokens
"""
found_after = False
found_before = False
num_words = len(self.words)
for word_idx, word in enumerate(self.words):
misc = word.misc
Expand All @@ -1079,66 +1083,61 @@ def consolidate_whitespace(self):
if word_idx == 0:
if any(piece.startswith("SpacesBefore=") for piece in pieces):
self.spaces_before = misc_to_space_before(misc)
found_before = True
else:
if any(piece.startswith("SpacesBefore=") for piece in pieces):
raise ValueError("Found a SpacesBefore MISC annotation on a Word that was not the first Word in a Token")
warnings.warn("Found a SpacesBefore MISC annotation on a Word that was not the first Word in a Token")
if word_idx == num_words - 1:
if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
self.spaces_after = misc_to_space_after(misc)
found_after = True
else:
if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
unexpected_space_after = misc_to_space_after(misc)
if unexpected_space_after == "":
warnings.warn("Unexpected SpaceAfter=No annotation on a word in the middle of an MWT")
else:
raise ValueError("Unexpected SpacesAfter on a word in the middle on an MWT")
warnings.warn("Unexpected SpacesAfter on a word in the middle on an MWT")
pieces = [x for x in pieces if not x.startswith("SpacesAfter=") and not x.startswith("SpaceAfter=") and not x.startswith("SpacesBefore=")]
word.misc = "|".join(pieces)

misc = self.misc
if misc:
pieces = misc.split("|")
if any(piece.startswith("SpacesBefore=") for piece in pieces):
spaces_before = misc_to_space_before(misc)
if found_before:
if spaces_before != self.spaces_before:
warnings.warn("Found conflicting SpacesBefore on a token and its word!")
else:
self.spaces_before = spaces_before
if any(piece.startswith("SpaceAfter=") or piece.startswith("SpacesAfter=") for piece in pieces):
spaces_after = misc_to_space_after(misc)
if found_after:
if spaces_after != self.spaces_after:
warnings.warn("Found conflicting SpaceAfter / SpacesAfter on a token and its word!")
else:
self.spaces_after = spaces_after
pieces = [x for x in pieces if not x.startswith("SpacesAfter=") and not x.startswith("SpaceAfter=") and not x.startswith("SpacesBefore=")]
self.misc = "|".join(pieces)

@property
def spaces_before(self):
""" SpacesBefore for the token. Attached to the MISC field, although the plan is to switch it to be on the token itself """
space_before = misc_to_space_before(self.misc)
return space_before
""" SpacesBefore for the token. Translated from the MISC fields """
return self._spaces_before

@spaces_before.setter
def spaces_before(self, value):
# TODO: instead of cramming this into the MISC field, make the spaces a separate field
misc = self.misc
if not misc:
misc = space_before_to_misc(value)
self.misc = misc
else:
pieces = misc.split("|")
pieces = [x for x in pieces if x and not x.lower().split("=", maxsplit=1)[0] == 'spacesbefore']
space_misc = space_before_to_misc(value)
if space_misc:
pieces.append(space_misc)
misc = "|".join(pieces)
self.misc = misc

self._spaces_before = value

@property
def spaces_after(self):
""" SpaceAfter or SpacesAfter for the token. Currently uses the MISC field """
space_after = misc_to_space_after(self.misc)
return space_after
""" SpaceAfter or SpacesAfter for the token. Translated from the MISC field """
return self._spaces_after

@spaces_after.setter
def spaces_after(self, value):
# TODO: instead of cramming this into the MISC field, make the spaces a separate field
misc = self.misc
if not misc:
misc = space_after_to_misc(value)
self.misc = misc
else:
pieces = misc.split("|")
pieces = [x for x in pieces if x and not x.lower().split("=", maxsplit=1)[0] in ("spaceafter", "spacesafter")]
space_misc = space_after_to_misc(value)
if space_misc:
pieces.append(space_misc)
misc = "|".join(pieces)
self.misc = misc
self._spaces_after = value

@property
def words(self):
Expand Down Expand Up @@ -1216,6 +1215,23 @@ def to_dict(self, fields=[ID, TEXT, MISC, START_CHAR, END_CHAR, NER, MULTI_NER,
for field in fields:
if getattr(self, field) is not None:
token_dict[field] = getattr(self, field)
if MISC in fields:
spaces_after = self.spaces_after
if spaces_after is not None and spaces_after != ' ':
space_misc = space_after_to_misc(spaces_after)
if token_dict.get(MISC):
token_dict[MISC] = token_dict[MISC] + "|" + space_misc
else:
token_dict[MISC] = space_misc

spaces_before = self.spaces_before
if spaces_before is not None and spaces_before != '':
space_misc = space_before_to_misc(spaces_before)
if token_dict.get(MISC):
token_dict[MISC] = token_dict[MISC] + "|" + space_misc
else:
token_dict[MISC] = space_misc

ret.append(token_dict)
for word in self.words:
word_dict = word.to_dict()
Expand Down
9 changes: 6 additions & 3 deletions stanza/tests/server/test_java_protobuf_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_convert_networkx_graph():
14 been be AUX VBN Tense=Past|VerbForm=Part 15 aux:pass _ _
15 verified verify VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 2 ccomp _ SpaceAfter=No
16 . . PUNCT . _ 2 punct _ _
""".lstrip()
""".strip()

def test_nbsp_doc():
"""
Expand All @@ -85,6 +85,9 @@ def test_nbsp_doc():
doc = CoNLL.conll2doc(input_str=ENGLISH_NBSP_SAMPLE)

assert doc.sentences[0].text == "Please note that neither the e-mail address nor name of the sender have been verified."
assert doc.sentences[0].tokens[12].misc == "SpacesAfter=\\u00A0"
assert misc_to_space_after(doc.sentences[0].tokens[12].misc) == ' '
assert doc.sentences[0].tokens[12].spaces_after == " "
assert misc_to_space_after("SpacesAfter=\\u00A0") == ' '
assert space_after_to_misc(' ') == "SpacesAfter=\\u00A0"

conllu = "{:C}".format(doc)
assert conllu == ENGLISH_NBSP_SAMPLE
2 changes: 1 addition & 1 deletion stanza/tests/server/test_ssurgeon.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def test_ssurgeon_mwt_space_after():
3-4 farepace _ _ _ _ _ _ _ Players=GonnaPlay|SpaceAfter=No
3 fare fare VERB V VerbForm=Inf 0 root _ _
4 pace pace NOUN S Gender=Fem|Number=Sing 3 obj _ _
5-6 col _ _ _ _ _ _ _ SpaceAfter=No|Haters=GonnaHate
5-6 col _ _ _ _ _ _ _ Haters=GonnaHate|SpaceAfter=No
5 con con ADP E _ 7 case _ _
6 il il DET RD Definite=Def|Gender=Masc|Number=Sing|PronType=Art 7 det _ _
7 cervello cervello NOUN S Gender=Masc|Number=Sing 3 obl _ RandomFeature=foo
Expand Down
4 changes: 2 additions & 2 deletions stanza/tests/tokenization/test_spaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_spaces_no_mwt():
EXPECTED_MWT = """
# text = She's not a nice person.
# sent_id = 0
1-2 She's _ _ _ _ _ _ _ SpacesBefore=\\s\\s|start_char=2|end_char=7
1-2 She's _ _ _ _ _ _ _ start_char=2|end_char=7|SpacesBefore=\\s\\s
1 She _ _ _ _ 0 _ _ _
2 's _ _ _ _ 1 _ _ _
3 not _ _ _ _ 2 _ _ start_char=8|end_char=11
Expand All @@ -58,7 +58,7 @@ def test_spaces_no_mwt():
7 the _ _ _ _ 6 _ _ start_char=58|end_char=61
8 Cerritos _ _ _ _ 7 _ _ start_char=62|end_char=70
9 are _ _ _ _ 8 _ _ start_char=71|end_char=74
10-11 Jennifer's _ _ _ _ _ _ _ SpaceAfter=No|start_char=75|end_char=85
10-11 Jennifer's _ _ _ _ _ _ _ start_char=75|end_char=85|SpaceAfter=No
10 Jennifer _ _ _ _ 9 _ _ _
11 's _ _ _ _ 10 _ _ _
12 . _ _ _ _ 11 _ _ start_char=85|end_char=86|SpacesAfter=\\s\\s
Expand Down

0 comments on commit 2120a87

Please sign in to comment.