Skip to content

Commit

Permalink
Quick hack for multiword (#17)
Browse files: browse the repository at this point in the history.
* Add proper support for multiword tokens
  • Loading branch information
KoichiYasuoka authored Apr 19, 2020
1 parent fa121f5 commit 263c880
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
6 changes: 6 additions & 0 deletions spacy_udpipe/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ def __call__(self, text: str) -> Doc:
lemmas = []
offset = 0
is_aligned = self._check_aligned(text=text, tokens=tokens)
if not is_aligned:
text = ""
for token in tokens:
text += token.form
if "SpaceAfter=No" not in token.misc:
text += " "
for i, token in enumerate(tokens):
span = text[offset:]
if not span:
Expand Down
45 changes: 45 additions & 0 deletions tests/languages/fr/test_fr_language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from typing import List

import pytest
from spacy.lang.fr import FrenchDefaults
from spacy.language import BaseDefaults
from spacy_udpipe import download
from spacy_udpipe.language import load
from spacy_udpipe.utils import get_defaults

# Language code used throughout this test module; the `lang` fixture returns it.
FR = "fr"


@pytest.fixture
def lang() -> str:
    """Provide the language code that the tests in this module run against."""
    code = FR
    return code


@pytest.fixture(autouse=True)
def download_lang(lang: str) -> None:
    """Call ``download`` for the language under test.

    Declared ``autouse`` so tests get it without requesting it explicitly.
    Presumably this fetches the UDPipe model needed by ``load`` -- confirm
    against ``spacy_udpipe.download``.
    """
    download(lang)


def test_get_defaults(lang: str) -> None:
    """Check ``get_defaults`` for the test language and for an unknown code.

    The language under test must map to ``FrenchDefaults``; the made-up code
    "blabla" must map to ``BaseDefaults``.
    """
    known_defaults = get_defaults(lang)
    assert known_defaults == FrenchDefaults

    unknown_defaults = get_defaults("blabla")
    assert unknown_defaults == BaseDefaults


def test_spacy_udpipe(lang: str) -> None:
    """Check the pipeline output for a French sentence with a contraction.

    The input contains "aux", which the expected token list represents as the
    two tokens "à" and "les". The test verifies the token forms, the POS tags,
    the head indices and the dependency labels of the resulting document.
    """
    nlp = load(lang=lang)
    assert nlp._meta["lang"] == f"udpipe_{lang}"

    text = "Attention aux articles contractés!"
    doc = nlp(text=text)

    # Pinning the exact token list also fixes the document length at six,
    # so the per-token tables below line up one-to-one with the tokens.
    expected_forms = ["Attention", "à", "les", "articles", "contractés", "!"]
    assert [token.orth_ for token in doc] == expected_forms

    # Each position lists every tag the test accepts for that token
    # (presumably to tolerate differences between model versions -- confirm).
    accepted_pos = [
        {"INTJ", "NOUN"},
        {"ADP"},
        {"DET"},
        {"NOUN"},
        {"VERB", "ADJ"},
        {"PUNCT"},
    ]
    for token, choices in zip(doc, accepted_pos):
        assert token.pos_ in choices

    expected_heads = [0, 3, 3, 0, 3, 0]
    assert [token.head.i for token in doc] == expected_heads

    # Same idea as for POS: a set of acceptable dependency labels per token.
    accepted_dep = [
        {"ROOT", "root"},
        {"case"},
        {"det"},
        {"nmod", "obl", "obl:arg"},
        {"acl", "amod"},
        {"punct"},
    ]
    for token, choices in zip(doc, accepted_dep):
        assert token.dep_ in choices

0 comments on commit 263c880

Please sign in to comment.