Fix failing tests on Windows (#39)
* Fix failing tests on Windows

* Style

* Add windows-latest to workflows file

* Fix workflows file
mariosasko authored Jul 26, 2021
1 parent e14d79e commit ca6a77c
Showing 3 changed files with 41 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/pythonpackage.yml
@@ -9,10 +9,11 @@ on:
 jobs:
   build:

-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     strategy:
       max-parallel: 4
       matrix:
+        os: [ubuntu-latest, windows-latest]
         python-version: [3.6, 3.7, 3.8, 3.9]

     steps:
8 changes: 4 additions & 4 deletions spacy_udpipe/tokenizer.py
@@ -6,7 +6,7 @@
 from spacy.util import registry
 from ufal.udpipe import Sentence, Word

-from .udpipe import NO_SPACE, UDPipeModel
+from .udpipe import UDPipeModel
 from .utils import get_path


@@ -40,7 +40,7 @@ def _spacy_dep(d: str) -> str:
     return d.upper() if d == "root" else d


-class UDPipeTokenizer(object):
+class UDPipeTokenizer:
     """Custom Tokenizer which sets all the attributes because
     the UDPipe pipeline runs only once and does not
     contain separate spaCy pipeline components.
@@ -97,7 +97,7 @@ def __call__(
         text = ""
         for token in tokens:
             text += token.form
-            if NO_SPACE not in token.misc:
+            if token.getSpaceAfter():
                 text += " "
         for i, token in enumerate(tokens):
             span = text[offset:]
@@ -117,7 +117,7 @@ def __call__(
                 lemmas.append(token.lemma or "")
             offset += len(token.form)
             span = text[offset:]
-            if i == len(tokens) - 1 or NO_SPACE in token.misc:
+            if i == len(tokens) - 1 or not token.getSpaceAfter():
                 spaces.append(False)
             elif not is_aligned:
                 spaces.append(True)
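Note on the change above: instead of string-matching "SpaceAfter=No" in the CoNLL-U MISC field, the tokenizer now asks UDPipe directly via getSpaceAfter(). A minimal sketch of the same reconstruction logic, not taken from this repository, using only the ufal.udpipe calls that appear in the diff (it assumes the bindings expose a sentence.words vector whose index 0 is the technical root word):

from ufal.udpipe import Sentence

# Build a tiny sentence by hand; addWord, setSpaceAfter and getSpaceAfter
# are the same ufal.udpipe methods used in the diff above.
sentence = Sentence()
for form in ("Hello", ",", "world"):
    sentence.addWord(form)

words = sentence.words[1:]       # skip the technical root word
words[0].setSpaceAfter(False)    # glue the comma to "Hello"

# Reconstruct the surface text the way UDPipeTokenizer.__call__ does:
text = ""
for word in words:
    text += word.form
    if word.getSpaceAfter():     # replaces: NO_SPACE not in word.misc
        text += " "

print(text.strip())              # -> Hello, world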
60 changes: 35 additions & 25 deletions spacy_udpipe/udpipe.py
@@ -3,14 +3,30 @@

 from ufal.udpipe import InputFormat
 from ufal.udpipe import Model
-from ufal.udpipe import OutputFormat, ProcessingError, Sentence, Word
+from ufal.udpipe import OutputFormat, ProcessingError, Sentence

 from .utils import get_path

-NO_SPACE = "SpaceAfter=No"
-

-class PretokenizedInputFormat(object):
+def _default_model_meta(lang: str, name: str) -> Dict:
+    return {
+        "author": "Milan Straka & Jana Straková",
+        "description": "UDPipe pretrained model.",
+        "email": "[email protected]",
+        "lang": f"udpipe_{lang}",
+        "license": "CC BY-NC-SA 4.0",
+        "name": name,
+        "parent_package": "spacy_udpipe",
+        "pipeline": [
+            "Tokenizer", "Tagger", "Lemmatizer", "Parser"
+        ],
+        "source": "Universal Dependencies 2.5",
+        "url": "http://ufal.mff.cuni.cz/udpipe",
+        "version": "1.2.0"
+    }
+
+
+class PretokenizedInputFormat:
     """Dummy tokenizer for pretokenized input.
     Execution speed might be slow compared to other UDPipe tokenizers
@@ -35,19 +51,19 @@ def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
             line = next(self.lines)
         except StopIteration:
             return False

         tokens = line.split("\t")
-        prev_word = Word()
-        for token in tokens:
+        num_tokens = len(tokens)
+        for i, token in enumerate(tokens):
             word = sentence.addWord(token)
-            if re.match(r"\W", token):
-                # leave no space after previous token iff current token
+            if i < num_tokens - 1 and re.match(r"\W", tokens[i + 1]):
+                # leave no space after current token iff next token
                 # is non-alphanumeric (i.e. punctuation)
-                prev_word.misc = NO_SPACE
-            prev_word = word
+                word.setSpaceAfter(False)
         return True


-class UDPipeModel(object):
+class UDPipeModel:

     def __init__(
         self,
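Worked example for the rewritten loop (hypothetical input, not from the test suite): it now decides spacing by looking ahead at the next token rather than retroactively patching the previous word's MISC field.

import re

line = "Hello\t,\tworld\t!"    # one tab-separated, pretokenized sentence
tokens = line.split("\t")
num_tokens = len(tokens)

# Same look-ahead rule as nextSentence above: no space after the current
# token iff the next token is non-alphanumeric (i.e. punctuation).
space_after = [
    not (i < num_tokens - 1 and re.match(r"\W", tokens[i + 1]))
    for i in range(num_tokens)
]

text = "".join(
    token + (" " if space and i < num_tokens - 1 else "")
    for i, (token, space) in enumerate(zip(tokens, space_after))
)
print(text)                    # -> Hello, world!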
@@ -64,20 +80,14 @@ def __init__(
         path = path or get_path(lang=lang)
         self.model = Model.load(path)
         self._lang = lang.split("-")[0]
-        self._meta = meta or {"author": "Milan Straka & Jana Straková",
-                              "description": "UDPipe pretrained model.",
-                              "email": "[email protected]",
-                              "lang": f"udpipe_{self._lang}",
-                              "license": "CC BY-NC-SA 4.0",
-                              "name": path.split("/")[-1],
-                              "parent_package": "spacy_udpipe",
-                              "pipeline": [
-                                  "Tokenizer", "Tagger", "Lemmatizer", "Parser"
-                              ],
-                              "source": "Universal Dependencies 2.5",
-                              "url": "http://ufal.mff.cuni.cz/udpipe",
-                              "version": "1.2.0"
-                              }
+        self._path = path
+        self._meta = meta or _default_model_meta(
+            self._lang, self._path.split("/")[-1]
+        )

+    def __reduce__(self):
+        # required for multiprocessing on Windows
+        return self.__class__, (self._lang, self._path, self._meta)
+
     def __call__(
         self,
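Why __reduce__ fixes Windows: multiprocessing spawns fresh interpreter processes there instead of forking, so every object handed to a worker must survive a pickle round trip, and the underlying ufal.udpipe Model handle is not picklable. The tuple returned by __reduce__ tells pickle to rebuild the wrapper from (lang, path, meta), re-loading the binary model in the child process. A hedged sketch, assuming the model for "en" is fetched with the package's documented download helper:

import pickle

import spacy_udpipe
from spacy_udpipe.udpipe import UDPipeModel

spacy_udpipe.download("en")                # one-time model download
model = UDPipeModel(lang="en")

# Round-trips via __reduce__: pickle stores (lang, path, meta) and
# reconstruction re-runs UDPipeModel(lang, path, meta) on load.
clone = pickle.loads(pickle.dumps(model))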
