Fix failing tests on Windows (#39)
* Fix failing tests on Windows

* Style

* Add windows-latest to workflows file

* Fix workflows file
mariosasko authored Jul 26, 2021
1 parent e14d79e commit ca6a77c
Showing 3 changed files with 41 additions and 30 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/pythonpackage.yml
@@ -9,10 +9,11 @@ on:
 jobs:
   build:

-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     strategy:
       max-parallel: 4
       matrix:
+        os: [ubuntu-latest, windows-latest]
         python-version: [3.6, 3.7, 3.8, 3.9]

     steps:
8 changes: 4 additions & 4 deletions spacy_udpipe/tokenizer.py
@@ -6,7 +6,7 @@
 from spacy.util import registry
 from ufal.udpipe import Sentence, Word

-from .udpipe import NO_SPACE, UDPipeModel
+from .udpipe import UDPipeModel
 from .utils import get_path


@@ -40,7 +40,7 @@ def _spacy_dep(d: str) -> str:
     return d.upper() if d == "root" else d


-class UDPipeTokenizer(object):
+class UDPipeTokenizer:
     """Custom Tokenizer which sets all the attributes because
     the UDPipe pipeline runs only once and does not
     contain separate spaCy pipeline components.
@@ -97,7 +97,7 @@ def __call__(
         text = ""
         for token in tokens:
             text += token.form
-            if NO_SPACE not in token.misc:
+            if token.getSpaceAfter():
                 text += " "
         for i, token in enumerate(tokens):
             span = text[offset:]
@@ -117,7 +117,7 @@ def __call__(
                 lemmas.append(token.lemma or "")
             offset += len(token.form)
             span = text[offset:]
-            if i == len(tokens) - 1 or NO_SPACE in token.misc:
+            if i == len(tokens) - 1 or not token.getSpaceAfter():
                 spaces.append(False)
             elif not is_aligned:
                 spaces.append(True)
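Note on the change above: instead of string-matching "SpaceAfter=No" in the CoNLL-U MISC field, the tokenizer now asks UDPipe directly via getSpaceAfter(). A minimal sketch of the same reconstruction logic, not taken from this repository, using only the ufal.udpipe calls that appear in the diff (it assumes the bindings expose a sentence.words vector whose index 0 is the technical root word):

from ufal.udpipe import Sentence

# Build a tiny sentence by hand; addWord, setSpaceAfter and getSpaceAfter
# are the same ufal.udpipe methods used in the diff above.
sentence = Sentence()
for form in ("Hello", ",", "world"):
    sentence.addWord(form)

words = sentence.words[1:]       # skip the technical root word
words[0].setSpaceAfter(False)    # glue the comma to "Hello"

# Reconstruct the surface text the way UDPipeTokenizer.__call__ does:
text = ""
for word in words:
    text += word.form
    if word.getSpaceAfter():     # replaces: NO_SPACE not in word.misc
        text += " "

print(text.strip())              # -> Hello, world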
60 changes: 35 additions & 25 deletions spacy_udpipe/udpipe.py
@@ -3,14 +3,30 @@

 from ufal.udpipe import InputFormat
 from ufal.udpipe import Model
-from ufal.udpipe import OutputFormat, ProcessingError, Sentence, Word
+from ufal.udpipe import OutputFormat, ProcessingError, Sentence

 from .utils import get_path

-NO_SPACE = "SpaceAfter=No"
-

-class PretokenizedInputFormat(object):
+def _default_model_meta(lang: str, name: str) -> Dict:
+    return {
+        "author": "Milan Straka & Jana Straková",
+        "description": "UDPipe pretrained model.",
+        "email": "[email protected]",
+        "lang": f"udpipe_{lang}",
+        "license": "CC BY-NC-SA 4.0",
+        "name": name,
+        "parent_package": "spacy_udpipe",
+        "pipeline": [
+            "Tokenizer", "Tagger", "Lemmatizer", "Parser"
+        ],
+        "source": "Universal Dependencies 2.5",
+        "url": "http://ufal.mff.cuni.cz/udpipe",
+        "version": "1.2.0"
+    }
+
+
+class PretokenizedInputFormat:
     """Dummy tokenizer for pretokenized input.
     Execution speed might be slow compared to other UDPipe tokenizers
@@ -35,19 +51,19 @@ def nextSentence(self, sentence: Sentence, _: ProcessingError) -> bool:
             line = next(self.lines)
         except StopIteration:
             return False

         tokens = line.split("\t")
-        prev_word = Word()
-        for token in tokens:
+        num_tokens = len(tokens)
+        for i, token in enumerate(tokens):
             word = sentence.addWord(token)
-            if re.match(r"\W", token):
-                # leave no space after previous token iff current token
+            if i < num_tokens - 1 and re.match(r"\W", tokens[i + 1]):
+                # leave no space after current token iff next token
                 # is non-alphanumeric (i.e. punctuation)
-                prev_word.misc = NO_SPACE
-            prev_word = word
+                word.setSpaceAfter(False)
         return True


-class UDPipeModel(object):
+class UDPipeModel:

     def __init__(
         self,
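Worked example for the rewritten loop (hypothetical input, not from the test suite): it now decides spacing by looking ahead at the next token rather than retroactively patching the previous word's MISC field.

import re

line = "Hello\t,\tworld\t!"    # one tab-separated, pretokenized sentence
tokens = line.split("\t")
num_tokens = len(tokens)

# Same look-ahead rule as nextSentence above: no space after the current
# token iff the next token is non-alphanumeric (i.e. punctuation).
space_after = [
    not (i < num_tokens - 1 and re.match(r"\W", tokens[i + 1]))
    for i in range(num_tokens)
]

text = "".join(
    token + (" " if space and i < num_tokens - 1 else "")
    for i, (token, space) in enumerate(zip(tokens, space_after))
)
print(text)                    # -> Hello, world!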
@@ -64,20 +80,14 @@ def __init__(
         path = path or get_path(lang=lang)
         self.model = Model.load(path)
         self._lang = lang.split("-")[0]
-        self._meta = meta or {"author": "Milan Straka & Jana Straková",
-                              "description": "UDPipe pretrained model.",
-                              "email": "[email protected]",
-                              "lang": f"udpipe_{self._lang}",
-                              "license": "CC BY-NC-SA 4.0",
-                              "name": path.split("/")[-1],
-                              "parent_package": "spacy_udpipe",
-                              "pipeline": [
-                                  "Tokenizer", "Tagger", "Lemmatizer", "Parser"
-                              ],
-                              "source": "Universal Dependencies 2.5",
-                              "url": "http://ufal.mff.cuni.cz/udpipe",
-                              "version": "1.2.0"
-                              }
+        self._path = path
+        self._meta = meta or _default_model_meta(
+            self._lang, self._path.split("/")[-1]
+        )

+    def __reduce__(self):
+        # required for multiprocessing on Windows
+        return self.__class__, (self._lang, self._path, self._meta)
+
     def __call__(
         self,
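Why __reduce__ fixes Windows: multiprocessing spawns fresh interpreter processes there instead of forking, so every object handed to a worker must survive a pickle round trip, and the underlying ufal.udpipe Model handle is not picklable. The tuple returned by __reduce__ tells pickle to rebuild the wrapper from (lang, path, meta), re-loading the binary model in the child process. A hedged sketch, assuming the model for "en" is fetched with the package's documented download helper:

import pickle

import spacy_udpipe
from spacy_udpipe.udpipe import UDPipeModel

spacy_udpipe.download("en")                # one-time model download
model = UDPipeModel(lang="en")

# Round-trips via __reduce__: pickle stores (lang, path, meta) and
# reconstruction re-runs UDPipeModel(lang, path, meta) on load.
clone = pickle.loads(pickle.dumps(model))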
