Feature/Pretokenized and presegmented text #19

Merged · 3 commits · May 9, 2020
README.md: 5 changes (5 additions, 0 deletions)
@@ -34,6 +34,11 @@ for token in doc:
```
As all attributes are computed once and set in the custom [`Tokenizer`](https://spacy.io/api/tokenizer), the `Language.pipeline` is empty.

+The type of `text` can be one of the following (see the sketch below):
+* unprocessed: `str`,
+* presegmented: `List[str]`,
+* pretokenized: `List[List[str]]`.
+
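As a quick illustration of the three input forms, here is a minimal sketch. It assumes the `en` model has been downloaded and that the pipeline is created with the package's usual `spacy_udpipe.load` entry point:
```python
import spacy_udpipe

spacy_udpipe.download("en")  # one-time model download
nlp = spacy_udpipe.load("en")

# unprocessed: UDPipe segments and tokenizes the raw string itself
doc = nlp("The quick brown fox jumps. It was fast.")

# presegmented: one string per sentence; tokenization is left to UDPipe
doc = nlp(["The quick brown fox jumps.", "It was fast."])

# pretokenized: one list of tokens per sentence, used as-is
doc = nlp([["The", "quick", "brown", "fox", "jumps", "."],
           ["It", "was", "fast", "."]])

# all annotation is set by the custom tokenizer, so the pipeline stays empty
assert nlp.pipeline == []
```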
#### Loading a custom model
The following code snippet demonstrates how to load a custom `UDPipe` model (for the Croatian language):
```python
# …
```
spacy_udpipe/language.py: 152 changes (25 additions, 127 deletions)
@@ -1,134 +1,15 @@
import re
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy
from spacy.language import Language
from spacy.symbols import DEP, HEAD, LEMMA, POS, TAG
from spacy.tokens import Doc
from spacy.vocab import Vocab
-from ufal.udpipe import (InputFormat, Model, OutputFormat, ProcessingError,
-                         Sentence, Word)
+from ufal.udpipe import Sentence, Word

-from .utils import get_defaults, get_path


-class UDPipeModel(object):
-
-    def __init__(
-        self,
-        lang: str,
-        path: Optional[str] = None,
-        meta: Optional[Dict] = None
-    ):
-        """Load UDPipe model for given language.
-
-        lang: ISO 639-1 language code or shorthand UDPipe model name.
-        path: Path to UDPipe model.
-        meta: Meta-information about the UDPipe model.
-        """
-        path = path or get_path(lang=lang)
-        self.model = Model.load(path)
-        if self.model is None:
-            raise Exception(f"Cannot load UDPipe model from file '{path}'")
-        self._lang = lang.split("-")[0]
-        self._meta = meta or {"author": "Milan Straka & Jana Straková",
-                              "description": "UDPipe pretrained model.",
-                              "email": "[email protected]",
-                              "lang": f"udpipe_{self._lang}",
-                              "license": "CC BY-NC-SA 4.0",
-                              "name": path.split("/")[-1],
-                              "parent_package": "spacy_udpipe",
-                              "pipeline": [
-                                  "Tokenizer", "Tagger", "Lemmatizer", "Parser"
-                              ],
-                              "source": "Universal Dependencies 2.5",
-                              "url": "http://ufal.mff.cuni.cz/udpipe",
-                              "version": "1.2.0"
-                              }
-
-    def __call__(self, text: str) -> List[Sentence]:
-        """Tokenize, tag and parse the text and return it in an UDPipe
-        representation.
-
-        text: Input text.
-        RETURNS: Processed sentences.
-        """
-        sentences = self.tokenize(text)
-        for s in sentences:
-            self.tag(s)
-            self.parse(s)
-        return sentences
-
-    def _read(self, text: str, input_format: str) -> List[Sentence]:
-        """Convert the text to an UDPipe representation.
-
-        text: Input text.
-        input_format: Desired input format.
-        RETURNS: Processed sentences.
-        """
-        input_format.setText(text)
-        error = ProcessingError()
-        sentences = []
-
-        sentence = Sentence()
-        while input_format.nextSentence(sentence, error):
-            sentences.append(sentence)
-            sentence = Sentence()
-        if error.occurred():
-            raise Exception(error.message)
-
-        return sentences
-
-    def tokenize(self, text: str) -> List[Sentence]:
-        """Tokenize input text.
-
-        text: Input text.
-        RETURNS: Processed sentences.
-        """
-        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
-        if not tokenizer:
-            raise Exception("The model does not have a tokenizer")
-        return self._read(text=text, input_format=tokenizer)
-
-    def tag(self, sentence: Sentence) -> None:
-        """Assign part-of-speech tags (inplace).
-
-        sentence: Input sentence.
-        """
-        self.model.tag(sentence, self.model.DEFAULT)
-
-    def parse(self, sentence: Sentence) -> None:
-        """Assign dependency parse relations (inplace).
-
-        sentence: Input sentence.
-        """
-        self.model.parse(sentence, self.model.DEFAULT)
-
-    def read(self, text: str, in_format: str) -> List[Sentence]:
-        """Load text in the given format and return it in an UDPipe
-        representation.
-
-        text: Text to load.
-        in_format: 'conllu'|'horizontal'|'vertical'.
-        RETURNS: Processed sentences.
-        """
-        input_format = InputFormat.newInputFormat(in_format)
-        if not input_format:
-            raise Exception(f"Cannot create input format '{in_format}'")
-        return self._read(text=text, input_format=input_format)
-
-    def write(self, sentences: List[Sentence], out_format: str) -> str:
-        """Write given sentences in the required output format.
-
-        sentences: Input ufal.udpipe.Sentence-s.
-        out_format: 'conllu'|'horizontal'|'vertical'.
-        RETURNS: Sentences formatted in the out_format.
-        """
-        output_format = OutputFormat.newOutputFormat(out_format)
-        output = "".join([output_format.writeSentence(s) for s in sentences])
-        output += output_format.finishDocument()
-
-        return output
+from .udpipe import UDPipeModel
+from .utils import get_defaults
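`UDPipeModel` now lives in `spacy_udpipe/udpipe.py`, as the new import above shows. For reference, a minimal sketch of its standalone use, based only on the methods visible in the removed block; it assumes the `en` model has already been fetched with `spacy_udpipe.download("en")`:
```python
from spacy_udpipe.udpipe import UDPipeModel

model = UDPipeModel(lang="en")

# __call__ chains tokenize -> tag -> parse and returns ufal.udpipe Sentence objects
sentences = model("The quick brown fox jumps over the lazy dog.")

# write() serializes sentences to 'conllu', 'horizontal' or 'vertical'
conllu = model.write(sentences, out_format="conllu")

# read() loads annotated text back in from any of those formats
sentences = model.read(conllu, in_format="conllu")
```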


class UDPipeTokenizer(object):
@@ -165,10 +46,20 @@ def _dep(self, d: str) -> str:
        # Ensure labels match with SpaCy
        return d.upper() if d == "root" else d

-    def __call__(self, text: str) -> Doc:
+    def __call__(
+        self,
+        text: Union[
+            str,
+            List[str],
+            List[List[str]]
+        ]
+    ) -> Doc:
        """Convert input text to a spaCy Doc.

-        text: The text to process.
+        text: The text to process. It can be presegmented or pretokenized:
+            str             : raw text,
+            List[str]       : presegmented text,
+            List[List[str]] : pretokenized text.
        RETURNS: The spaCy Doc object.
        """
        udpipe_sents = self.model(text=text) if text else [Sentence()]
@@ -246,10 +137,17 @@ def __call__(self, text: str) -> Doc:
        doc.is_parsed = bool(any(deps))
        return doc

-    def pipe(self, texts: Iterable[str]) -> Iterable[Doc]:
+    def pipe(
+        self,
+        texts: Union[
+            Iterable[str],
+            Iterable[List[str]],
+            Iterable[List[List[str]]]
+        ]
+    ) -> Iterable[Doc]:
        """Tokenize a stream of texts.

-        texts: A sequence of unicode texts.
+        texts: A sequence of unicode texts (raw, presegmented or pretokenized).
        YIELDS: A sequence of Doc objects, in order.
        """
        for text in texts:
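Finally, to close the loop on the widened `pipe` signature, a small usage sketch mirroring the input forms from the README hunk; it again assumes a pipeline from `spacy_udpipe.load("en")`:
```python
import spacy_udpipe

nlp = spacy_udpipe.load("en")

# a single stream may mix raw, presegmented and pretokenized items
texts = [
    "Raw text, segmented and tokenized by UDPipe.",
    ["One presegmented sentence.", "Another one."],
    [["One", "pretokenized", "sentence", "."]],
]
for doc in nlp.pipe(texts):
    print([token.text for token in doc])
```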