Feature/Pretokenized and presegmented text #19

Merged · 3 commits · May 9, 2020
README.md: 5 changes (5 additions, 0 deletions)
@@ -34,6 +34,11 @@ for token in doc:
```
As all attributes are computed once and set in the custom [`Tokenizer`](https://spacy.io/api/tokenizer), the `Language.pipeline` is empty.

+The type of `text` can be one of the following (see the sketch below):
+* unprocessed: `str`,
+* presegmented: `List[str]`,
+* pretokenized: `List[List[str]]`.
+
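As a quick illustration of the three input forms, here is a minimal sketch. It assumes the `en` model has been downloaded and that the pipeline is created with the package's usual `spacy_udpipe.load` entry point:
```python
import spacy_udpipe

spacy_udpipe.download("en")  # one-time model download
nlp = spacy_udpipe.load("en")

# unprocessed: UDPipe segments and tokenizes the raw string itself
doc = nlp("The quick brown fox jumps. It was fast.")

# presegmented: one string per sentence; tokenization is left to UDPipe
doc = nlp(["The quick brown fox jumps.", "It was fast."])

# pretokenized: one list of tokens per sentence, used as-is
doc = nlp([["The", "quick", "brown", "fox", "jumps", "."],
           ["It", "was", "fast", "."]])

# all annotation is set by the custom tokenizer, so the pipeline stays empty
assert nlp.pipeline == []
```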
#### Loading a custom model
The following code snippet demonstrates how to load a custom `UDPipe` model (for the Croatian language):
```python
# …
```
spacy_udpipe/language.py: 152 changes (25 additions, 127 deletions)
@@ -1,134 +1,15 @@
import re
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy
from spacy.language import Language
from spacy.symbols import DEP, HEAD, LEMMA, POS, TAG
from spacy.tokens import Doc
from spacy.vocab import Vocab
-from ufal.udpipe import (InputFormat, Model, OutputFormat, ProcessingError,
-                         Sentence, Word)
+from ufal.udpipe import Sentence, Word

-from .utils import get_defaults, get_path


-class UDPipeModel(object):
-
-    def __init__(
-        self,
-        lang: str,
-        path: Optional[str] = None,
-        meta: Optional[Dict] = None
-    ):
-        """Load UDPipe model for given language.
-
-        lang: ISO 639-1 language code or shorthand UDPipe model name.
-        path: Path to UDPipe model.
-        meta: Meta-information about the UDPipe model.
-        """
-        path = path or get_path(lang=lang)
-        self.model = Model.load(path)
-        if self.model is None:
-            raise Exception(f"Cannot load UDPipe model from file '{path}'")
-        self._lang = lang.split("-")[0]
-        self._meta = meta or {"author": "Milan Straka & Jana Straková",
-                              "description": "UDPipe pretrained model.",
-                              "email": "[email protected]",
-                              "lang": f"udpipe_{self._lang}",
-                              "license": "CC BY-NC-SA 4.0",
-                              "name": path.split("/")[-1],
-                              "parent_package": "spacy_udpipe",
-                              "pipeline": [
-                                  "Tokenizer", "Tagger", "Lemmatizer", "Parser"
-                              ],
-                              "source": "Universal Dependencies 2.5",
-                              "url": "http://ufal.mff.cuni.cz/udpipe",
-                              "version": "1.2.0"
-                              }
-
-    def __call__(self, text: str) -> List[Sentence]:
-        """Tokenize, tag and parse the text and return it in an UDPipe
-        representation.
-
-        text: Input text.
-        RETURNS: Processed sentences.
-        """
-        sentences = self.tokenize(text)
-        for s in sentences:
-            self.tag(s)
-            self.parse(s)
-        return sentences
-
-    def _read(self, text: str, input_format: str) -> List[Sentence]:
-        """Convert the text to an UDPipe representation.
-
-        text: Input text.
-        input_format: Desired input format.
-        RETURNS: Processed sentences.
-        """
-        input_format.setText(text)
-        error = ProcessingError()
-        sentences = []
-
-        sentence = Sentence()
-        while input_format.nextSentence(sentence, error):
-            sentences.append(sentence)
-            sentence = Sentence()
-        if error.occurred():
-            raise Exception(error.message)
-
-        return sentences
-
-    def tokenize(self, text: str) -> List[Sentence]:
-        """Tokenize input text.
-
-        text: Input text.
-        RETURNS: Processed sentences.
-        """
-        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
-        if not tokenizer:
-            raise Exception("The model does not have a tokenizer")
-        return self._read(text=text, input_format=tokenizer)
-
-    def tag(self, sentence: Sentence) -> None:
-        """Assign part-of-speech tags (inplace).
-
-        sentence: Input sentence.
-        """
-        self.model.tag(sentence, self.model.DEFAULT)
-
-    def parse(self, sentence: Sentence) -> None:
-        """Assign dependency parse relations (inplace).
-
-        sentence: Input sentence.
-        """
-        self.model.parse(sentence, self.model.DEFAULT)
-
-    def read(self, text: str, in_format: str) -> List[Sentence]:
-        """Load text in the given format and return it in an UDPipe
-        representation.
-
-        text: Text to load.
-        in_format: 'conllu'|'horizontal'|'vertical'.
-        RETURNS: Processed sentences.
-        """
-        input_format = InputFormat.newInputFormat(in_format)
-        if not input_format:
-            raise Exception(f"Cannot create input format '{in_format}'")
-        return self._read(text=text, input_format=input_format)
-
-    def write(self, sentences: List[Sentence], out_format: str) -> str:
-        """Write given sentences in the required output format.
-
-        sentences: Input ufal.udpipe.Sentence-s.
-        out_format: 'conllu'|'horizontal'|'vertical'.
-        RETURNS: Sentences formatted in the out_format.
-        """
-        output_format = OutputFormat.newOutputFormat(out_format)
-        output = "".join([output_format.writeSentence(s) for s in sentences])
-        output += output_format.finishDocument()
-
-        return output
+from .udpipe import UDPipeModel
+from .utils import get_defaults
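`UDPipeModel` now lives in `spacy_udpipe/udpipe.py`, as the new import above shows. For reference, a minimal sketch of its standalone use, based only on the methods visible in the removed block; it assumes the `en` model has already been fetched with `spacy_udpipe.download("en")`:
```python
from spacy_udpipe.udpipe import UDPipeModel

model = UDPipeModel(lang="en")

# __call__ chains tokenize -> tag -> parse and returns ufal.udpipe Sentence objects
sentences = model("The quick brown fox jumps over the lazy dog.")

# write() serializes sentences to 'conllu', 'horizontal' or 'vertical'
conllu = model.write(sentences, out_format="conllu")

# read() loads annotated text back in from any of those formats
sentences = model.read(conllu, in_format="conllu")
```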


class UDPipeTokenizer(object):
@@ -165,10 +46,20 @@ def _dep(self, d: str) -> str:
        # Ensure labels match with SpaCy
        return d.upper() if d == "root" else d

-    def __call__(self, text: str) -> Doc:
+    def __call__(
+        self,
+        text: Union[
+            str,
+            List[str],
+            List[List[str]]
+        ]
+    ) -> Doc:
        """Convert input text to a spaCy Doc.

-        text: The text to process.
+        text: The text to process. It can be presegmented or pretokenized:
+            str             : raw text,
+            List[str]       : presegmented text,
+            List[List[str]] : pretokenized text.
        RETURNS: The spaCy Doc object.
        """
        udpipe_sents = self.model(text=text) if text else [Sentence()]
@@ -246,10 +137,17 @@ def __call__(self, text: str) -> Doc:
        doc.is_parsed = bool(any(deps))
        return doc

-    def pipe(self, texts: Iterable[str]) -> Iterable[Doc]:
+    def pipe(
+        self,
+        texts: Union[
+            Iterable[str],
+            Iterable[List[str]],
+            Iterable[List[List[str]]]
+        ]
+    ) -> Iterable[Doc]:
        """Tokenize a stream of texts.

-        texts: A sequence of unicode texts.
+        texts: A sequence of unicode texts (raw, presegmented or pretokenized).
        YIELDS: A sequence of Doc objects, in order.
        """
        for text in texts:
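Finally, to close the loop on the widened `pipe` signature, a small usage sketch mirroring the input forms from the README hunk; it again assumes a pipeline from `spacy_udpipe.load("en")`:
```python
import spacy_udpipe

nlp = spacy_udpipe.load("en")

# a single stream may mix raw, presegmented and pretokenized items
texts = [
    "Raw text, segmented and tokenized by UDPipe.",
    ["One presegmented sentence.", "Another one."],
    [["One", "pretokenized", "sentence", "."]],
]
for doc in nlp.pipe(texts):
    print([token.text for token in doc])
```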