-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
62 lines (50 loc) · 2.3 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Advertência: o código abaixo é apenas *ilustrativo* e somente funciona no site dos autores.
Não é possível rodá-lo em um computador doméstico.
"""
import ufal.udpipe
import os
# Absolute directory containing this module; the Model class joins the
# bundled UDPipe model file name onto it.
module_dir = os.path.dirname(os.path.abspath(__file__))
class Model:
    """Thin wrapper around a ``ufal.udpipe.Model`` loaded from the bundled file.

    Provides helpers to tokenize or read text into ``ufal.udpipe.Sentence``
    objects, tag and parse them in place, and serialize them back to text.
    """

    # File name of the model that is always loaded, resolved against module_dir.
    _MODEL_FILENAME = 'portuguese-bosque-ud-2.4-190531.udpipe'

    def __init__(self, path):
        """Load the bundled model.

        Args:
            path: Ignored. The model is always loaded from the bundled file
                next to this module; the parameter is kept so that existing
                callers keep working.

        Raises:
            Exception: If the model file cannot be loaded.
        """
        model_path = os.path.join(module_dir, self._MODEL_FILENAME)
        self.model = ufal.udpipe.Model.load(model_path)
        if not self.model:
            # Report the file that was actually attempted, not the unused
            # ``path`` argument, so the message points at the real culprit.
            raise Exception("Cannot load UDPipe model from file '%s'" % model_path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: If the model provides no tokenizer, or reading fails.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: If the input format cannot be created, or reading fails.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields.

        Args:
            text: The text to process.
            input_format: Object exposing ``setText`` and ``nextSentence``
                (a tokenizer from ``tokenize`` or an input format from ``read``).

        Returns:
            List of the sentences produced, in order.

        Raises:
            Exception: With the reader's error message if an error occurred.
        """
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            # A fresh object per iteration: the one just appended is kept in
            # the list and must not be overwritten by the next read.
            sentence = ufal.udpipe.Sentence()

        # The loop also ends on failure, so the error state is checked afterwards.
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical).

        Returns:
            The concatenated output of every sentence followed by the
            document-finishing output.

        Raises:
            Exception: If the output format cannot be created.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        if not output_format:
            # Mirror the check in read() instead of failing later with an
            # AttributeError on None.
            raise Exception("Cannot create output format '%s'" % out_format)

        # All sentences are written first, then the document is finished.
        body = ''.join(output_format.writeSentence(sentence) for sentence in sentences)
        return body + output_format.finishDocument()