Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spell Checker #80

Merged
merged 11 commits into from
Jan 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
max-line-length = 180
exclude = toro.py
exclude = toro.py, streamparser
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "streamparser"]
path = streamparser
url = https://github.com/goavki/streamparser.git
6 changes: 5 additions & 1 deletion modeSearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,17 @@ def searchPath(rootpath, include_pairs=True, verbosity=1):
'pair': re.compile(r'({0})-({0})\.mode'.format(lang_code)),
'analyzer': re.compile(r'(({0}(-{0})?)-(an)?mor(ph)?)\.mode'.format(lang_code)),
'generator': re.compile(r'(({0}(-{0})?)-gener[A-z]*)\.mode'.format(lang_code)),
'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code))
'tagger': re.compile(r'(({0}(-{0})?)-tagger)\.mode'.format(lang_code)),
'spell': re.compile(r'(({0}(-{0})?)-spell)\.mode'.format(lang_code)),
'tokenise': re.compile(r'(({0}(-{0})?)-tokenise)\.mode'.format(lang_code)),
}
modes = {
'pair': [],
'analyzer': [],
'generator': [],
'tagger': [],
'spell': [],
'tokenise': [],
} # type: Dict[str, List[Tuple[str, str, str]]]

real_root = os.path.abspath(os.path.realpath(rootpath))
Expand Down
4 changes: 4 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[mypy]

[mypy-streamparser]
ignore_errors = True
68 changes: 59 additions & 9 deletions servlet.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,19 @@
from tornado.log import enable_pretty_logging
except ImportError: # 2.1
from tornado.options import enable_pretty_logging # type: ignore
try:
import cld2full as cld2 # type: ignore
except ImportError as _e:
cld2 = None

from modeSearch import searchPath
from keys import getKey
from util import (getLocalizedLanguages, stripTags, processPerWord, getCoverage, getCoverages, toAlpha3Code,
toAlpha2Code, scaleMtLog, TranslationInfo, removeDotFromDeformat)

import systemd
import missingdb
import translation # type: ignore

try:
import cld2full as cld2 # type: ignore
except ImportError as _e:
cld2 = None
from streamparser.streamparser import parse, known

RECAPTCHA_VERIFICATION_URL = 'https://www.google.com/recaptcha/api/siteverify'
bypassToken = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(24))
Expand Down Expand Up @@ -102,6 +101,7 @@ class BaseHandler(tornado.web.RequestHandler):
analyzers = {} # type: Dict[str, Tuple[str, str]]
generators = {} # type: Dict[str, Tuple[str, str]]
taggers = {} # type: Dict[str, Tuple[str, str]]
spellers = {} # type: Dict[str, Tuple[str, str]]
# (l1, l2): [translation.Pipeline], only contains flushing pairs!
pipelines = {} # type: Dict[str, List]
pipelines_holding = [] # type: List
Expand Down Expand Up @@ -295,8 +295,10 @@ def langs(foo):
self.sendResponse({pair: modename for (pair, (path, modename)) in self.generators.items()})
elif query == 'taggers' or query == 'disambiguators':
self.sendResponse({pair: modename for (pair, (path, modename)) in self.taggers.items()})
elif query == 'spellers':
self.sendResponse({lang_src: modename for (lang_src, (path, modename)) in self.spellers.items()})
else:
self.send_error(400, explanation='Expecting q argument to be one of analysers, generators, disambiguators, or pairs')
self.send_error(400, explanation='Expecting q argument to be one of analysers, generators, spellers, disambiguators, or pairs')


class StatsHandler(BaseHandler):
Expand Down Expand Up @@ -908,6 +910,50 @@ def get(self):
self.send_error(400, explanation='That mode is not installed')


class SpellerHandler(BaseHandler):
    """Handle /speller requests: spell-check text in a given language.

    Query arguments:
        q    -- the text to check; a trailing '*' is appended so the final
                (possibly incomplete) token is treated as a prefix
        lang -- language code; resolved with toAlpha3Code for speller lookup

    Responds with a JSON list of units, one per token:
        {'token': <wordform>, 'known': <bool>, 'sugg': [(suggestion, weight), ...]}
    Sends a 404 when no speller mode is installed for the language.
    """

    @gen.coroutine
    def get(self):
        in_text = self.get_argument('q') + '*'
        in_mode = toAlpha3Code(self.get_argument('lang'))
        # NOTE(review): these info-level logs look like leftover debugging;
        # kept for behavior parity — consider demoting to logging.debug.
        logging.info(in_text)
        logging.info(self.get_argument('lang'))
        logging.info(in_mode)
        logging.info(self.spellers)
        if in_mode in self.spellers:
            logging.info(self.spellers[in_mode])
            path, mode = self.spellers[in_mode]
            logging.info(path)
            logging.info(mode)
            formatting = 'none'
            # First pass: tokenise the input so each token can be classified.
            commands = [['apertium', '-d', path, '-f', formatting, self.get_argument('lang') + '-tokenise']]
            result = yield translation.translateSimple(in_text, commands)

            units = []
            for token in parse(result):
                if token.knownness == known:
                    units.append({'token': token.wordform, 'known': True, 'sugg': []})
                    continue
                # Second pass: run the speller mode on the unknown token and
                # harvest (suggestion, weight) pairs from its output.
                suggestions = []
                commands = [['apertium', '-d', path, '-f', formatting, mode]]
                result = yield translation.translateSimple(token.wordform, commands)
                found_sugg = False
                for line in result.splitlines():
                    if 'Corrections for' in line:
                        # Header line preceding the suggestion list.
                        found_sugg = True
                        continue
                    if found_sugg and '\t' in line:
                        # Split on the first tab only, so a suggestion that
                        # itself contains a tab cannot raise ValueError.
                        suggestion, weight = line.split('\t', 1)
                        suggestions.append((suggestion, weight))
                units.append({'token': token.wordform, 'known': False, 'sugg': suggestions})

            self.sendResponse(units)
        else:
            self.send_error(404, explanation="{} on spellchecker mode: {}".format('Error 404', 'Spelling mode for ' + in_mode + ' is not installed'))


class GenerateHandler(BaseHandler):
def preproc_text(self, in_text):
lexical_units = re.findall(r'(\^[^\$]*\$[^\^]*)', in_text)
Expand Down Expand Up @@ -1277,6 +1323,9 @@ def setupHandler(
Handler.generators[lang_pair] = (dirpath, modename)
for dirpath, modename, lang_pair in modes['tagger']:
Handler.taggers[lang_pair] = (dirpath, modename)
for dirpath, modename, lang_src in modes['spell']:
if (any(lang_src == elem[2] for elem in modes['tokenise'])):
Handler.spellers[lang_src] = (dirpath, modename)

Handler.initPairsGraph()
Handler.initPaths()
Expand Down Expand Up @@ -1358,7 +1407,7 @@ def apply_config(args, apySection):
parser.add_argument('-V', '--version', help='show APY version', action='version', version="%(prog)s version " + __version__)
parser.add_argument('-S', '--scalemt-logs', help='generates ScaleMT-like logs; use with --log-path; disables', action='store_true')
parser.add_argument('-M', '--unknown-memory-limit',
help="keeps unknown words in memory until a limit is reached; use with --missing-freqs (default = 1000)", type=int, default=1000)
help='keeps unknown words in memory until a limit is reached; use with --missing-freqs (default = 1000)', type=int, default=1000)
parser.add_argument('-T', '--stat-period-max-age',
help='How many seconds back to keep track request timing stats (default = 3600)', type=int, default=3600)
parser.add_argument('-wp', '--wiki-password', help="Apertium Wiki account password for SuggestionHandler", default=None)
Expand Down Expand Up @@ -1440,7 +1489,8 @@ def apply_config(args, apySection):
(r'/identifyLang', IdentifyLangHandler),
(r'/getLocale', GetLocaleHandler),
(r'/pipedebug', PipeDebugHandler),
(r'/suggest', SuggestionHandler)
(r'/suggest', SuggestionHandler),
(r'/speller', SpellerHandler),
])

if args.bypass_token:
Expand Down
1 change: 1 addition & 0 deletions streamparser
Submodule streamparser added at fdc640