Skip to content

Commit

Permalink
setup basic FR preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolaspanel committed Feb 19, 2019
1 parent 1d9be5e commit 824bf74
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,4 @@ MANIFEST

# Per-project virtualenvs
.virtualenv/
.python-version
9 changes: 9 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,12 @@ As in the case of abbreviations, you can hear the audio for a given sentence and
.. _tab separated file: https://en.wikipedia.org/wiki/Tab-separated_values
.. _common.py: https://github.com/mozilla/CorporaCreator/blob/master/src/corporacreator/preprocessors/common.py
.. _en.py: https://github.com/mozilla/CorporaCreator/blob/master/src/corporacreator/preprocessors/en.py


Development
===========

::

$ pip install -r requirements.txt
$ pytest
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
testpaths = tests

6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,8 @@
# Example:
# numpy==1.13.3
# scipy==1.0

pytest==4.3.0
pandas==0.24.1
swifter==0.283
unidecode==1.0.23
num2words==0.5.9
34 changes: 32 additions & 2 deletions src/corporacreator/preprocessors/fr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,29 @@
import re

import unidecode

from corporacreator.utils import maybe_normalize, replace_numbers, FIND_PUNCTUATIONS_REG, FIND_MULTIPLE_SPACES_REG

# Matches a number immediately followed by a French ordinal suffix, e.g.
# "1er", "1re", "2e", "2ème", "2éme", "3ième", "1ère" (optionally plural).
# Group 1 is the number, group 2 the suffix.
# The suffixes are spelled out as real alternatives: a character class such as
# "[ème|ier]+" accepts any mix of those letters (and "|"), which made plain
# measurements like "10m" look like ordinals. The trailing \b ensures the
# suffix is the end of the word.
FIND_ORDINAL_REG = re.compile(r"(\d+)((?:i?[eèé]me|i?[eè]re|ier|er|re|e)s?)\b")


# Ordered normalization rules for French sentences.
# Each entry is a [pattern, replacement] pair; the pattern is either a plain
# string or a compiled regex. Rules are listed in the order they are meant to
# be applied, so later rules see the output of earlier ones.
# Most regex rules anchor on a leading delimiter (start or whitespace) and a
# trailing delimiter (whitespace, ".", ",", "?", "!" or end) so that they only
# fire on standalone tokens.
FR_NORMALIZATIONS = [
    [re.compile(r'(^|\s)(\d+)\s(0{3})(\s|\.|,|\?|!|$)'), r'\1\2\3\4'],  # "123 000 …" => "123000 …"
    # standalone "km" => " kilomètres "
    [re.compile(r'(^|\s)km(\s|\.|,|\?|!|$)'), r'\1 kilomètres \2'],
    # standalone two-character token "0<digit>", e.g. "07" => "zéro 7"
    [re.compile(r'(^|\s)0(\d)(\s|\.|,|\?|!|$)'), r'\1zéro \2 \3'],
    # every "%" => " pourcent"
    ['%', ' pourcent'],
    # standalone "+" => " plus "
    [re.compile(r'(^|\s)\+(\s|\.|,|\?|!|$)'), r'\1 plus \2'],
    # "<number> m2" / "<number>m²" => "<number> mètre carré"
    [re.compile(r'(\d+)\s?m(?:2|²)(\s|\.|,|\?|!|$)'), r'\1 mètre carré\2'],
    # standalone "m2" / "m²" (also right after "/"); the leading delimiter is
    # replaced by a single space
    [re.compile(r'(^|\s|/)m(?:2|²)(\s|\.|,|\?|!|$)'), r' mètre carré\2'],
    # "<int>,<2 digits> €" => "<int> euros <2 digits>"
    [re.compile(r'(^|\s)(\d+),(\d{2})\s?€(\s|\.|,|\?|!|$)'), r'\1\2 euros \3 \4'],
    # "€" followed by more text => " euros" + that text
    [re.compile(r'\s?€(.+)'), r' euros\1'],
    # "€" at the very end => " euros"
    [re.compile(r'\s?€$'), r' euros'],
    # "n°" / "nº" (case-insensitive, optional following whitespace) => "numéro ",
    # keeping the original case of the "n"
    [re.compile(r'(^| )(n)(?:°|º|°)(\s)?', flags=re.IGNORECASE), r'\1\2uméro '],
    # "<hours>h<minutes>" => "<hours> heure <minutes>" (minutes may be empty)
    [re.compile(r'(^|\s)(\d+)h(\d*)(\s|\.|,|$)'), r'\1\2 heure \3\4'],
    # bare "<hours>h"; still reachable when the previous rule skipped a token
    # because its leading whitespace was consumed by the preceding match
    [re.compile(r'(^|\s)(\d+)h(\s|\.|,|$)'), r'\1\2 heure \3'],
]


def fr(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Expand All @@ -8,5 +34,9 @@ def fr(client_id, sentence):
Returns:
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
"""
# TODO: Clean up fr data
return sentence
text = maybe_normalize(sentence, mapping=FR_NORMALIZATIONS)
text = replace_numbers(text, locale='fr', ordinal_regex=FIND_ORDINAL_REG)
text = text.replace('’', "'").replace('\u00A0', ' ')
text = FIND_PUNCTUATIONS_REG.sub(' ', text)
text = FIND_MULTIPLE_SPACES_REG.sub(' ', text)
return unidecode.unidecode(text).strip().lower()
51 changes: 51 additions & 0 deletions src/corporacreator/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import re
from typing import Pattern

from num2words import num2words


# Finds number-like tokens. Three alternatives, each in its own capturing
# group, tried in this order at every position:
#   1. two or more digits, optionally separated once by "," and/or a
#      non-breaking space (e.g. "12", "1,5", "390<NBSP>000");
#   2. digits immediately followed by word characters (e.g. "2éme");
#   3. one or more digits; as a repeated group it captures only the last
#      digit matched.
# NOTE(review): because alternative 1 is tried first, a multi-digit number
# followed by letters (e.g. "12ème") is matched as "12" only and the letters
# are left behind as plain text — confirm whether that is intended.
NUMS_REGEX = re.compile(r"(\d+,?\u00A0?\d+)|(\d+\w+)|(\d)+")
# Two or more consecutive whitespace characters.
FIND_MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
# Any single character from the explicit punctuation set listed in the class.
FIND_PUNCTUATIONS_REG = re.compile(r"[/°\-,;!?.()\[\]*…—]")


def get_numbers(text):
    """Split ``text`` around every match of ``NUMS_REGEX``.

    Because the pattern uses capturing groups, the returned list interleaves
    the text found between matches with the captured groups of each match.
    Groups that did not take part in a given match show up as ``None``, so
    callers have to skip empty/``None`` items.
    """
    return re.split(NUMS_REGEX, text)


def _spell_out_token(token: str, locale: str, ordinal_regex: Pattern = None) -> str:
    """Return ``token`` spelled out in words, or unchanged if it is not a number."""
    # Number tokens produced by NUMS_REGEX always start with a decimal digit.
    # Bailing out early keeps plain text such as "inf" or "nan" (which
    # float() happily parses) away from num2words.
    if not token or not token[0].isdecimal():
        return token

    # Drop grouping whitespace (including non-breaking spaces) before parsing;
    # int()/float() reject whitespace inside the literal.
    compact = ''.join(token.split())

    try:
        return num2words(int(compact), lang=locale)
    except ValueError:
        pass

    try:
        # A decimal comma is rewritten to the "." that float() expects.
        # Zero-valued decimals ("0,0") are spelled out like any other value.
        return num2words(float(compact.replace(',', '.')), lang=locale)
    except ValueError:
        pass

    if ordinal_regex is not None:
        match = ordinal_regex.match(token)
        if match:
            return num2words(int(match.group(1)), ordinal=True, lang=locale)

    return token


def replace_numbers(inp: str, locale: str, ordinal_regex: Pattern = None):
    """Replace the numbers found in ``inp`` with their spelled-out form.

    The text is tokenized with ``get_numbers``; each token is tried, in order,
    as an integer, as a decimal number (with "," accepted as decimal
    separator) and, when ``ordinal_regex`` is given, as an ordinal. Tokens
    that fit none of these are copied through unchanged.

    Args:
        inp: Text to process.
        locale: Language code handed to ``num2words`` as ``lang``.
        ordinal_regex: Optional compiled pattern whose first group captures
            the numeric part of an ordinal token (e.g. "2" in "2ème").

    Returns:
        str: ``inp`` with every recognized number replaced by words.
    """
    pieces = []
    for token in get_numbers(inp):
        # The split yields None / '' for regex groups that did not take part
        # in a match; those carry no text.
        if not token:
            continue
        pieces.append(_spell_out_token(token, locale, ordinal_regex))
    return ''.join(pieces)


def maybe_normalize(value: str, mapping):
    """Apply a sequence of normalization rules to ``value``.

    Args:
        value: Text to normalize.
        mapping: Iterable of ``[pattern, replacement]`` pairs, applied in
            order. A ``str`` pattern is replaced literally everywhere it
            occurs; a compiled regex is applied with ``Pattern.sub``.

    Returns:
        str: The text after all rules have been applied. Rules whose pattern
        is neither a string nor a compiled regex are reported on stdout and
        skipped.
    """
    for norm in mapping:
        pattern = norm[0]
        # isinstance (rather than an exact type comparison) so that str
        # subclasses are treated as literal patterns too.
        if isinstance(pattern, str):
            value = value.replace(pattern, norm[1])
        elif isinstance(pattern, Pattern):
            value = pattern.sub(norm[1], value)
        else:
            print('UNEXPECTED', type(pattern), pattern)

    return value
7 changes: 6 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,10 @@
Read more about conftest.py under:
https://pytest.org/latest/plugins.html
"""
import os
import site

# import pytest
# Register the ``src`` directory that sits one level above this file's
# directory as a site directory, so the ``corporacreator`` package can be
# imported from the tests. Nothing happens when that directory is absent.
_parent_dir = os.path.dirname(os.path.dirname(__file__))
src_path = os.path.join(_parent_dir, 'src')
if os.path.isdir(src_path):
    site.addsitedir(src_path)
21 changes: 21 additions & 0 deletions tests/test_preprocessors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pytest

from corporacreator import preprocessors


# One row per scenario: (locale, client_id, raw sentence, expected output).
PREPROCESSOR_CASES = [
    ('fr', '*', 'Faisons donc attention à utiliser les bons mots.', 'faisons donc attention a utiliser les bons mots'),
    ('fr', '*', "bah 98%", "bah quatre vingt dix huit pourcent"),
    ('fr', '*', "prix au m2", "prix au metre carre"),
    ('fr', '*', "prix au m²", "prix au metre carre"),
    ('fr', '*', "10 m²", "dix metre carre"),
    ('fr', '*', "2éme page", "deuxieme page"),
    ('fr', '*', "donc, ce sera 299 € + 99 €", "donc ce sera deux cent quatre vingt dix neuf euros plus quatre vingt dix neuf euros"),
    ('fr', '*', "ok pour 18h", "ok pour dix huit heure"),
    ('fr', '*', '2 0 200', "deux zero deux cents"),
    ('fr', '*', 'rue Coq-Héron au nº13', "rue coq heron au numero treize"),
    ('fr', '*', "En comparaison, la Lune orbite en moyenne à 390 000 km de la Terre", "en comparaison la lune orbite en moyenne a trois cent quatre vingt dix mille kilometres de la terre"),
]


@pytest.mark.parametrize('locale, client_id, sentence, expected', PREPROCESSOR_CASES)
def test_preprocessor(locale, client_id, sentence, expected):
    """Run the common step, then the locale preprocessor, and compare.

    The preprocessor is looked up on the ``preprocessors`` package by locale
    name with any "-" removed.
    """
    preprocess = getattr(preprocessors, locale.replace('-', ''))
    cleaned = preprocess(client_id, preprocessors.common(sentence))
    assert expected == cleaned

0 comments on commit 824bf74

Please sign in to comment.