setup basic FR preprocessing

common-voice · Feb 19, 2019 · 824bf74 · 824bf74
1 parent 1d9be5e
commit 824bf74
Show file tree

Hide file tree

Showing 8 changed files with 128 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -47,3 +47,4 @@ MANIFEST
 
 # Per-project virtualenvs
 .virtualenv/
+.python-version
diff --git a/README.rst b/README.rst
@@ -272,3 +272,12 @@ As in the case of abbreviations, you can hear the audio for a given sentence and
 .. _tab separated file: https://en.wikipedia.org/wiki/Tab-separated_values
 .. _common.py: https://github.com/mozilla/CorporaCreator/blob/master/src/corporacreator/preprocessors/common.py
 .. _en.py: https://github.com/mozilla/CorporaCreator/blob/master/src/corporacreator/preprocessors/en.py
+
+
+Development
+===========
+
+::
+
+    $ pip install -r requirements.txt
+    $ pytest
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+
diff --git a/requirements.txt b/requirements.txt
@@ -14,4 +14,8 @@
 # Example:
 # numpy==1.13.3
 # scipy==1.0
-
+pytest==4.3.0
+pandas==0.24.1
+swifter==0.283
+unidecode==1.0.23
+num2words==0.5.9
diff --git a/src/corporacreator/preprocessors/fr.py b/src/corporacreator/preprocessors/fr.py
@@ -1,3 +1,29 @@
+import re
+
+import unidecode
+
+from corporacreator.utils import maybe_normalize, replace_numbers, FIND_PUNCTUATIONS_REG, FIND_MULTIPLE_SPACES_REG
+
+# Number followed by a French ordinal suffix: "1er", "1re", "2e", "2ème", "2éme", "3ieme", ...
+# Group 1 is the number, group 2 the suffix.
+# The suffixes have to be alternatives, not a character class: a class such as
+# [ème|...]+ accepts any run of those letters, so "2m" or the start of "2emballages"
+# would be taken for an ordinal. The trailing \b requires the suffix to end the word.
+FIND_ORDINAL_REG = re.compile(r"(\d+)(ième|ieme|ière|iere|ier|ème|éme|eme|ère|ere|er|re|e)\b")
+
+
+# Ordered French normalization rules. Each entry is `[pattern, replacement]`: a plain
+# `str` pattern is replaced literally, a compiled regex is applied with `sub`.
+# Order matters, because every rule works on the output of the previous ones.
+FR_NORMALIZATIONS = [
+    [re.compile(r'(^|\s)(\d+)\s(0{3})(\s|\.|,|\?|!|$)'), r'\1\2\3\4'],  # "123 000 …" => "123000 …"
+    [re.compile(r'(^|\s)km(\s|\.|,|\?|!|$)'), r'\1 kilomètres \2'],
+    [re.compile(r'(^|\s)0(\d)(\s|\.|,|\?|!|$)'), r'\1zéro \2 \3'],  # "07" => "zéro 7"
+    ['%', ' pourcent'],
+    [re.compile(r'(^|\s)\+(\s|\.|,|\?|!|$)'), r'\1 plus \2'],
+    [re.compile(r'(\d+)\s?m(?:2|²)(\s|\.|,|\?|!|$)'), r'\1 mètre carré\2'],  # "10 m²"
+    [re.compile(r'(^|\s|/)m(?:2|²)(\s|\.|,|\?|!|$)'), r' mètre carré\2'],  # "prix au m2", "€/m2"
+    [re.compile(r'(^|\s)(\d+),(\d{2})\s?€(\s|\.|,|\?|!|$)'), r'\1\2 euros \3 \4'],  # "12,50 €" => "12 euros 50"
+    # Every remaining euro sign, wherever it sits in the sentence. A pattern that also
+    # captures the rest of the line (r'\s?€(.+)') swallows the later euro signs in its
+    # first match, so only the first and the trailing one would ever be replaced.
+    [re.compile(r'\s?€'), r' euros'],
+    [re.compile(r'(^| )(n)(?:°|º|°)(\s)?', flags=re.IGNORECASE), r'\1\2uméro '],  # "n°13" => "numéro 13"
+    [re.compile(r'(^|\s)(\d+)h(\d*)(\s|\.|,|$)'), r'\1\2 heure \3\4'],  # "18h30" => "18 heure 30"
+    # Second pass for bare hours: picks up e.g. the second time in "18h 19h", whose
+    # leading space was consumed by the previous rule's first match.
+    [re.compile(r'(^|\s)(\d+)h(\s|\.|,|$)'), r'\1\2 heure \3'],
+]
+
+
 def fr(client_id, sentence):
     """Cleans up the passed sentence, removing or reformatting invalid data.
 
@@ -8,5 +34,9 @@ def fr(client_id, sentence):
     Returns:
       (str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
     """
-    # TODO: Clean up fr data
-    return sentence
+    text = maybe_normalize(sentence, mapping=FR_NORMALIZATIONS)
+    text = replace_numbers(text, locale='fr', ordinal_regex=FIND_ORDINAL_REG)
+    text = text.replace('’', "'").replace('\u00A0', ' ')
+    text = FIND_PUNCTUATIONS_REG.sub(' ', text)
+    text = FIND_MULTIPLE_SPACES_REG.sub(' ', text)
+    return unidecode.unidecode(text).strip().lower()
diff --git a/src/corporacreator/utils.py b/src/corporacreator/utils.py
@@ -0,0 +1,51 @@
+import re
+from typing import Pattern
+
+from num2words import num2words
+
+
+# Matches one numeric token. Used with `split`, so the capture groups put the matched
+# tokens into the result list as well:
+#   1. at least two digits, optionally split by a "," and/or a no-break space ("12,5")
+#   2. digits immediately followed by word characters ("2éme")
+#   3. a lone digit
+NUMS_REGEX = re.compile(r"(\d+,?\u00A0?\d+)|(\d+\w+)|(\d)+")
+# Matches a run of two or more whitespace characters.
+FIND_MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')
+# Matches one punctuation character from a fixed set.
+FIND_PUNCTUATIONS_REG = re.compile(r"[/°\-,;!?.()\[\]*…—]")
+
+
+def get_numbers(text):
+    """Split `text` around its numeric tokens, keeping the tokens in the result.
+
+    The list comes straight from `NUMS_REGEX.split`: besides the text pieces it holds
+    the content of every capture group for each match, which is `None` for the groups
+    that did not take part in that match.
+    """
+    pieces = NUMS_REGEX.split(text)
+    return pieces
+
+
+def _number_to_words(token, locale, ordinal_regex):
+    """Return `token` spelled out in words, or `token` itself when it is not a number."""
+    # Chunks without any digit are ordinary text. Without this check float() would
+    # accept words such as "inf" or "nan" as numbers.
+    if any(char.isdigit() for char in token):
+        # Drop grouping whitespace (e.g. the no-break space in "390\u00A0000") before parsing.
+        compact = ''.join(token.split())
+        try:
+            return num2words(int(compact), lang=locale)
+        except ValueError:
+            pass
+        try:
+            # Decimal comma: "12,5" => 12.5
+            return num2words(float(compact.replace(',', '.')), lang=locale)
+        except ValueError:
+            pass
+    if ordinal_regex:
+        matches = ordinal_regex.match(token)
+        if matches:
+            return num2words(int(matches.group(1)), ordinal=True, lang=locale)
+    return token
+
+
+def replace_numbers(inp: str, locale: str, ordinal_regex: Pattern = None):
+    """Spell out the numbers found in `inp` using num2words.
+
+    Args:
+      inp (str): Text to process.
+      locale (str): Language code handed to num2words as `lang`.
+      ordinal_regex (Pattern): Optional compiled regex tried on tokens that are neither
+        integers nor decimals. When it matches at the start of a token, group 1 is read
+        as an integer and the whole token is replaced by the matching ordinal.
+
+    Returns:
+      (str): `inp` with every recognised number replaced by words; other text is kept as is.
+    """
+    pieces = []
+    for token in get_numbers(inp):
+        if not token:
+            # The split yields None or '' for capture groups that did not take part in a match.
+            continue
+        pieces.append(_number_to_words(token, locale, ordinal_regex))
+    return ''.join(pieces)
+
+
+def maybe_normalize(value: str, mapping):
+    """Apply every `[pattern, replacement]` rule of `mapping` to `value`, in order.
+
+    Args:
+      value (str): Text to normalize.
+      mapping: Iterable of rules. Item 0 of a rule is the pattern, item 1 the replacement.
+        A `str` pattern (or an instance of a `str` subclass) is replaced literally with
+        `str.replace`; a compiled regex is applied with its `sub` method.
+
+    Returns:
+      (str): The text after all usable rules have been applied.
+    """
+    for norm in mapping:
+        pattern = norm[0]
+        if isinstance(pattern, str):
+            value = value.replace(pattern, norm[1])
+        elif isinstance(pattern, Pattern):
+            value = pattern.sub(norm[1], value)
+        else:
+            # Best effort: report the malformed rule and carry on with the remaining ones.
+            print('UNEXPECTED', type(pattern), pattern)
+
+    return value
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -7,5 +7,10 @@
     Read more about conftest.py under:
     https://pytest.org/latest/plugins.html
 """
+import os
+import site
 
-# import pytest
+# Make the `corporacreator` package importable from the tests without installing it:
+# `src` is looked up next to the directory containing this conftest.py and, when it
+# exists, registered as a site directory so it ends up on the import path.
+src_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'src')
+if os.path.isdir(src_path):
+    site.addsitedir(src_path)
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
@@ -0,0 +1,21 @@
+import pytest
+
+from corporacreator import preprocessors
+
+
+@pytest.mark.parametrize('locale, client_id, sentence, expected', [
+    ('fr', '*', 'Faisons donc attention à utiliser les bons mots.', 'faisons donc attention a utiliser les bons mots'),
+    ('fr', '*', "bah 98%", "bah quatre vingt dix huit pourcent"),
+    ('fr', '*', "prix au m2", "prix au metre carre"),
+    ('fr', '*', "prix au m²", "prix au metre carre"),
+    ('fr', '*', "10 m²", "dix metre carre"),
+    ('fr', '*', "2éme page", "deuxieme page"),
+    ('fr', '*', "donc, ce sera 299 € + 99 €", "donc ce sera deux cent quatre vingt dix neuf euros plus quatre vingt dix neuf euros"),
+    ('fr', '*', "ok pour 18h", "ok pour dix huit heure"),
+    ('fr', '*', '2 0 200', "deux zero deux cents"),
+    ('fr', '*', 'rue Coq-Héron au nº13', "rue coq heron au numero treize"),
+    ('fr', '*', "En comparaison, la Lune orbite en moyenne à 390 000 km de la Terre", "en comparaison la lune orbite en moyenne a trois cent quatre vingt dix mille kilometres de la terre"),
+])
+def test_preprocessor(locale, client_id, sentence, expected):
+    """Check that the preprocessor of `locale` turns `sentence` into `expected`.
+
+    The preprocessor is looked up on `preprocessors` under the locale name with any
+    '-' removed, and it receives the sentence after `preprocessors.common` has run.
+    """
+    function_name = locale.replace('-', '')
+    clean = getattr(preprocessors, function_name)
+    prepared = preprocessors.common(sentence)
+    assert expected == clean(client_id, prepared)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,3 +47,4 @@ MANIFEST

		# Per-project virtualenvs
		.virtualenv/
		.python-version