Support spaCy v2.2 #432

Merged (21 commits) on Jun 7, 2020
12 changes: 0 additions & 12 deletions .github/workflows/pythonpackage.yml
@@ -27,12 +27,6 @@ jobs:
       - name: Install dependencies for Ubuntu
         run: |
           sudo apt-get install neofetch
-          sudo apt-get install -q -y libmecab-dev swig mecab
-          # In Ubuntu 18.04, unidic-mecab is too old, so we install it manually. Note
-          # that this automatically sets the dictionary to unidic.
-          # Binary dictionary not included in 2.2.0-1 (https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=788822)
-          wget http://launchpadlibrarian.net/326401889/unidic-mecab_2.1.2~dfsg-7_all.deb
-          sudo dpkg -i unidic-mecab_2.1.2~dfsg-7_all.deb
           # Install ImageMagick library for Wand
           sudo apt-get install libmagickwand-dev ghostscript
           # Remove the policy file to allow the visualizer test to open the PDF.
@@ -49,7 +43,6 @@ jobs:
           psql --version
           python --version
           pip --version
-          mecab -D || true
       - name: Install Fonduer
         run: |
           make dev_extra
@@ -102,10 +95,6 @@ jobs:
       - name: Install dependencies for macOS
         run: |
           brew update
-          brew install swig mecab mecab-unidic
-          # Use unidic-mecab dictionary
-          sed -i -e "s/ipadic/unidic/" /usr/local/etc/mecabrc
-
           # Install ImageMagick for Wand.
           brew install freetype imagemagick ghostscript

@@ -141,7 +130,6 @@ jobs:
           psql --version
           python --version
           pip --version
-          mecab -D || true
           neofetch
       - name: Install Fonduer
         run: |
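Note on the removed CI steps: fugashi's unidic-lite extra bundles the MeCab dictionary inside the Python wheel, so the apt and brew packages (and the mecabrc edit) are no longer needed. A minimal sanity check, assuming fugashi[unidic-lite] is installed as pinned in setup.py below (the sample sentence is only illustrative):

    import fugashi

    # With no system-wide MeCab dictionary configured, Tagger() falls
    # back to the unidic-lite dictionary shipped inside the wheel.
    tagger = fugashi.Tagger()
    print([word.surface for word in tagger("日本語の文を分かち書きする")])

If this runs without a dictionary error, the wheel-bundled setup works and nothing from apt-get or brew is involved.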
8 changes: 8 additions & 0 deletions CHANGELOG.rst
@@ -9,10 +9,18 @@ Added
   See `here <../user/packaging.html>`_ for how to use it.
   (`#259 <https://github.com/HazyResearch/fonduer/issues/259>`_)
   (`#407 <https://github.com/HazyResearch/fonduer/pull/407>`_)
+* `@HiromuHota`_: Support spaCy v2.2.
+  (`#384 <https://github.com/HazyResearch/fonduer/issues/384>`_)
+  (`#432 <https://github.com/HazyResearch/fonduer/pull/432>`_)
+
 Changed
 ^^^^^^^
 * `@HiromuHota`_: Enabled "Type hints (PEP 484) support for the Sphinx autodoc extension."
   (`#421 <https://github.com/HazyResearch/fonduer/pull/421>`_)
+* `@HiromuHota`_: Switched the Cython wrapper for Mecab from mecab-python3 to fugashi.
+  Since the Japanese tokenizer remains the same, there should be no impact on users.
+  (`#384 <https://github.com/HazyResearch/fonduer/issues/384>`_)
+  (`#432 <https://github.com/HazyResearch/fonduer/pull/432>`_)

 Deprecated
 ^^^^^^^^^^
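To make the changelog's claim concrete (same tokenizer, different wrapper), here is a rough before/after sketch of the two APIs; it is illustrative only, since Fonduer invokes the wrapper indirectly through spaCy's Japanese language class:

    # Before: mecab-python3 mirrors MeCab's C API and returns the whole
    # analysis as a single tab/newline-delimited string.
    import MeCab
    tagger = MeCab.Tagger()
    print(tagger.parse("日本語の文"))

    # After: fugashi drives the same MeCab tokenizer but yields one
    # object per token, with structured feature access.
    import fugashi
    tagger = fugashi.Tagger()
    for word in tagger("日本語の文"):
        print(word.surface, word.pos)

Because both call into the same MeCab engine with a unidic dictionary, token boundaries, and therefore Fonduer's mentions, are unchanged.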
17 changes: 2 additions & 15 deletions docs/user/faqs.rst
@@ -70,21 +70,8 @@
 its NLP pipeline (`see spacy language support`_). We also started adding
 languages with spaCy alpha support for tokenization (`see spacy alpha
 languages`_). Currently, only Chinese and Japanese are supported.

-If you would like to use Fonduer for Japanese documents, you will first have
-to install some additional packages (`see mecab on PyPI`_).
-
-For Linux::
-
-    $ sudo apt-get install swig libmecab-dev
-    $ sudo apt-get install mecab unidic-mecab
-
-For OS X::
-
-    $ brew install swig mecab
-    $ brew install mecab-unidic
-
-Afterwards, you can use ``pip install fonduer[spacy_ja]`` to install Fonduer
-with Japanese language support.
+If you would like to use Fonduer for Japanese documents, you can use
+``pip install fonduer[spacy_ja]`` to install Fonduer with Japanese language support.

 If you would like to use Fonduer for Chinese documents, you can use
 ``pip install fonduer[spacy_zh]`` to install Fonduer with Chinese language support.
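For readers landing on this FAQ change: after pip install fonduer[spacy_ja], Japanese parsing needs no extra setup. A minimal sketch with a placeholder connection string and corpus path, assuming the Parser keyword arguments of this release (language selects the spaCy model, lingual enables the NLP pipeline):

    from fonduer import Meta
    from fonduer.parser import Parser
    from fonduer.parser.preprocessors import HTMLDocPreprocessor

    # Placeholders: point these at your own database and corpus.
    session = Meta.init("postgresql://localhost:5432/fonduer_ja").Session()
    docs = HTMLDocPreprocessor("path/to/japanese_html/", max_docs=10)

    # With fonduer[spacy_ja] installed, tokenization for language="ja"
    # goes through fugashi under the hood.
    parser = Parser(session, structural=True, lingual=True, language="ja")
    parser.apply(docs)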
4 changes: 2 additions & 2 deletions setup.py
@@ -23,7 +23,7 @@
         "numpy>=1.11, <2.0",
         "pyyaml>=5.1, <6.0",
         "scipy>=1.1.0, <2.0.0",
-        "spacy>=2.1.3, <2.2.0",
+        "spacy>=2.1.3, <2.3.0",
         "sqlalchemy[postgresql]>=1.3.7, <2.0.0",
         "torch>=1.3.1,<2.0.0",
         "tqdm>=4.36.0, <5.0.0",
@@ -32,7 +32,7 @@
         "ipython",
     ],
     extras_require={
-        "spacy_ja": ["mecab-python3==0.7"],
+        "spacy_ja": ["fugashi[unidic-lite]>=0.2.3"],
         "spacy_zh": ["jieba>=0.39, <0.40"],
     },
     keywords=["fonduer", "knowledge base construction", "richly formatted data"],
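Note the nested extra in the new requirement: installing fonduer[spacy_ja] now pulls in fugashi together with fugashi's own unidic-lite extra, so the dictionary arrives as wheel data rather than as a system package. A quick way to see where that dictionary lands (unidic_lite.DICDIR is the path fugashi falls back to; assumed stable across current releases):

    import unidic_lite

    # fugashi resolves this bundled dictionary automatically; no
    # `apt-get install unidic-mecab` or mecabrc editing required.
    print("dictionary directory:", unidic_lite.DICDIR)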
10 changes: 5 additions & 5 deletions tests/candidates/test_candidates.py
@@ -300,7 +300,7 @@ def do_nothing_matcher(fig):
     )

     doc = candidate_extractor_udf.apply(doc, split=0)
-    assert len(doc.part_temps) == 1432
+    assert len(doc.part_temps) == 1431
     assert len(doc.part_volts) == 2310

     # Clear
@@ -313,7 +313,7 @@ def do_nothing_matcher(fig):

     doc = candidate_extractor_udf.apply(doc, split=0)

-    assert len(doc.part_temps) == 1432
+    assert len(doc.part_temps) == 1431
     assert len(doc.part_volts) == 1993
     assert len(doc.parts) == 70
     assert len(doc.volts) == 33
@@ -336,9 +336,9 @@ def test_ngrams():
     )
     doc = mention_extractor_udf.apply(doc)

-    assert len(doc.persons) == 118
+    assert len(doc.persons) == 123
     mentions = doc.persons
-    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 49
+    assert len([x for x in mentions if x.context.get_num_words() == 1]) == 41
     assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0

     # Test for unigram exclusion
@@ -351,7 +351,7 @@
         [Person], [person_ngrams], [person_matcher]
     )
     doc = mention_extractor_udf.apply(doc)
-    assert len(doc.persons) == 69
+    assert len(doc.persons) == 82
     mentions = doc.persons
     assert len([x for x in mentions if x.context.get_num_words() == 1]) == 0
     assert len([x for x in mentions if x.context.get_num_words() > 3]) == 0
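The expected counts in these assertions are tokenizer-dependent: spaCy v2.2 segments some text differently from v2.1, so the number of enumerated mentions shifts even though the extraction logic is untouched. A toy illustration of why counts move with tokenization (ngram_mentions is a hypothetical stand-in for Fonduer's MentionNgrams):

    from typing import List

    def ngram_mentions(tokens: List[str], n_max: int = 3) -> List[str]:
        """Enumerate every 1..n_max-gram, in the spirit of MentionNgrams."""
        return [
            " ".join(tokens[i : i + n])
            for n in range(1, n_max + 1)
            for i in range(len(tokens) - n + 1)
        ]

    # The same text split into 3 tokens vs. 4 tokens yields different counts.
    print(len(ngram_mentions(["New", "York", "City"])))      # 6
    print(len(ngram_mentions(["New", "York", "Ci", "ty"])))  # 9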