Skip to content

Commit

Permalink
✨ 💫 Add support for Chinese language
Browse files Browse the repository at this point in the history
Fixes #2
  • Loading branch information
nipunsadvilkar committed Jun 6, 2020
1 parent e39f3aa commit 092764f
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 18 deletions.
1 change: 0 additions & 1 deletion pysbd/between_punctuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,3 @@ def sub_punctuation_between_em_dashes(self, txt):
def sub_punctuation_between_quotes_slanted(self, txt):
    """Return ``txt`` with every BETWEEN_QUOTE_SLANTED_REGEX_2 match passed
    through ``replace_punctuation``."""
    slanted_quote_pattern = self.BETWEEN_QUOTE_SLANTED_REGEX_2
    return re.sub(slanted_quote_pattern, replace_punctuation, txt)

34 changes: 34 additions & 0 deletions pysbd/lang/chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.punctuation_replacer import replace_punctuation

class Chinese(Common, Standard):
    """Chinese language rules layered on the Common and Standard rule sets."""

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer variant whose SENTENCE_STARTERS list is empty."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Passes text enclosed in 《…》 and 「…」 to ``replace_punctuation``.

        NOTE(review): relies on the parent ``__init__`` storing the input on
        ``self.text`` -- confirm against pysbd.between_punctuation.
        """

        def __init__(self, text):
            super().__init__(text)

        def replace(self):
            """Run the bracket substitutions and return the updated text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_quotes_and_parens(self):
            """Apply both bracket passes: 《…》 first, then 「…」."""
            self.sub_punctuation_between_double_angled_quotation_marks()
            self.sub_punctuation_between_l_bracket()

        def sub_punctuation_between_double_angled_quotation_marks(self):
            """Rewrite each 《…》 span in ``self.text`` via ``replace_punctuation``."""
            # The lookahead captures the span body as ``tmp`` and the
            # backreference then consumes exactly that captured text.
            angle_quote_pattern = r"《(?=(?P<tmp>[^》\\]+|\\{2}|\\.)*)(?P=tmp)》"
            updated = re.sub(angle_quote_pattern, replace_punctuation, self.text)
            self.text = updated

        def sub_punctuation_between_l_bracket(self):
            """Rewrite each 「…」 span in ``self.text`` via ``replace_punctuation``."""
            corner_bracket_pattern = r"「(?=(?P<tmp>[^」\\]+|\\{2}|\\.)*)(?P=tmp)」"
            updated = re.sub(corner_bracket_pattern, replace_punctuation, self.text)
            self.text = updated
3 changes: 2 additions & 1 deletion pysbd/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from pysbd.lang.english import English
from pysbd.lang.hindi import Hindi
from pysbd.lang.marathi import Marathi
from pysbd.lang.chinese import Chinese

LANGUAGE_CODES = {'en': English, 'hi': Hindi, 'mr': Marathi}
LANGUAGE_CODES = {'en': English, 'hi': Hindi, 'mr': Marathi, 'zh': Chinese}


class Language(object):
Expand Down
14 changes: 8 additions & 6 deletions pysbd/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def split_into_segments(self):
for pps in post_process_sent:
new_sents.append(pps)
new_sents = [Text(ns).apply(self.lang.SubSingleQuoteRule) for ns in new_sents]
# TODO: separate char span functionality from split_into_segments function
if self.char_span:
sent_start_token_idx = [m.start() for sent in new_sents for m in re.finditer(re.escape(sent), self.doc.text)]
for tok in self.doc:
Expand Down Expand Up @@ -169,7 +170,7 @@ def process_text(self, txt):
if txt[-1] not in self.lang.Punctuations:
txt += 'ȸ'
txt = ExclamationWords.apply_rules(txt)
txt = BetweenPunctuation(txt).replace()
txt = self.between_punctuation(txt)
# handle text having only doublepunctuations
if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation, txt):
txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All)
Expand All @@ -183,7 +184,6 @@ def replace_numbers(self):
self.text = Text(self.text).apply(*self.lang.Numbers.All)

def abbreviations_replacer(self):
# AbbreviationReplacer
if hasattr(self.lang, "AbbreviationReplacer"):
return self.lang.AbbreviationReplacer(self.text, self.lang)
else:
Expand All @@ -193,12 +193,14 @@ def replace_abbreviations(self):
self.text = self.abbreviations_replacer().replace()

def between_punctuation_processor(self, txt):
    """Build the between-punctuation handler for ``txt``.

    Uses the language's own ``BetweenPunctuation`` class when ``self.lang``
    defines one; otherwise falls back to the module-level
    ``BetweenPunctuation``.

    The leftover ``raise NotImplementedError`` stub was removed: it ran
    before the dispatch below and made it unreachable.
    """
    if hasattr(self.lang, "BetweenPunctuation"):
        return self.lang.BetweenPunctuation(txt)
    return BetweenPunctuation(txt)

def between_punctuation(self, txt):
    """Return ``txt`` after the between-punctuation handler's ``replace()`` pass.

    The handler comes from ``self.between_punctuation_processor(txt)``.
    The leftover ``raise NotImplementedError`` stub was removed so the
    method returns the processed text instead of always raising.
    """
    return self.between_punctuation_processor(txt).replace()

def sentence_boundary_punctuation(self, txt):
if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'):
Expand Down
25 changes: 15 additions & 10 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,30 @@

@pytest.fixture()
def pysbd_default_en_no_clean_no_span_fixture():
    """Provide an English segmenter with clean=False and char_span=False."""
    # Single construction; the duplicate assignment/return pair that
    # followed the first return was unreachable and has been dropped.
    return pysbd.Segmenter(language="en", clean=False, char_span=False)

@pytest.fixture()
def en_with_clean_no_span_fixture():
    """Provide an English segmenter with clean=True and char_span=False."""
    # Single construction; the duplicate assignment/return pair that
    # followed the first return was unreachable and has been dropped.
    return pysbd.Segmenter(language="en", clean=True, char_span=False)

@pytest.fixture()
def en_no_clean_with_span_fixture():
    """Provide an English segmenter with clean=False and char_span=True."""
    # Single construction; the duplicate assignment/return pair that
    # followed the first return was unreachable and has been dropped.
    return pysbd.Segmenter(language="en", clean=False, char_span=True)

@pytest.fixture()
def hi_default_fixture():
    """Provide a segmenter for language "hi" with clean=False and char_span=False."""
    # Single construction; the duplicate assignment/return pair that
    # followed the first return was unreachable and has been dropped.
    return pysbd.Segmenter(language="hi", clean=False, char_span=False)

@pytest.fixture()
def mr_default_fixture():
    """Provide a segmenter for language "mr" with clean=False and char_span=False."""
    # Single construction; the duplicate assignment/return pair that
    # followed the first return was unreachable and has been dropped.
    return pysbd.Segmenter(language="mr", clean=False, char_span=False)

@pytest.fixture()
def zh_default_fixture():
    """Provide a segmenter for language "zh" with clean=False and char_span=False."""
    return pysbd.Segmenter(language="zh", clean=False, char_span=False)
15 changes: 15 additions & 0 deletions tests/lang/test_chinese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
import pytest

# Golden cases for the zh segmenter: each entry is an
# (input text, expected list of sentences) pair.
GOLDEN_ZH_RULES_TEST_CASES = [
("安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。",
["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"]),
# Punctuation inside 《…》 must not end the sentence.
("我们明天一起去看《摔跤吧!爸爸》好吗?好!",
["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"])
]

@pytest.mark.parametrize('text,expected_sents', GOLDEN_ZH_RULES_TEST_CASES)
def test_zsh_sbd(zh_default_fixture, text, expected_sents):
    """Chinese SBD golden cases (from Pragmatic Segmenter): the segmenter
    output must equal the expected sentence list."""
    assert zh_default_fixture.segment(text) == expected_sents

0 comments on commit 092764f

Please sign in to comment.