-
Notifications
You must be signed in to change notification settings - Fork 84
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ 💫 Add support for Chinese language
Fixes #2
- Loading branch information
1 parent
e39f3aa
commit 092764f
Showing
6 changed files
with
74 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# -*- coding: utf-8 -*- | ||
import re | ||
from pysbd.abbreviation_replacer import AbbreviationReplacer | ||
from pysbd.between_punctuation import BetweenPunctuation | ||
from pysbd.lang.common import Common, Standard | ||
from pysbd.punctuation_replacer import replace_punctuation | ||
|
||
class Chinese(Common, Standard):
    """Language definition for Chinese, built on ``Common`` and ``Standard``.

    Supplies Chinese-specific ``AbbreviationReplacer`` and
    ``BetweenPunctuation`` variants as nested classes.
    """

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer whose sentence-starter list is empty."""

        SENTENCE_STARTERS = []

    class BetweenPunctuation(BetweenPunctuation):
        """Handle punctuation enclosed in 《…》 and 「…」 pairs.

        NOTE(review): ``sub_punctuation_between_quotes_and_parens`` does not
        call the base-class implementation, so only the two bracket styles
        below are processed here -- confirm this is intentional.
        """

        def __init__(self, text):
            """Pass ``text`` straight through to the base-class initializer."""
            super().__init__(text)

        def replace(self):
            """Apply the bracket substitutions and return the resulting text."""
            self.sub_punctuation_between_quotes_and_parens()
            return self.text

        def sub_punctuation_between_double_angled_quotation_marks(self):
            """Run ``replace_punctuation`` over every 《…》 span in the text."""
            # ``(?=(?P<tmp>...))(?P=tmp)`` emulates an atomic group. The
            # repetition must live INSIDE the named group so that ``tmp``
            # holds the whole run; with the ``*`` outside, ``tmp`` keeps only
            # the last repetition and the backreference fails on spans that
            # need several repetitions (e.g. ones containing a backslash
            # escape) or none at all (an empty pair).
            BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = r"《(?=(?P<tmp>(?:[^》\\]+|\\{2}|\\.)*))(?P=tmp)》"
            self.text = re.sub(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_l_bracket(self):
            """Run ``replace_punctuation`` over every 「…」 span in the text."""
            # Same atomic-group emulation as for 《…》: the repetition is
            # captured as a whole so the backreference always re-matches it.
            BETWEEN_L_BRACKET_REGEX = r"「(?=(?P<tmp>(?:[^」\\]+|\\{2}|\\.)*))(?P=tmp)」"
            self.text = re.sub(BETWEEN_L_BRACKET_REGEX, replace_punctuation,
                               self.text)

        def sub_punctuation_between_quotes_and_parens(self):
            """Apply the 《…》 substitution, then the 「…」 substitution."""
            self.sub_punctuation_between_double_angled_quotation_marks()
            self.sub_punctuation_between_l_bracket()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# -*- coding: utf-8 -*- | ||
import pytest | ||
|
||
# Each entry pairs an input text with the list of sentences expected from
# segmenting it.
GOLDEN_ZH_RULES_TEST_CASES = [
    # Traditional Chinese: two sentences, each ending in "。".
    (
        "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。",
        [
            "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。",
            "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。",
        ],
    ),
    # Simplified Chinese: the "!" inside 《…》 must not end the sentence.
    (
        "我们明天一起去看《摔跤吧!爸爸》好吗?好!",
        [
            "我们明天一起去看《摔跤吧!爸爸》好吗?",
            "好!",
        ],
    ),
]
|
||
@pytest.mark.parametrize('text,expected_sents', GOLDEN_ZH_RULES_TEST_CASES)
def test_zsh_sbd(zh_default_fixture, text, expected_sents):
    """Segment each golden Chinese text and compare with the expected sentences.

    The cases originate from Pragmatic Segmenter's Chinese rules.
    """
    # ``zh_default_fixture`` is supplied by the test suite's fixtures;
    # presumably a segmenter configured for Chinese -- verify in conftest.
    assert zh_default_fixture.segment(text) == expected_sents