diff --git a/CHANGELOG.md b/CHANGELOG.md
index 896d1fc..7a36266 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+# v2.3
+* add a smoothing strategy when computing similarity
+
 # v1.6
 * use ```jieba``` instead of ```thulac``` as tokenizer.
 * refine console log for Jupyter notebook.
\ No newline at end of file
diff --git a/README.md b/README.md
index be18313..31378b9 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
 ```
 pip install -U synonyms
 ```
-Compatible with py2 and py3; the current stable version is v2.2. **Meanwhile, Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
+Compatible with py2 and py3; the current stable version is v2.3. **Meanwhile, Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
 
 ```
 npm install node-synonyms
@@ -103,10 +103,10 @@ data is built based on [wikidata-corpus](https://github.com/Samurais/wikidata-co
 
 ## Evaluation
 
 ### 同义词词林
-《同义词词林》 was compiled by Mei Jiaju et al. in 1983. The version in wide use today is the Extended Edition maintained by HIT-SCIR (the Research Center for Social Computing and Information Retrieval, Harbin Institute of Technology). It divides Chinese vocabulary into major and minor categories at a fine granularity and organizes the relationships between words. The Extended Edition contains 77,343 entries, of which 32,470 are shared as open data.
+《同义词词林》 was compiled by Mei Jiaju et al. in 1983. The version in wide use today is the Extended Edition maintained by HIT-SCIR (the Research Center for Social Computing and Information Retrieval, Harbin Institute of Technology). It divides Chinese vocabulary into major and minor categories at a fine granularity and organizes the relationships between words. The Extended Edition contains more than 70,000 entries, of which more than 30,000 are shared as open data.
 
 ### 知网, HowNet
-HowNet, also known as 知网, is not merely a semantic dictionary but a knowledge system, and the relationships between words are one of its basic use cases. HowNet contains 8,265 entries.
+HowNet, also known as 知网, is not merely a semantic dictionary but a knowledge system, and the relationships between words are one of its basic use cases. HowNet contains more than 8,000 entries.
 
 The widely adopted international standard for evaluating word-similarity algorithms is the set of human judgments over English word pairs published by Miller & Charles. The set consists of 30 English word pairs: ten highly related, ten moderately related, and ten weakly related. 38 human subjects rated the semantic relatedness of these 30 pairs, and the average of their ratings was taken as the human gold standard. Different synonym tools then score the same pairs, and their outputs are compared against the human ratings, for example with the Pearson correlation coefficient. In Chinese NLP, running the same comparison with a translated version of this word list is also common practice.
 
@@ -115,7 +115,7 @@ Synonyms的词表容量是125,792,下面选择一些在同义词词林、知
 
 ![](./assets/5.png)
 
-Note: the 同义词词林 and HowNet data and scores come from https://github.com/yaleimeng/Final_word_Similarity
+Note: the 同义词词林 and HowNet data and scores come from https://github.com/yaleimeng/Final_word_Similarity; Synonyms is continuously being optimized, so newer scores may differ from the figure above.
 
 ## Benchmark
 
diff --git a/Requirements.txt b/Requirements.txt
index 30a1967..dd85757 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1 +1 @@
-synonyms>=2.0
\ No newline at end of file
+synonyms>=2.3
\ No newline at end of file
diff --git a/demo.py b/demo.py
index c863aed..9e2d94c 100755
--- a/demo.py
+++ b/demo.py
@@ -36,9 +36,9 @@
 import numpy
 import unittest
 
-# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
-
+compare_ = lambda x, y, z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
+# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
 class Test(unittest.TestCase):
     '''
@@ -50,6 +50,39 @@ def setUp(self):
     def tearDown(self):
         pass
 
+    def test_pairs(self):
+        print("test_pairs")
+        print("*" * 30)
+        print(compare_("轿车", "汽车", True))
+        print("*" * 30)
+        print(compare_("宝石", "宝物", True))
+        print("*" * 30)
+        print(compare_("旅游", "游历", True))
+        print("*" * 30)
+        print(compare_("男孩子", "小伙子", True))
+        print("*" * 30)
+        print(compare_("海岸", "海滨", True))
+        print("*" * 30)
+        print(compare_("庇护所", "精神病院", True))
+        print("*" * 30)
+        print(compare_("魔术师", "巫师", True))
+        print("*" * 30)
+        print(compare_("中午", "正午", True))
+        print("*" * 30)
+        print(compare_("火炉", "炉灶", True))
+        print("*" * 30)
+        print(compare_("食物", "水果", True))
+        print("*" * 30)
+        print(compare_("鸡", "公鸡", True))
+        print("*" * 30)
+        print(compare_("鸟", "鹤", True))
+        print("*" * 30)
+        print(compare_("工具", "器械", True))
+        print("*" * 30)
+        print(compare_("兄弟", "和尚", True))
+        print("*" * 30)
+        print(compare_("起重机", "器械", True))
+
     def test_similarity(self):
         '''
         Generate sentence similarity
         '''
@@ -73,6 +106,12 @@ def test_similarity(self):
         print("发生历史性变革 vs 发生历史性变革:", r)
         # assert r > 0, "the similarity should be bigger than zero"
 
+        sen1 = "骨折"
+        sen2 = "巴赫"
+        r = synonyms.compare(sen1, sen2, seg=True)
+        print("%s vs %s" % (sen1, sen2), r)
+
+
     def test_nearby(self):
         synonyms.display("人脸")  # synonyms.display calls synonyms.nearby
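The word pairs exercised by `test_pairs` above are the usual Chinese renderings of the Miller & Charles set that the README's evaluation section describes. Below is a minimal sketch of how such scores could be correlated against human judgments; it assumes scipy is available (it is not a dependency of this repo), and the `human_scores` values are made-up placeholders, not the published gold-standard ratings.

```
# Hedged sketch: correlate synonyms.compare scores with human judgments,
# as described in the README's evaluation section.
import synonyms
from scipy.stats import pearsonr  # scipy assumed installed for this sketch only

pairs = [("轿车", "汽车"), ("中午", "正午"), ("食物", "水果"), ("兄弟", "和尚")]
human_scores = [0.92, 0.86, 0.39, 0.12]  # hypothetical gold ratings scaled to [0, 1]

tool_scores = [synonyms.compare(w1, w2, seg=True) for w1, w2 in pairs]
corr, _ = pearsonr(human_scores, tool_scores)
print("Pearson correlation against the human ratings: %.3f" % corr)
```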
diff --git a/setup.py b/setup.py
index 684a832..611f7ce 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 setup(
     name='synonyms',
-    version='2.2',
+    version='2.3',
     description='Chinese Synonyms for Natural Language Processing and Understanding',
     long_description=LONGDOC,
     author='Hai Liang Wang, Hu Ying Xi',
diff --git a/synonyms/__init__.py b/synonyms/__init__.py
index 8d11ab3..b1b4008 100755
--- a/synonyms/__init__.py
+++ b/synonyms/__init__.py
@@ -47,6 +47,7 @@
 from synonyms.word2vec import KeyedVectors
 from synonyms.utils import any2utf8
 from synonyms.utils import any2unicode
+from synonyms.utils import sigmoid
 import jieba.posseg as _tokenizer
 import jieba
 
@@ -58,7 +59,6 @@
 _vectors = None
 _stopwords = set()
 
-
 '''
 nearby
 '''
@@ -195,10 +195,13 @@ def _levenshtein_distance(sentence1, sentence2):
     Based on:
         http://rosettacode.org/wiki/Levenshtein_distance#Python
     '''
-    first = sentence1.split()
-    second = sentence2.split()
-    if len(first) > len(second):
+    first = any2utf8(sentence1).decode('utf-8', 'ignore')
+    second = any2utf8(sentence2).decode('utf-8', 'ignore')
+    sentence1_len, sentence2_len = len(first), len(second)
+    maxlen = max(sentence1_len, sentence2_len)
+    if sentence1_len > sentence2_len:
         first, second = second, first
+
     distances = range(len(first) + 1)
     for index2, char2 in enumerate(second):
         new_distances = [index2 + 1]
@@ -211,8 +214,13 @@
                                              new_distances[-1])))
         distances = new_distances
     levenshtein = distances[-1]
-    return 2 ** (-1 * levenshtein)
+    dis = float(maxlen - levenshtein) / maxlen  # cast before dividing: py2 integer division would floor this to zero
+    # smoothing: sigmoid rescaling lifts mid-range scores while keeping the result in [0, 1)
+    s = (sigmoid(dis * 6) - 0.5) * 2
+    # print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
+    return s
 
+_smooth = lambda x, y, z: (x * y) + z  # _smooth(g, w, u): embedding term g, weighted by w, added to edit-distance score u
 
 def _similarity_distance(s1, s2):
     '''
@@ -223,9 +231,21 @@ def _similarity_distance(s1, s2):
     # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
     g = 1 / (np.linalg.norm(a - b) + 1)
     u = _levenshtein_distance(s1, s2)
-    r = g * 5 + u * 0.8
-    r = min(r, 1.0)
+    # print("g: %s, u: %s" % (g, u))
+    if u > 0.8:
+        r = _smooth(g, 0.05, u)
+    elif u > 0.7:
+        r = _smooth(g, 0.1, u)
+    elif u > 0.6:
+        r = _smooth(g, 0.2, u)
+    elif u > 0.5:
+        r = _smooth(g, 1, u)
+    elif u > 0.4:
+        r = _smooth(g, 4, u)
+    else:
+        r = _smooth(g, 10, u)
+    r = min(r, 1.0)
     return float("%.3f" % r)
diff --git a/synonyms/utils.py b/synonyms/utils.py
index b5ac5e7..d46b1df 100644
--- a/synonyms/utils.py
+++ b/synonyms/utils.py
@@ -239,6 +239,8 @@ def any2unicode(text, encoding='utf8', errors='strict'):
 to_unicode = any2unicode
 
+def sigmoid(x):
+    # numpy is assumed to already be imported as np in this module
+    return 1.0 / (1.0 + np.exp(-x))
 
 def call_on_class_only(*args, **kwargs):
     """Raise exception when load methods are called on instance"""
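Taken together, the `synonyms/__init__.py` changes replace the old `2 ** (-1 * levenshtein)` term with a normalized, sigmoid-smoothed edit-distance score, then blend in the word-vector term `g` with a weight that shrinks as the surface forms converge. The following is a self-contained sketch of that pipeline so the effect can be inspected without loading the word vectors; the `g` value passed at the end is an arbitrary stub, since computing it for real requires the loaded model.

```
# Standalone sketch of the smoothing and blending introduced in this patch.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def levenshtein_similarity(first, second):
    # character-level edit distance, normalized to [0, 1] and sigmoid-smoothed,
    # mirroring the patched _levenshtein_distance
    if len(first) > len(second):
        first, second = second, first
    maxlen = max(len(first), len(second))
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(first):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1],
                                              distances[index1 + 1],
                                              new_distances[-1])))
        distances = new_distances
    dis = float(maxlen - distances[-1]) / maxlen
    # sigmoid(dis * 6) is near-linear around 0 and saturates toward 1,
    # so this rescaling lifts mid-range scores while staying inside [0, 1)
    return (sigmoid(dis * 6) - 0.5) * 2

def blended_score(g, u):
    # the patch's threshold table: the higher the edit-distance score u,
    # the less weight the embedding term g receives
    for threshold, weight in ((0.8, 0.05), (0.7, 0.1), (0.6, 0.2),
                              (0.5, 1), (0.4, 4)):
        if u > threshold:
            return min(g * weight + u, 1.0)
    return min(g * 10 + u, 1.0)

u = levenshtein_similarity(u"轿车", u"汽车")  # edit distance 1 over length 2 -> dis = 0.5 -> u ≈ 0.905
print("u = %.3f, blended = %.3f" % (u, blended_score(0.05, u)))  # g = 0.05 is an arbitrary stub
```

Compared with the old linear `g * 5 + u * 0.8`, which the embedding term could dominate all the way to the 1.0 cap, the threshold table appears intended to let surface similarity take over once it is reliable; actual scores will of course depend on the real vectors supplying `g`.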