-
Notifications
You must be signed in to change notification settings - Fork 94
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix wrong recognition when encountering katakana-kanji-mixed tokens #9
- Loading branch information
Showing 8 changed files with 168 additions and 43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
{ | ||
"name": "kuroshiro", | ||
"version": "0.1.3", | ||
"version": "0.1.4", | ||
"authors": [ | ||
"Hexen Qi <[email protected]>" | ||
], | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9489,7 +9489,7 @@ d=(h[l++]|h[l++]<<8|h[l++]<<16|h[l++]<<24)>>>0;(a.length&4294967295)!==d&&n(Erro | |
},{}],29:[function(require,module,exports){ | ||
/*! | ||
* kuroshiro.js | ||
* Copyright(c) 2015 Hexen Qi <[email protected]> | ||
* Copyright(c) 2015-2017 Hexen Qi <[email protected]> | ||
* MIT Licensed | ||
*/ | ||
|
||
|
@@ -9630,8 +9630,36 @@ var convert = function(str, options){ | |
break; | ||
case 'hiragana': | ||
for(var hi=0;hi<tokens.length;hi++){ | ||
if(!hasKatakana(tokens[hi].surface_form) && hasKanji(tokens[hi].surface_form)){ | ||
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading); | ||
if(hasKanji(tokens[hi].surface_form)){ | ||
if(!hasKatakana(tokens[hi].surface_form)){ | ||
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading); | ||
}else{ | ||
// handle katakana-kanji-mixed tokens | ||
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading); | ||
var tmp = ''; | ||
var hpattern = ''; | ||
for(var hc=0;hc<tokens[hi].surface_form.length;hc++){ | ||
if(isKanji(tokens[hi].surface_form[hc])){ | ||
hpattern += '(.*)'; | ||
}else{ | ||
hpattern += wanakana.isKatakana(tokens[hi].surface_form[hc]) ? wanakana.toHiragana(tokens[hi].surface_form[hc]):tokens[hi].surface_form[hc]; | ||
} | ||
} | ||
var hreg = new RegExp(hpattern); | ||
var hmatches = hreg.exec(tokens[hi].reading); | ||
if(hmatches){ | ||
var pickKJ = 0; | ||
for(var hc1=0;hc1<tokens[hi].surface_form.length;hc1++){ | ||
if(isKanji(tokens[hi].surface_form[hc1])){ | ||
tmp += hmatches[pickKJ+1]; | ||
pickKJ++; | ||
}else{ | ||
tmp += tokens[hi].surface_form[hc1]; | ||
} | ||
} | ||
tokens[hi].reading = tmp; | ||
} | ||
} | ||
}else{ | ||
tokens[hi].reading = tokens[hi].surface_form; | ||
} | ||
|
@@ -9658,19 +9686,23 @@ var convert = function(str, options){ | |
if(isKanji(tokens[i].surface_form[c])){ | ||
pattern += '(.*)'; | ||
}else{ | ||
pattern += tokens[i].surface_form[c]; | ||
pattern += wanakana.isKatakana(tokens[i].surface_form[c]) ? wanakana.toHiragana(tokens[i].surface_form[c]):tokens[i].surface_form[c]; | ||
} | ||
} | ||
var reg = new RegExp(pattern); | ||
var matches = reg.exec(tokens[i].reading); | ||
var pickKanji = 0; | ||
for(var c1=0;c1<tokens[i].surface_form.length;c1++){ | ||
if(isKanji(tokens[i].surface_form[c1])){ | ||
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]); | ||
pickKanji++; | ||
}else{ | ||
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]); | ||
if(matches){ | ||
var pickKanji = 0; | ||
for(var c1=0;c1<tokens[i].surface_form.length;c1++){ | ||
if(isKanji(tokens[i].surface_form[c1])){ | ||
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]); | ||
pickKanji++; | ||
}else{ | ||
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]); | ||
} | ||
} | ||
}else{ | ||
notations.push([tokens[i].surface_form,1,tokens[i].reading]); | ||
} | ||
break; | ||
case 2: | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
/*! | ||
* kuroshiro.js | ||
* Copyright(c) 2015 Hexen Qi <[email protected]> | ||
* Copyright(c) 2015-2017 Hexen Qi <[email protected]> | ||
* MIT Licensed | ||
*/ | ||
|
||
|
@@ -141,8 +141,36 @@ var convert = function(str, options){ | |
break; | ||
case 'hiragana': | ||
for(var hi=0;hi<tokens.length;hi++){ | ||
if(!hasKatakana(tokens[hi].surface_form) && hasKanji(tokens[hi].surface_form)){ | ||
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading); | ||
if(hasKanji(tokens[hi].surface_form)){ | ||
if(!hasKatakana(tokens[hi].surface_form)){ | ||
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading); | ||
}else{ | ||
// handle katakana-kanji-mixed tokens | ||
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading); | ||
var tmp = ''; | ||
var hpattern = ''; | ||
for(var hc=0;hc<tokens[hi].surface_form.length;hc++){ | ||
if(isKanji(tokens[hi].surface_form[hc])){ | ||
hpattern += '(.*)'; | ||
}else{ | ||
hpattern += wanakana.isKatakana(tokens[hi].surface_form[hc]) ? wanakana.toHiragana(tokens[hi].surface_form[hc]):tokens[hi].surface_form[hc]; | ||
} | ||
} | ||
var hreg = new RegExp(hpattern); | ||
var hmatches = hreg.exec(tokens[hi].reading); | ||
if(hmatches){ | ||
var pickKJ = 0; | ||
for(var hc1=0;hc1<tokens[hi].surface_form.length;hc1++){ | ||
if(isKanji(tokens[hi].surface_form[hc1])){ | ||
tmp += hmatches[pickKJ+1]; | ||
pickKJ++; | ||
}else{ | ||
tmp += tokens[hi].surface_form[hc1]; | ||
} | ||
} | ||
tokens[hi].reading = tmp; | ||
} | ||
} | ||
}else{ | ||
tokens[hi].reading = tokens[hi].surface_form; | ||
} | ||
|
@@ -169,19 +197,23 @@ var convert = function(str, options){ | |
if(isKanji(tokens[i].surface_form[c])){ | ||
pattern += '(.*)'; | ||
}else{ | ||
pattern += tokens[i].surface_form[c]; | ||
pattern += wanakana.isKatakana(tokens[i].surface_form[c]) ? wanakana.toHiragana(tokens[i].surface_form[c]):tokens[i].surface_form[c]; | ||
} | ||
} | ||
var reg = new RegExp(pattern); | ||
var matches = reg.exec(tokens[i].reading); | ||
var pickKanji = 0; | ||
for(var c1=0;c1<tokens[i].surface_form.length;c1++){ | ||
if(isKanji(tokens[i].surface_form[c1])){ | ||
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]); | ||
pickKanji++; | ||
}else{ | ||
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]); | ||
if(matches){ | ||
var pickKanji = 0; | ||
for(var c1=0;c1<tokens[i].surface_form.length;c1++){ | ||
if(isKanji(tokens[i].surface_form[c1])){ | ||
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]); | ||
pickKanji++; | ||
}else{ | ||
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]); | ||
} | ||
} | ||
}else{ | ||
notations.push([tokens[i].surface_form,1,tokens[i].reading]); | ||
} | ||
break; | ||
case 2: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,14 @@ | ||
/*! | ||
* Copyright(c) 2015 Hexen Qi <[email protected]> | ||
* Copyright(c) 2015-2017 Hexen Qi <[email protected]> | ||
* MIT Licensed | ||
*/ | ||
var expect = require("chai").expect; | ||
var kuroshiro = require("../src/kuroshiro.js"); | ||
|
||
describe("kuroshiro.js Test", function () { | ||
const EXAMPLE_TEXT = "感じ取れたら手を繋ごう、重なるのは人生のライン and レミリア最高!"; | ||
const EXAMPLE_TEXT2 = "ブラウン管への愛が足りねぇな"; | ||
const EXAMPLE_TEXT3 = "関ヶ原の戦い"; | ||
|
||
before(function(done){ | ||
kuroshiro.init(done); | ||
|
@@ -21,16 +23,26 @@ describe("kuroshiro.js Test", function () { | |
var result = kuroshiro.hasKanji(ori); | ||
expect(result).to.be.true; | ||
}); | ||
it("Kanji to Hiragana", function () { | ||
it("Kanji to Hiragana(1)", function () { | ||
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{to:'hiragana'}); | ||
expect(result).to.eql('かんじとれたらてをつなごう、かさなるのはじんせいのライン and レミリアさいこう!'); | ||
}); | ||
// Katakana-kanji mixed input (EXAMPLE_TEXT2, "ブラウン管への愛が足りねぇな") in hiragana mode: | ||
// the expected string keeps the katakana "ブラウン" as written and replaces only the kanji with hiragana. | ||
it("Kanji to Hiragana(2)", function () { | ||
var ori = EXAMPLE_TEXT2; | ||
var result = kuroshiro.convert(ori,{to:'hiragana'}); | ||
expect(result).to.eql('ブラウンかんへのあいがたりねぇな'); | ||
}); | ||
// EXAMPLE_TEXT3 ("関ヶ原の戦い") in hiragana mode: the expected string contains no kanji, | ||
// and the small "ヶ" between the two kanji is rendered as "が". | ||
it("Kanji to Hiragana(3)", function () { | ||
var ori = EXAMPLE_TEXT3; | ||
var result = kuroshiro.convert(ori,{to:'hiragana'}); | ||
expect(result).to.eql('せきがはらのたたかい'); | ||
}); | ||
it("Kanji to Katakana", function () { | ||
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{to:'katakana'}); | ||
expect(result).to.eql('カンジトレタラテヲツナゴウ、カサナルノハジンセイノライン and レミリアサイコウ!'); | ||
}) | ||
}); | ||
it("Kanji to Romaji", function () { | ||
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{to:'romaji'}); | ||
|
@@ -45,17 +57,27 @@ describe("kuroshiro.js Test", function () { | |
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{mode:'spaced', to:'katakana'}); | ||
expect(result).to.eql('カンジトレ タラ テ ヲ ツナゴ ウ 、 カサナル ノ ハ ジンセイ ノ ライン and レミ リア サイコウ !'); | ||
}) | ||
}); | ||
// EXAMPLE_TEXT with mode:'spaced' and to:'romaji': the expected string is space-delimited romaji | ||
// in which "、", "!" and the Latin word "and" appear unchanged. | ||
// NOTE(review): the space positions presumably follow the tokenizer's token boundaries — confirm. | ||
it("Kanji to Romaji with spaces", function () { | ||
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{mode:'spaced', to:'romaji'}); | ||
expect(result).to.eql('kanjitore tara te wo tsunago u 、 kasanaru no ha jinsei no rain and remi ria saikou !'); | ||
}); | ||
it("Kanji to Hiragana with okurigana", function () { | ||
it("Kanji to Hiragana with okurigana(1)", function () { | ||
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'hiragana'}); | ||
expect(result).to.eql('感(かん)じ取(と)れたら手(て)を繋(つな)ごう、重(かさ)なるのは人生(じんせい)のライン and レミリア最高(さいこう)!'); | ||
}); | ||
// Katakana-kanji mixed input (EXAMPLE_TEXT2) in okurigana mode: the expected string leaves | ||
// "ブラウン" unannotated and follows each kanji with its hiragana reading in parentheses. | ||
it("Kanji to Hiragana with okurigana(2)", function () { | ||
var ori = EXAMPLE_TEXT2; | ||
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'hiragana'}); | ||
expect(result).to.eql('ブラウン管(かん)への愛(あい)が足(た)りねぇな'); | ||
}); | ||
// EXAMPLE_TEXT3 ("関ヶ原の戦い") in okurigana mode: the expected string annotates "関ヶ原" as a | ||
// single unit with one reading, rather than annotating each kanji separately. | ||
// NOTE(review): presumably this exercises the whole-token fallback used when per-kanji matching fails — confirm. | ||
it("Kanji to Hiragana with okurigana(3)", function () { | ||
var ori = EXAMPLE_TEXT3; | ||
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'hiragana'}); | ||
expect(result).to.eql('関ヶ原(せきがはら)の戦(たたか)い'); | ||
}); | ||
it("Kanji to Katakana with okurigana", function () { | ||
var ori = EXAMPLE_TEXT; | ||
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'katakana'}); | ||
|