Skip to content

Commit

Permalink
fix wrong recognition when encountering katakana-kanji-mixed tokens #9
Browse files Browse the repository at this point in the history
  • Loading branch information
hexenq committed May 25, 2017
1 parent e26043b commit 4ae2c00
Show file tree
Hide file tree
Showing 8 changed files with 168 additions and 43 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
<a name="0.1.4"></a>
## [0.1.4](https://github.com/hexenq/kuroshiro.js/compare/0.1.3...0.1.4) (2017-05-25)

### Bug Fixes

* fix wrong recognition when encountering katakana-kanji-mixed tokens ([#9](https://github.com/hexenq/kuroshiro.js/issues/9))

<a name="0.1.3"></a>
## [0.1.3](https://github.com/hexenq/kuroshiro.js/compare/0.1.2...0.1.3) (2017-01-10)

Expand Down
2 changes: 1 addition & 1 deletion bower.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "kuroshiro",
"version": "0.1.3",
"version": "0.1.4",
"authors": [
"Hexen Qi <[email protected]>"
],
Expand Down
54 changes: 43 additions & 11 deletions dist/browser/kuroshiro.js
Original file line number Diff line number Diff line change
Expand Up @@ -9489,7 +9489,7 @@ d=(h[l++]|h[l++]<<8|h[l++]<<16|h[l++]<<24)>>>0;(a.length&4294967295)!==d&&n(Erro
},{}],29:[function(require,module,exports){
/*!
* kuroshiro.js
* Copyright(c) 2015 Hexen Qi <[email protected]>
* Copyright(c) 2015-2017 Hexen Qi <[email protected]>
* MIT Licensed
*/

Expand Down Expand Up @@ -9630,8 +9630,36 @@ var convert = function(str, options){
break;
case 'hiragana':
for(var hi=0;hi<tokens.length;hi++){
if(!hasKatakana(tokens[hi].surface_form) && hasKanji(tokens[hi].surface_form)){
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading);
if(hasKanji(tokens[hi].surface_form)){
if(!hasKatakana(tokens[hi].surface_form)){
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading);
}else{
// handle katakana-kanji-mixed tokens
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading);
var tmp = '';
var hpattern = '';
for(var hc=0;hc<tokens[hi].surface_form.length;hc++){
if(isKanji(tokens[hi].surface_form[hc])){
hpattern += '(.*)';
}else{
hpattern += wanakana.isKatakana(tokens[hi].surface_form[hc]) ? wanakana.toHiragana(tokens[hi].surface_form[hc]):tokens[hi].surface_form[hc];
}
}
var hreg = new RegExp(hpattern);
var hmatches = hreg.exec(tokens[hi].reading);
if(hmatches){
var pickKJ = 0;
for(var hc1=0;hc1<tokens[hi].surface_form.length;hc1++){
if(isKanji(tokens[hi].surface_form[hc1])){
tmp += hmatches[pickKJ+1];
pickKJ++;
}else{
tmp += tokens[hi].surface_form[hc1];
}
}
tokens[hi].reading = tmp;
}
}
}else{
tokens[hi].reading = tokens[hi].surface_form;
}
Expand All @@ -9658,19 +9686,23 @@ var convert = function(str, options){
if(isKanji(tokens[i].surface_form[c])){
pattern += '(.*)';
}else{
pattern += tokens[i].surface_form[c];
pattern += wanakana.isKatakana(tokens[i].surface_form[c]) ? wanakana.toHiragana(tokens[i].surface_form[c]):tokens[i].surface_form[c];
}
}
var reg = new RegExp(pattern);
var matches = reg.exec(tokens[i].reading);
var pickKanji = 0;
for(var c1=0;c1<tokens[i].surface_form.length;c1++){
if(isKanji(tokens[i].surface_form[c1])){
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]);
pickKanji++;
}else{
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]);
if(matches){
var pickKanji = 0;
for(var c1=0;c1<tokens[i].surface_form.length;c1++){
if(isKanji(tokens[i].surface_form[c1])){
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]);
pickKanji++;
}else{
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]);
}
}
}else{
notations.push([tokens[i].surface_form,1,tokens[i].reading]);
}
break;
case 2:
Expand Down
4 changes: 2 additions & 2 deletions dist/browser/kuroshiro.min.js

Large diffs are not rendered by default.

56 changes: 44 additions & 12 deletions dist/node/kuroshiro.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "kuroshiro",
"version": "0.1.3",
"version": "0.1.4",
"description": "kuroshiro.js is a Japanese language utility mainly for converting Kanji-mixed sentences to Hiragana, Katakana or Romaji, with furigana and okurigana modes supported.",
"main": "./dist/node/kuroshiro.js",
"scripts": {
Expand Down
54 changes: 43 additions & 11 deletions src/kuroshiro.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*!
* kuroshiro.js
* Copyright(c) 2015 Hexen Qi <[email protected]>
* Copyright(c) 2015-2017 Hexen Qi <[email protected]>
* MIT Licensed
*/

Expand Down Expand Up @@ -141,8 +141,36 @@ var convert = function(str, options){
break;
case 'hiragana':
for(var hi=0;hi<tokens.length;hi++){
if(!hasKatakana(tokens[hi].surface_form) && hasKanji(tokens[hi].surface_form)){
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading);
if(hasKanji(tokens[hi].surface_form)){
if(!hasKatakana(tokens[hi].surface_form)){
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading);
}else{
// handle katakana-kanji-mixed tokens
tokens[hi].reading = wanakana.toHiragana(tokens[hi].reading);
var tmp = '';
var hpattern = '';
for(var hc=0;hc<tokens[hi].surface_form.length;hc++){
if(isKanji(tokens[hi].surface_form[hc])){
hpattern += '(.*)';
}else{
hpattern += wanakana.isKatakana(tokens[hi].surface_form[hc]) ? wanakana.toHiragana(tokens[hi].surface_form[hc]):tokens[hi].surface_form[hc];
}
}
var hreg = new RegExp(hpattern);
var hmatches = hreg.exec(tokens[hi].reading);
if(hmatches){
var pickKJ = 0;
for(var hc1=0;hc1<tokens[hi].surface_form.length;hc1++){
if(isKanji(tokens[hi].surface_form[hc1])){
tmp += hmatches[pickKJ+1];
pickKJ++;
}else{
tmp += tokens[hi].surface_form[hc1];
}
}
tokens[hi].reading = tmp;
}
}
}else{
tokens[hi].reading = tokens[hi].surface_form;
}
Expand All @@ -169,19 +197,23 @@ var convert = function(str, options){
if(isKanji(tokens[i].surface_form[c])){
pattern += '(.*)';
}else{
pattern += tokens[i].surface_form[c];
pattern += wanakana.isKatakana(tokens[i].surface_form[c]) ? wanakana.toHiragana(tokens[i].surface_form[c]):tokens[i].surface_form[c];
}
}
var reg = new RegExp(pattern);
var matches = reg.exec(tokens[i].reading);
var pickKanji = 0;
for(var c1=0;c1<tokens[i].surface_form.length;c1++){
if(isKanji(tokens[i].surface_form[c1])){
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]);
pickKanji++;
}else{
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]);
if(matches){
var pickKanji = 0;
for(var c1=0;c1<tokens[i].surface_form.length;c1++){
if(isKanji(tokens[i].surface_form[c1])){
notations.push([tokens[i].surface_form[c1],1,matches[pickKanji+1]]);
pickKanji++;
}else{
notations.push([tokens[i].surface_form[c1],2,wanakana.toHiragana(tokens[i].surface_form[c1])]);
}
}
}else{
notations.push([tokens[i].surface_form,1,tokens[i].reading]);
}
break;
case 2:
Expand Down
32 changes: 27 additions & 5 deletions test/kuroshiroTest.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
/*!
* Copyright(c) 2015 Hexen Qi <[email protected]>
* Copyright(c) 2015-2017 Hexen Qi <[email protected]>
* MIT Licensed
*/
var expect = require("chai").expect;
var kuroshiro = require("../src/kuroshiro.js");

describe("kuroshiro.js Test", function () {
const EXAMPLE_TEXT = "感じ取れたら手を繋ごう、重なるのは人生のライン and レミリア最高!";
const EXAMPLE_TEXT2 = "ブラウン管への愛が足りねぇな";
const EXAMPLE_TEXT3 = "関ヶ原の戦い";

before(function(done){
kuroshiro.init(done);
Expand All @@ -21,16 +23,26 @@ describe("kuroshiro.js Test", function () {
var result = kuroshiro.hasKanji(ori);
expect(result).to.be.true;
});
it("Kanji to Hiragana", function () {
it("Kanji to Hiragana(1)", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{to:'hiragana'});
expect(result).to.eql('かんじとれたらてをつなごう、かさなるのはじんせいのライン and レミリアさいこう!');
});
it("Kanji to Hiragana(2)", function () {
var ori = EXAMPLE_TEXT2;
var result = kuroshiro.convert(ori,{to:'hiragana'});
expect(result).to.eql('ブラウンかんへのあいがたりねぇな');
});
it("Kanji to Hiragana(3)", function () {
var ori = EXAMPLE_TEXT3;
var result = kuroshiro.convert(ori,{to:'hiragana'});
expect(result).to.eql('せきがはらのたたかい');
});
it("Kanji to Katakana", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{to:'katakana'});
expect(result).to.eql('カンジトレタラテヲツナゴウ、カサナルノハジンセイノライン and レミリアサイコウ!');
})
});
it("Kanji to Romaji", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{to:'romaji'});
Expand All @@ -45,17 +57,27 @@ describe("kuroshiro.js Test", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{mode:'spaced', to:'katakana'});
expect(result).to.eql('カンジトレ タラ テ ヲ ツナゴ ウ 、 カサナル ノ ハ ジンセイ ノ ライン and レミ リア サイコウ !');
})
});
it("Kanji to Romaji with spaces", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{mode:'spaced', to:'romaji'});
expect(result).to.eql('kanjitore tara te wo tsunago u 、 kasanaru no ha jinsei no rain and remi ria saikou !');
});
it("Kanji to Hiragana with okurigana", function () {
it("Kanji to Hiragana with okurigana(1)", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'hiragana'});
expect(result).to.eql('感(かん)じ取(と)れたら手(て)を繋(つな)ごう、重(かさ)なるのは人生(じんせい)のライン and レミリア最高(さいこう)!');
});
it("Kanji to Hiragana with okurigana(2)", function () {
var ori = EXAMPLE_TEXT2;
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'hiragana'});
expect(result).to.eql('ブラウン管(かん)への愛(あい)が足(た)りねぇな');
});
it("Kanji to Hiragana with okurigana(3)", function () {
var ori = EXAMPLE_TEXT3;
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'hiragana'});
expect(result).to.eql('関ヶ原(せきがはら)の戦(たたか)い');
});
it("Kanji to Katakana with okurigana", function () {
var ori = EXAMPLE_TEXT;
var result = kuroshiro.convert(ori,{mode:'okurigana', to:'katakana'});
Expand Down

0 comments on commit 4ae2c00

Please sign in to comment.