From 45a921f1786059c014e4d15bf7538650ad7bb0a6 Mon Sep 17 00:00:00 2001 From: hexenq Date: Fri, 19 Oct 2018 11:31:08 +0800 Subject: [PATCH] fix: fix romaji conversion bugs Closes #46, #47 --- CHANGELOG.md | 16 ++++++++++++++ README.jp.md | 9 ++++++++ README.md | 14 ++++++++++-- README.zh-cn.md | 11 +++++++++- README.zh-tw.md | 11 +++++++++- package.json | 2 +- src/core.js | 31 ++++++++++++++------------- src/util.js | 54 +++++++++++++++++++++++++++++++++++++++++------ test/node.spec.js | 42 ++++++++++++++++++++++++++---------- 9 files changed, 152 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d6b0b6..813bc7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ + +## [1.1.2](https://github.com/hexenq/kuroshiro/compare/1.1.1...1.1.2) (2018-10-19) + +### Bug Fixes + +* fix conversion bug when handling chōon with passport-shiki romanization ([#47](https://github.com/hexenq/kuroshiro/issues/47)) +* fix kanji->romaji conversion bug when using nippon-shiki/hepburn-shiki romanization ([#46](https://github.com/hexenq/kuroshiro/issues/46)) + +### Test + +* Update test specification + +### Miscellaneous + +* Update docs, add notice for romaji conversion + ## [1.1.1](https://github.com/hexenq/kuroshiro/compare/1.1.0...1.1.1) (2018-08-28) diff --git a/README.jp.md b/README.jp.md index 6586a95..7d492c8 100644 --- a/README.jp.md +++ b/README.jp.md @@ -220,6 +220,15 @@ kuroshiroは三種類のローマ字表記法をサポートします。 各種ローマ字表の比較は[こちら](http://jgrammar.life.coocan.jp/ja/data/rohmaji2.htm)を参考にしてください。 +### ローマ字変換のお知らせ +フリガナは音声を正確にあらわしていないため、__フリガナ__ を __ローマ字__ に完全自動的に変換することは不可能です。([なぜフリガナではダメなのか?](https://green.adam.ne.jp/roomazi/onamae.html#naze)を参照) + +それゆえ、`nippon`、`hepburn`のローマ字表記法を使って、フリガナ(仮名)-> ローマ字 変換を行うとき、kuroshiroは長音の処理を実行しません。(`passport`表記法そのものが長音を無視します) + +*例えば`nippon`、`passport`、`hepburn`のローマ字表記法を使って、仮名"こうし"の フリガナ->ローマ字 変換を行うと、それぞれ"kousi"、 "koshi"、 "koushi"が得られます。* + +フリガナモードを使うかどうかにかかわらず、漢字->ローマ字の変換はこの仕組みの影響を受けません。 + ## 貢献したい方 
[CONTRIBUTING](CONTRIBUTING.md) を参考にしてみてください。 diff --git a/README.md b/README.md index 9499490..f77b03b 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ Convert given string to target syllabary with options available __Arguments__ * `str` - A String to be converted. -* `options` - *Optional* kuroshiro has several convert options as below. `romajiSystem` is only applied when the value of param `to` is `romaji` +* `options` - *Optional* kuroshiro has several convert options as below. | Options | Type | Default | Description | |---|---|---|---| @@ -207,7 +207,7 @@ Convert input kana string to hiragana. Convert input kana string to katakana. #### kanaToRomaji(str, system) -Convert input kana string to romaji. Param `system` accepts `"nippon"`, `"passport"`, `"hepburn"` (Default: "hepburn") +Convert input kana string to romaji. Param `system` accepts `"nippon"`, `"passport"`, `"hepburn"` (Default: "hepburn"). ## Romanization System kuroshiro supports three kinds of romanization systems. @@ -220,6 +220,16 @@ kuroshiro supports three kinds of romanization systems. There is a useful [webpage](http://jgrammar.life.coocan.jp/ja/data/rohmaji2.htm) for you to check the difference between these romanization systems. +### Notice for Romaji Conversion +It's impossible to fully automatically convert __furigana__ directly to __romaji__ because furigana lacks information on pronunciation (refer to [なぜ フリガナでは ダメなのか?](https://green.adam.ne.jp/roomazi/onamae.html#naze)). + +Therefore, kuroshiro will not handle chōon when converting furigana (kana) directly to romaji with the `nippon` or `hepburn` romanization system. (Chōon is always ignored by the `passport` romanization system.) + +*For example, you'll get "kousi", "koshi", "koushi" respectively when converting kana "こうし" to romaji +using the `nippon`, `passport`, `hepburn` romanization systems.* + +The kanji -> romaji conversion with/without furigana mode is __unaffected__ by this logic. 
+ ## Contributing Please check [CONTRIBUTING](CONTRIBUTING.md). diff --git a/README.zh-cn.md b/README.zh-cn.md index 4b1d1a3..b6fc84e 100644 --- a/README.zh-cn.md +++ b/README.zh-cn.md @@ -208,7 +208,7 @@ const result = Kuroshiro.Util.isHiragana("あ")); 转换输入假名字符串至片假名。 #### kanaToRomaji(str, system) -转换输入假名字符串至罗马字。参数`system`可选值为`"nippon"`, `"passport"`, `"hepburn"` (默认值: "hepburn") +转换输入假名字符串至罗马字。参数`system`可选值为`"nippon"`, `"passport"`, `"hepburn"` (默认值: "hepburn")。 ## 罗马字体系 kuroshiro支持三种罗马字体系。 @@ -221,6 +221,15 @@ kuroshiro支持三种罗马字体系。 想快速了解这些罗马字体系的不同,可参考这个实用的[网页](http://jgrammar.life.coocan.jp/ja/data/rohmaji2.htm)。 +### 罗马字转换须知 +完全自动化进行注音假名到罗马字的直接转换是不可能的,这是因为一般的注音假名都缺乏正确的发音信息,可以参考 [なぜ フリガナでは ダメなのか?](https://green.adam.ne.jp/roomazi/onamae.html#naze)。 + +因此kuroshiro在进行直接的注音假名->罗马字转换(使用`nippon`或`hepburn`罗马字体系)时,不会处理长音。(`passport`罗马字体系本身便忽略长音) + +*例如,当进行假名"こうし"到罗马字的转换时,对于`nippon`, `passport`, `hepburn`三种罗马字体系,你会分别得到"kousi", "koshi", "koushi"这几个结果* + +汉字->罗马字的转换无论使用注音假名模式与否都 __不受__ 此逻辑影响。 + ## 贡献 请查阅文档 [CONTRIBUTING](CONTRIBUTING.md). 
diff --git a/README.zh-tw.md b/README.zh-tw.md index 2f4720a..b04fe4c 100644 --- a/README.zh-tw.md +++ b/README.zh-tw.md @@ -208,7 +208,7 @@ const result = Kuroshiro.Util.isHiragana("あ")); 轉換輸入假名字元串至片假名。 #### kanaToRomaji(str, system) -轉換輸入假名字元串至羅馬字。參數`system`可選值為`"nippon"`, `"passport"`, `"hepburn"` (默認值: "hepburn") +轉換輸入假名字元串至羅馬字。參數`system`可選值為`"nippon"`, `"passport"`, `"hepburn"` (默認值: "hepburn")。 ## 羅馬字體系 kuroshiro支持三種羅馬字體系。 @@ -221,6 +221,15 @@ kuroshiro支持三種羅馬字體系。 想快速了解這些羅馬字體系的不同,可參考這個實用的[網頁](http://jgrammar.life.coocan.jp/ja/data/rohmaji2.htm)。 +### 羅馬字轉換須知 +完全自動化進行注音假名到羅馬字的直接轉換是不可能的,這是因為一般的注音假名都缺乏正確的發音信息,可以參考 [なぜ フリガナでは ダメなのか?](https://green.adam.ne.jp/roomazi/onamae.html#naze)。 + +因此kuroshiro在進行直接的注音假名->羅馬字轉換(使用`nippon`或`hepburn`羅馬字體系)時,不會處理長音。(`passport`羅馬字體系本身便忽略長音) + +*例如,當進行假名"こうし"到羅馬字的轉換時,對於`nippon`, `passport`, `hepburn`三種羅馬字體系,你會分別得到"kousi", "koshi", "koushi"這幾個結果* + +漢字->羅馬字的轉換無論使用注音假名模式與否都 __不受__ 此邏輯影響。 + ## 貢獻 請查閱文檔 [CONTRIBUTING](CONTRIBUTING.md). diff --git a/package.json b/package.json index f7b719d..d3c1b99 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "kuroshiro", - "version": "1.1.1", + "version": "1.1.2", "description": "kuroshiro is a Japanese language library for converting Japanese sentence to Hiragana, Katakana or Romaji with furigana and okurigana modes supported.", "main": "lib/index.js", "module": "src/index.js", diff --git a/src/core.js b/src/core.js index 1f6763c..e4720a8 100644 --- a/src/core.js +++ b/src/core.js @@ -1,6 +1,7 @@ import { ROMANIZATION_SYSTEM, getStrType, + patchTokens, isHiragana, isKatakana, isKana, @@ -90,7 +91,8 @@ class Kuroshiro { throw new Error("Invalid Romanization System."); } - const tokens = await this._analyzer.parse(str); + const rawTokens = await this._analyzer.parse(str); + const tokens = patchTokens(rawTokens); for (let cr = 0; cr < tokens.length; cr++) { if (hasJapanese(tokens[cr].surface_form)) { if (!tokens[cr].reading) { @@ -119,9 +121,9 @@ class Kuroshiro { return 
tokens.map(token => token.reading).join(" "); case "romaji": if (options.mode === "normal") { - return tokens.map(token => toRawRomaji(token.reading, options.romajiSystem)).join(""); + return tokens.map(token => toRawRomaji(token.pronunciation || token.reading, options.romajiSystem)).join(""); } - return tokens.map(token => toRawRomaji(token.reading, options.romajiSystem)).join(" "); + return tokens.map(token => toRawRomaji(token.pronunciation || token.reading, options.romajiSystem)).join(" "); case "hiragana": for (let hi = 0; hi < tokens.length; hi++) { if (hasKanji(tokens[hi].surface_form)) { @@ -171,14 +173,12 @@ class Kuroshiro { } } else if (options.mode === "okurigana" || options.mode === "furigana") { - const notations = []; // [basic,basic_type[1=kanji,2=kana,3=others],notation] + const notations = []; // [basic, basic_type[1=kanji,2=kana,3=others], notation, pronunciation] for (let i = 0; i < tokens.length; i++) { - tokens[i].reading = toRawHiragana(tokens[i].reading); - const strType = getStrType(tokens[i].surface_form); switch (strType) { case 0: - notations.push([tokens[i].surface_form, 1, tokens[i].reading]); + notations.push([tokens[i].surface_form, 1, toRawHiragana(tokens[i].reading), tokens[i].pronunciation || tokens[i].reading]); break; case 1: let pattern = ""; @@ -202,30 +202,31 @@ class Kuroshiro { } } const reg = new RegExp(`^${pattern}$`); - const matches = reg.exec(tokens[i].reading); + const matches = reg.exec(toRawHiragana(tokens[i].reading)); if (matches) { let pickKanji = 1; for (let c1 = 0; c1 < subs.length; c1++) { if (isKanji(subs[c1][0])) { - notations.push([subs[c1], 1, matches[pickKanji++]]); + notations.push([subs[c1], 1, matches[pickKanji], toRawKatakana(matches[pickKanji])]); + pickKanji += 1; } else { - notations.push([subs[c1], 2, toRawHiragana(subs[c1])]); + notations.push([subs[c1], 2, toRawHiragana(subs[c1]), toRawKatakana(subs[c1])]); } } } else { - notations.push([tokens[i].surface_form, 1, tokens[i].reading]); + 
notations.push([tokens[i].surface_form, 1, toRawHiragana(tokens[i].reading), tokens[i].pronunciation || tokens[i].reading]); } break; case 2: for (let c2 = 0; c2 < tokens[i].surface_form.length; c2++) { - notations.push([tokens[i].surface_form[c2], 2, tokens[i].reading[c2]]); + notations.push([tokens[i].surface_form[c2], 2, toRawHiragana(tokens[i].reading[c2]), (tokens[i].pronunciation && tokens[i].pronunciation[c2]) || tokens[i].reading[c2]]); } break; case 3: for (let c3 = 0; c3 < tokens[i].surface_form.length; c3++) { - notations.push([tokens[i].surface_form[c3], 3, tokens[i].surface_form[c3]]); + notations.push([tokens[i].surface_form[c3], 3, tokens[i].surface_form[c3], tokens[i].surface_form[c3]]); } break; default: @@ -263,14 +264,14 @@ class Kuroshiro { result += notations[n2][0]; } else { - result += notations[n2][0] + options.delimiter_start + toRawRomaji(notations[n2][2], options.romajiSystem) + options.delimiter_end; + result += notations[n2][0] + options.delimiter_start + toRawRomaji(notations[n2][3], options.romajiSystem) + options.delimiter_end; } } } else { // furigana result += ""; for (let n3 = 0; n3 < notations.length; n3++) { - result += `${notations[n3][0]}${options.delimiter_start}${toRawRomaji(notations[n3][2], options.romajiSystem)}${options.delimiter_end}`; + result += `${notations[n3][0]}${options.delimiter_start}${toRawRomaji(notations[n3][3], options.romajiSystem)}${options.delimiter_end}`; } result += ""; } diff --git a/src/util.js b/src/util.js index e9b6082..c795fbc 100644 --- a/src/util.js +++ b/src/util.js @@ -1396,6 +1396,7 @@ const toRawRomaji = function (str, system) { } } + // [ALL] kana to roman chars const max = str.length; while (pnt <= max) { if (r = romajiSystem[system][str.substring(pnt, pnt + 2)]) { @@ -1423,13 +1424,6 @@ const toRawRomaji = function (str, system) { result = result.replace(/np/gm, "mp"); } - // [PASSPORT] 長音省略 他の場合 - if (system === ROMANIZATION_SYSTEM.PASSPORT) { - result = result.replace(/uu/gm, "u"); - 
result = result.replace(/ou/gm, "o"); - result = result.replace(/oo(?!$)/gm, "o"); - } - // [NIPPON] 長音変換 if (system === ROMANIZATION_SYSTEM.NIPPON) { result = result.replace(/aー/gm, "â"); @@ -1474,6 +1468,51 @@ const getStrType = function (str) { return 3; }; +/** + * Patch tokens for conversion + * @param {Object} tokens Given tokens + * @return {Object} Patched tokens + */ +const patchTokens = function (tokens) { + // patch for 助動詞"う" after 動詞 + for (let i = 0; i < tokens.length; i++) { + if (tokens[i].pos && tokens[i].pos === "助動詞" && (tokens[i].surface_form === "う" || tokens[i].surface_form === "ウ")) { + if (i - 1 >= 0 && tokens[i - 1].pos && tokens[i - 1].pos === "動詞") { + tokens[i - 1].surface_form += "う"; + if (tokens[i - 1].pronunciation) { + tokens[i - 1].pronunciation += "ー"; + } + else { + tokens[i - 1].pronunciation = `${tokens[i - 1].reading}ー`; + } + tokens[i - 1].reading += "ウ"; + tokens.splice(i, 1); + i--; + } + } + } + + // patch for "っ" at the tail of 動詞、形容詞 + for (let j = 0; j < tokens.length; j++) { + if (tokens[j].pos && (tokens[j].pos === "動詞" || tokens[j].pos === "形容詞") && tokens[j].surface_form.length > 1 && (tokens[j].surface_form[tokens[j].surface_form.length - 1] === "っ" || tokens[j].surface_form[tokens[j].surface_form.length - 1] === "ッ")) { + if (j + 1 < tokens.length && tokens[j + 1].pos && (tokens[j + 1].pos === "動詞" || tokens[j + 1].pos === "助動詞")) { + tokens[j].surface_form += tokens[j + 1].surface_form; + if (tokens[j].pronunciation) { + tokens[j].pronunciation += tokens[j + 1].pronunciation; + } + else { + tokens[j].pronunciation = `${tokens[j].reading}${tokens[j + 1].reading}`; + } + tokens[j].reading += tokens[j + 1].reading; + tokens.splice(j + 1, 1); + j--; + } + } + } + + return tokens; +}; + /** * Convert kana to hiragana * @@ -1509,6 +1548,7 @@ export { // language ROMANIZATION_SYSTEM, getStrType, + patchTokens, isHiragana, isKatakana, isKana, diff --git a/test/node.spec.js b/test/node.spec.js index bc473f2..5d3c61b 
100644 --- a/test/node.spec.js +++ b/test/node.spec.js @@ -182,27 +182,47 @@ describe("Kuroshiro Node Funtional Test", () => { it("Kanji to Romaji", async () => { const ori = EXAMPLE_TEXT; const result = await kuroshiro.convert(ori, { to: "romaji" }); - expect(result).toEqual("kanjitoretarateotsunagou,kasanarunohajinseinorain and remiriasaikou!"); + expect(result).toEqual("kanjitoretarateotsunagō,kasanarunowajinseinorain and remiriasaikō!"); + }); + it("Kanji to Romaji with sokuon", async () => { + const ori = "勝手に買っちゃったんだ"; + const result = await kuroshiro.convert(ori, { mode: "spaced", to: "romaji" }); + expect(result).toEqual("katte ni katchatta n da"); + }); + it("Kanji to Romaji with spaces", async () => { + const ori = EXAMPLE_TEXT; + const result = await kuroshiro.convert(ori, { mode: "spaced", to: "romaji" }); + expect(result).toEqual("kanjitore tara te o tsunagō , kasanaru no wa jinsei no rain and remi ria saikō !"); }); it("Kanji to Romaji with passport-shiki romaji system", async () => { const ori = EXAMPLE_TEXT; const result = await kuroshiro.convert(ori, { to: "romaji", romajiSystem: "passport" }); - expect(result).toEqual("kanjitoretarateotsunagou,kasanarunohajinseinorain and remiriasaiko!"); + expect(result).toEqual("kanjitoretarateotsunago,kasanarunowajinseinorain and remiriasaiko!"); + }); + it("Kanji to Romaji misc with hepburn-shiki romaji system", async () => { + const ori = "東京、九州、丸の内、観桜、呼応、思う、長雨、記入、金融、学校、ビール、お母さん、委員"; + const result = await kuroshiro.convert(ori, { to: "romaji" }); + expect(result).toEqual("tōkyō,kyūshū,marunouchi,kan'ō,koō,omou,nagaame,kinyū,kin'yū,gakkō,bīru,okāsan,iin"); + }); + it("Kanji to Romaji misc with nippon-shiki romaji system", async () => { + const ori = "東京、九州、丸の内、観桜、呼応、思う、長雨、記入、金融、学校、ビール、お母さん、委員"; + const result = await kuroshiro.convert(ori, { to: "romaji", romajiSystem: "nippon" }); + expect(result).toEqual("tôkyô,kyûsyû,marunouti,kan'ô,koô,omou,nagaame,kinyû,kin'yû,gakkô,bîru,okâsan,iin"); + }); + it("Kanji 
to Romaji misc with passport-shiki romaji system", async () => { + const ori = "東京、九州、丸の内、観桜、呼応、思う、長雨、記入、金融、学校、ビール、お母さん、委員"; + const result = await kuroshiro.convert(ori, { to: "romaji", romajiSystem: "passport" }); + expect(result).toEqual("tokyo,kyushu,marunouchi,kano,koo,omou,nagaame,kinyu,kinyu,gakko,biru,okasan,iin"); }); it("Kanji to Hiragana with spaces", async () => { const ori = EXAMPLE_TEXT; const result = await kuroshiro.convert(ori, { mode: "spaced", to: "hiragana" }); - expect(result).toEqual("かんじとれ たら て を つなご う 、 かさなる の は じんせい の ライン and レミ リア さいこう !"); + expect(result).toEqual("かんじとれ たら て を つなごう 、 かさなる の は じんせい の ライン and レミ リア さいこう !"); }); it("Kanji to Katakana with spaces", async () => { const ori = EXAMPLE_TEXT; const result = await kuroshiro.convert(ori, { mode: "spaced", to: "katakana" }); - expect(result).toEqual("カンジトレ タラ テ ヲ ツナゴ ウ 、 カサナル ノ ハ ジンセイ ノ ライン and レミ リア サイコウ !"); - }); - it("Kanji to Romaji with spaces", async () => { - const ori = EXAMPLE_TEXT; - const result = await kuroshiro.convert(ori, { mode: "spaced", to: "romaji" }); - expect(result).toEqual("kanjitore tara te o tsunago u , kasanaru no ha jinsei no rain and remi ria saikou !"); + expect(result).toEqual("カンジトレ タラ テ ヲ ツナゴウ 、 カサナル ノ ハ ジンセイ ノ ライン and レミ リア サイコウ !"); }); it("Kanji to Hiragana with okurigana(1)", async () => { const ori = EXAMPLE_TEXT; @@ -242,7 +262,7 @@ describe("Kuroshiro Node Funtional Test", () => { it("Kanji to Romaji with okurigana", async () => { const ori = EXAMPLE_TEXT; const result = await kuroshiro.convert(ori, { mode: "okurigana", to: "romaji" }); - expect(result).toEqual("感(kan)じ取(to)れたら手(te)を繋(tsuna)ごう、重(kasa)なるのは人生(jinsei)のライン and レミリア最高(saikou)!"); + expect(result).toEqual("感(kan)じ取(to)れたら手(te)を繋(tsuna)ごう、重(kasa)なるのは人生(jinsei)のライン and レミリア最高(saikō)!"); }); it("Kanji to Hiragana with furigana", async () => { const ori = EXAMPLE_TEXT; @@ -257,6 +277,6 @@ describe("Kuroshiro Node Funtional Test", () => { it("Kanji to Romaji with furigana", async () => 
{ const ori = EXAMPLE_TEXT; const result = await kuroshiro.convert(ori, { mode: "furigana", to: "romaji" }); - expect(result).toEqual("(kan)(ji)(to)(re)(ta)(ra)(te)(o)(tsuna)(go)(u)(,)(kasa)(na)(ru)(no)(ha)人生(jinsei)(no)(ra)(i)(n) ( )a(a)n(n)d(d) ( )(re)(mi)(ri)(a)最高(saikou)(!)"); + expect(result).toEqual("(kan)(ji)(to)(re)(ta)(ra)(te)(o)(tsuna)(go)(u)(,)(kasa)(na)(ru)(no)(wa)人生(jinsei)(no)(ra)(i)(n) ( )a(a)n(n)d(d) ( )(re)(mi)(ri)(a)最高(saikō)(!)"); }); });