diff --git a/src/base/util.cc b/src/base/util.cc index b08084370..55daee5be 100644 --- a/src/base/util.cc +++ b/src/base/util.cc @@ -791,27 +791,29 @@ bool Util::IsEnglishTransliteration(absl::string_view value) { // script type // TODO(yukawa, team): Make a mechanism to keep this classifier up-to-date // based on the original data from Unicode.org. -Util::ScriptType Util::GetScriptType(char32_t w) { - if (INRANGE(w, 0x0030, 0x0039) || // ascii number - INRANGE(w, 0xFF10, 0xFF19)) { // full width number +Util::ScriptType Util::GetScriptType(char32_t codepoint) { + if (INRANGE(codepoint, 0x0030, 0x0039) || // ascii number + INRANGE(codepoint, 0xFF10, 0xFF19)) { // full width number return NUMBER; - } else if (INRANGE(w, 0x0041, 0x005A) || // ascii upper - INRANGE(w, 0x0061, 0x007A) || // ascii lower - INRANGE(w, 0xFF21, 0xFF3A) || // fullwidth ascii upper - INRANGE(w, 0xFF41, 0xFF5A)) { // fullwidth ascii lower + } else if (INRANGE(codepoint, 0x0041, 0x005A) || // ascii upper + INRANGE(codepoint, 0x0061, 0x007A) || // ascii lower + INRANGE(codepoint, 0xFF21, 0xFF3A) || // fullwidth ascii upper + INRANGE(codepoint, 0xFF41, 0xFF5A)) { // fullwidth ascii lower return ALPHABET; - } else if (w == 0x3005 || // IDEOGRAPHIC ITERATION MARK "々" - INRANGE(w, 0x3400, + } else if (codepoint == 0x3005 || // IDEOGRAPHIC ITERATION MARK "々" + INRANGE(codepoint, 0x3400, 0x4DBF) || // CJK Unified Ideographs Extension A - INRANGE(w, 0x4E00, 0x9FFF) || // CJK Unified Ideographs - INRANGE(w, 0xF900, 0xFAFF) || // CJK Compatibility Ideographs - INRANGE(w, 0x20000, + INRANGE(codepoint, 0x4E00, 0x9FFF) || // CJK Unified Ideographs + INRANGE(codepoint, 0xF900, + 0xFAFF) || // CJK Compatibility Ideographs + INRANGE(codepoint, 0x20000, 0x2A6DF) || // CJK Unified Ideographs Extension B - INRANGE(w, 0x2A700, + INRANGE(codepoint, 0x2A700, 0x2B73F) || // CJK Unified Ideographs Extension C - INRANGE(w, 0x2B740, + INRANGE(codepoint, 0x2B740, 0x2B81F) || // CJK Unified Ideographs Extension D - 
INRANGE(w, 0x2F800, 0x2FA1F)) { // CJK Compatibility Ideographs + INRANGE(codepoint, 0x2F800, + 0x2FA1F)) { // CJK Compatibility Ideographs // As of Unicode 6.0.2, each block has the following characters assigned. // [U+3400, U+4DB5]: CJK Unified Ideographs Extension A // [U+4E00, U+9FCB]: CJK Unified Ideographs @@ -821,51 +823,53 @@ Util::ScriptType Util::GetScriptType(char32_t w) { // [U+2B740, U+2B81D]: CJK Unified Ideographs Extension D // [U+2F800, U+2FA1D]: CJK Compatibility Ideographs return KANJI; - } else if (INRANGE(w, 0x3041, 0x309F) || // hiragana - w == 0x1B001) { // HIRAGANA LETTER ARCHAIC YE + } else if (INRANGE(codepoint, 0x3041, 0x309F) || // hiragana + codepoint == 0x1B001) { // HIRAGANA LETTER ARCHAIC YE return HIRAGANA; - } else if (INRANGE(w, 0x30A1, 0x30FF) || // full width katakana - INRANGE(w, 0x31F0, + } else if (INRANGE(codepoint, 0x30A1, 0x30FF) || // full width katakana + INRANGE(codepoint, 0x31F0, 0x31FF) || // Katakana Phonetic Extensions for Ainu - INRANGE(w, 0xFF65, 0xFF9F) || // half width katakana - w == 0x1B000) { // KATAKANA LETTER ARCHAIC E + INRANGE(codepoint, 0xFF65, 0xFF9F) || // half width katakana + codepoint == 0x1B000) { // KATAKANA LETTER ARCHAIC E return KATAKANA; - } else if (INRANGE(w, 0x02300, 0x023F3) || // Miscellaneous Technical - INRANGE(w, 0x02700, 0x027BF) || // Dingbats - INRANGE(w, 0x1F000, 0x1F02F) || // Mahjong tiles - INRANGE(w, 0x1F030, 0x1F09F) || // Domino tiles - INRANGE(w, 0x1F0A0, 0x1F0FF) || // Playing cards - INRANGE(w, 0x1F100, + } else if (INRANGE(codepoint, 0x02300, 0x023F3) || // Miscellaneous Technical + INRANGE(codepoint, 0x02700, 0x027BF) || // Dingbats + INRANGE(codepoint, 0x1F000, 0x1F02F) || // Mahjong tiles + INRANGE(codepoint, 0x1F030, 0x1F09F) || // Domino tiles + INRANGE(codepoint, 0x1F0A0, 0x1F0FF) || // Playing cards + INRANGE(codepoint, 0x1F100, 0x1F2FF) || // Enclosed Alphanumeric Supplement - INRANGE(w, 0x1F200, 0x1F2FF) || // Enclosed Ideographic Supplement - INRANGE(w, 0x1F300, 
+ INRANGE(codepoint, 0x1F200, + 0x1F2FF) || // Enclosed Ideographic Supplement + INRANGE(codepoint, 0x1F300, 0x1F5FF) || // Miscellaneous Symbols And Pictographs - INRANGE(w, 0x1F600, 0x1F64F) || // Emoticons - INRANGE(w, 0x1F680, 0x1F6FF) || // Transport And Map Symbols - INRANGE(w, 0x1F700, 0x1F77F) || // Alchemical Symbols - w == 0x26CE) { // Ophiuchus + INRANGE(codepoint, 0x1F600, 0x1F64F) || // Emoticons + INRANGE(codepoint, 0x1F680, + 0x1F6FF) || // Transport And Map Symbols + INRANGE(codepoint, 0x1F700, 0x1F77F) || // Alchemical Symbols + codepoint == 0x26CE) { // Ophiuchus return EMOJI; } return UNKNOWN_SCRIPT; } -Util::FormType Util::GetFormType(char32_t w) { +Util::FormType Util::GetFormType(char32_t codepoint) { // 'Unicode Standard Annex #11: EAST ASIAN WIDTH' // http://www.unicode.org/reports/tr11/ // Characters marked as 'Na' in // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt - if (INRANGE(w, 0x0020, 0x007F) || // ascii - INRANGE(w, 0x27E6, 0x27ED) || // narrow mathematical symbols - INRANGE(w, 0x2985, 0x2986)) { // narrow white parentheses + if (INRANGE(codepoint, 0x0020, 0x007F) || // ascii + INRANGE(codepoint, 0x27E6, 0x27ED) || // narrow mathematical symbols + INRANGE(codepoint, 0x2985, 0x2986)) { // narrow white parentheses return HALF_WIDTH; } // Other characters marked as 'Na' in // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt - if (INRANGE(w, 0x00A2, 0x00AF)) { - switch (w) { + if (INRANGE(codepoint, 0x00A2, 0x00AF)) { + switch (codepoint) { case 0x00A2: // CENT SIGN case 0x00A3: // POUND SIGN case 0x00A5: // YEN SIGN @@ -878,13 +882,13 @@ Util::FormType Util::GetFormType(char32_t w) { // Characters marked as 'H' in // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt - if (w == 0x20A9 || // WON SIGN - INRANGE(w, 0xFF61, 0xFF9F) || // half-width katakana - INRANGE(w, 0xFFA0, 0xFFBE) || // half-width hangul - INRANGE(w, 0xFFC2, 0xFFCF) || // half-width hangul - INRANGE(w, 0xFFD2, 0xFFD7) || // half-width hangul - 
INRANGE(w, 0xFFDA, 0xFFDC) || // half-width hangul - INRANGE(w, 0xFFE8, 0xFFEE)) { // half-width symbols + if (codepoint == 0x20A9 || // WON SIGN + INRANGE(codepoint, 0xFF61, 0xFF9F) || // half-width katakana + INRANGE(codepoint, 0xFFA0, 0xFFBE) || // half-width hangul + INRANGE(codepoint, 0xFFC2, 0xFFCF) || // half-width hangul + INRANGE(codepoint, 0xFFD2, 0xFFD7) || // half-width hangul + INRANGE(codepoint, 0xFFDA, 0xFFDC) || // half-width hangul + INRANGE(codepoint, 0xFFE8, 0xFFEE)) { // half-width symbols return HALF_WIDTH; } @@ -970,9 +974,10 @@ Util::ScriptType Util::GetScriptTypeWithoutSymbols(absl::string_view str) { // return true if all script_type in str is "type" bool Util::IsScriptType(absl::string_view str, Util::ScriptType type) { for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) { - const char32_t w = iter.Get(); + const char32_t codepoint = iter.Get(); // Exception: 30FC (PROLONGEDSOUND MARK is categorized as HIRAGANA as well) - if (type != GetScriptType(w) && (w != 0x30FC || type != HIRAGANA)) { + if (type != GetScriptType(codepoint) && + (codepoint != 0x30FC || type != HIRAGANA)) { return false; } } diff --git a/src/base/util.h b/src/base/util.h index 998bba842..1bd5d71e1 100644 --- a/src/base/util.h +++ b/src/base/util.h @@ -188,8 +188,8 @@ class Util { SCRIPT_TYPE_SIZE, }; - // return script type of w - static ScriptType GetScriptType(char32_t w); + // Returns the script type of `codepoint`. + static ScriptType GetScriptType(char32_t codepoint); // Returns the script type of the first character in `str`. // This function finds the first UTF-8 chars and returns its script type. @@ -198,7 +198,7 @@ class Util { static ScriptType GetFirstScriptType(absl::string_view str, size_t *mblen = nullptr); - // return script type of string. all chars in str must be + // Returns the script type of a string. All chars in str must be // KATAKANA/HIRAGANA/KANJI/NUMBER or ALPHABET. 
// If str has mixed scripts, this function returns UNKNOWN_SCRIPT static ScriptType GetScriptType(absl::string_view str); @@ -207,10 +207,10 @@ class Util { // in the |str|. static ScriptType GetScriptTypeWithoutSymbols(absl::string_view str); - // return true if all script_type in str is "type" + // Returns true if every character in `str` is of script type `type`. static bool IsScriptType(absl::string_view str, ScriptType type); - // return true if the string contains script_type char + // Returns true if `str` contains a character of script type `type`. static bool ContainsScriptType(absl::string_view str, ScriptType type); // See 'Unicode Standard Annex #11: EAST ASIAN WIDTH' @@ -223,12 +223,12 @@ class Util { FORM_TYPE_SIZE, }; - // return Form type of single character. + // Returns the form type of a single character. // This function never returns UNKNOWN_FORM. - static FormType GetFormType(char32_t w); + static FormType GetFormType(char32_t codepoint); - // return FormType of string. - // return UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH. + // Returns the FormType of a string. + // Returns UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH. static FormType GetFormType(absl::string_view str); // Returns true if all characters of `str` are ASCII (U+00 - U+7F). 
diff --git a/src/converter/converter.cc b/src/converter/converter.cc index 53b427df7..bf554bbbf 100644 --- a/src/converter/converter.cc +++ b/src/converter/converter.cc @@ -177,11 +177,12 @@ bool ExtractLastTokenWithScriptType(const absl::string_view text, std::vector reverse_last_token; Util::ScriptType last_script_type_found = Util::GetScriptType(iter.Get()); for (; !iter.Done(); iter.Next()) { - const char32_t w = iter.Get(); - if ((w == ' ') || (Util::GetScriptType(w) != last_script_type_found)) { + const char32_t codepoint = iter.Get(); + if ((codepoint == ' ') || + (Util::GetScriptType(codepoint) != last_script_type_found)) { break; } - reverse_last_token.push_back(w); + reverse_last_token.push_back(codepoint); } *last_script_type = last_script_type_found; diff --git a/src/prediction/dictionary_prediction_aggregator_test.cc b/src/prediction/dictionary_prediction_aggregator_test.cc index b2b73f346..af21fa46f 100644 --- a/src/prediction/dictionary_prediction_aggregator_test.cc +++ b/src/prediction/dictionary_prediction_aggregator_test.cc @@ -291,13 +291,13 @@ void SetUpInputForSuggestionWithHistory(absl::string_view key, void GenerateKeyEvents(absl::string_view text, std::vector *keys) { keys->clear(); - for (const char32_t w : Util::Utf8ToUtf32(text)) { + for (const char32_t codepoint : Util::Utf8ToUtf32(text)) { commands::KeyEvent key; - if (w <= 0x7F) { // IsAscii, w is unsigned. - key.set_key_code(w); + if (codepoint <= 0x7F) { // IsAscii, codepoint is unsigned. 
+ key.set_key_code(codepoint); } else { key.set_key_code('?'); - *key.mutable_key_string() = Util::CodepointToUtf8(w); + *key.mutable_key_string() = Util::CodepointToUtf8(codepoint); } keys->push_back(key); } diff --git a/src/prediction/user_history_predictor.cc b/src/prediction/user_history_predictor.cc index 1831b8b4a..77eeaeb5f 100644 --- a/src/prediction/user_history_predictor.cc +++ b/src/prediction/user_history_predictor.cc @@ -722,9 +722,9 @@ bool UserHistoryPredictor::MaybeRomanMisspelledKey( int num_hiragana = 0; int num_unknown = 0; for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) { - const char32_t w = iter.Get(); - const Util::ScriptType type = Util::GetScriptType(w); - if (type == Util::HIRAGANA || w == 0x30FC) { // "ー". + const char32_t codepoint = iter.Get(); + const Util::ScriptType type = Util::GetScriptType(codepoint); + if (type == Util::HIRAGANA || codepoint == 0x30FC) { // "ー". ++num_hiragana; continue; } diff --git a/src/prediction/user_history_predictor_test.cc b/src/prediction/user_history_predictor_test.cc index 4a4de19d9..45841493e 100644 --- a/src/prediction/user_history_predictor_test.cc +++ b/src/prediction/user_history_predictor_test.cc @@ -2821,9 +2821,9 @@ void InitSegmentsFromInputSequence(const absl::string_view text, DCHECK(segments); for (const UnicodeChar ch : Utf8AsUnicodeChar(text)) { commands::KeyEvent key; - const char32_t w = ch.char32(); - if (w <= 0x7F) { // IsAscii, w is unsigned. - key.set_key_code(w); + const char32_t codepoint = ch.char32(); + if (codepoint <= 0x7F) { // IsAscii, codepoint is unsigned. 
+ key.set_key_code(codepoint); } else { key.set_key_code('?'); key.set_key_string(ch.utf8()); diff --git a/src/rewriter/collocation_rewriter.cc b/src/rewriter/collocation_rewriter.cc index 65f26b002..72b2ff786 100644 --- a/src/rewriter/collocation_rewriter.cc +++ b/src/rewriter/collocation_rewriter.cc @@ -273,9 +273,8 @@ bool IsNaturalContent(const Segment::Candidate &cand, // special cases if (top_content_len == 1) { - const char32_t wchar = Util::Utf8ToCodepoint(top_content); - - switch (wchar) { + const char32_t codepoint = Util::Utf8ToCodepoint(top_content); + switch (codepoint) { case 0x304a: // "お" case 0x5fa1: // "御" case 0x3054: // "ご" diff --git a/src/rewriter/collocation_util.cc b/src/rewriter/collocation_util.cc index 10cd69ec9..b722349ae 100644 --- a/src/rewriter/collocation_util.cc +++ b/src/rewriter/collocation_util.cc @@ -77,14 +77,14 @@ void CollocationUtil::RemoveExtraCharacters(const absl::string_view input, bool remove_number, std::string *output) { for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) { - const char32_t w = iter.Get(); - if (((Util::GetScriptType(w) != Util::UNKNOWN_SCRIPT) && - (!remove_number || !IsNumber(w))) || - w == 0x3005 || // "々" - w == 0x0025 || w == 0xFF05 || // "%", "%" - w == 0x3006 || // "〆" - w == 0x301C || w == 0xFF5E) { // "〜", "~" - Util::CodepointToUtf8Append(w, output); + const char32_t codepoint = iter.Get(); + if (((Util::GetScriptType(codepoint) != Util::UNKNOWN_SCRIPT) && + (!remove_number || !IsNumber(codepoint))) || + codepoint == 0x3005 || // "々" + codepoint == 0x0025 || codepoint == 0xFF05 || // "%", "%" + codepoint == 0x3006 || // "〆" + codepoint == 0x301C || codepoint == 0xFF5E) { // "〜", "~" + Util::CodepointToUtf8Append(codepoint, output); } } } diff --git a/src/rewriter/usage_rewriter.cc b/src/rewriter/usage_rewriter.cc index 85fe241d2..c2cce3fbf 100644 --- a/src/rewriter/usage_rewriter.cc +++ b/src/rewriter/usage_rewriter.cc @@ -110,19 +110,19 @@ std::string 
UsageRewriter::GetKanjiPrefixAndOneHiragana( bool has_kanji = false; bool has_hiragana = false; for (ConstChar32Iterator iter(word); !iter.Done(); iter.Next()) { - const char32_t w = iter.Get(); - const Util::ScriptType s = Util::GetScriptType(w); + const char32_t codepoint = iter.Get(); + const Util::ScriptType s = Util::GetScriptType(codepoint); if (pos == 0 && s != Util::KANJI) { return ""; } else if (pos >= 0 && pos <= 1 && s == Util::KANJI) { // length of kanji <= 2. has_kanji = true; ++pos; - Util::CodepointToUtf8Append(w, &result); + Util::CodepointToUtf8Append(codepoint, &result); continue; } else if (pos > 0 && s == Util::HIRAGANA) { has_hiragana = true; - Util::CodepointToUtf8Append(w, &result); + Util::CodepointToUtf8Append(codepoint, &result); break; } else { return "";