Skip to content

Commit

Permalink
Rename `char32_t w` to `codepoint`
Browse files Browse the repository at this point in the history
For readability. Also updates some comments to match the [C++ Style Guide](https://google.github.io/styleguide/cppguide.html#Punctuation,_Spelling_and_Grammar).

This patch has no behavior changes.

PiperOrigin-RevId: 639659016
  • Loading branch information
kojiishi authored and hiroyuki-komatsu committed Jun 3, 2024
1 parent 42cbb3f commit 7e818ac
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 85 deletions.
101 changes: 53 additions & 48 deletions src/base/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -791,27 +791,29 @@ bool Util::IsEnglishTransliteration(absl::string_view value) {
// script type
// TODO(yukawa, team): Make a mechanism to keep this classifier up-to-date
// based on the original data from Unicode.org.
Util::ScriptType Util::GetScriptType(char32_t w) {
if (INRANGE(w, 0x0030, 0x0039) || // ascii number
INRANGE(w, 0xFF10, 0xFF19)) { // full width number
Util::ScriptType Util::GetScriptType(char32_t codepoint) {
if (INRANGE(codepoint, 0x0030, 0x0039) || // ascii number
INRANGE(codepoint, 0xFF10, 0xFF19)) { // full width number
return NUMBER;
} else if (INRANGE(w, 0x0041, 0x005A) || // ascii upper
INRANGE(w, 0x0061, 0x007A) || // ascii lower
INRANGE(w, 0xFF21, 0xFF3A) || // fullwidth ascii upper
INRANGE(w, 0xFF41, 0xFF5A)) { // fullwidth ascii lower
} else if (INRANGE(codepoint, 0x0041, 0x005A) || // ascii upper
INRANGE(codepoint, 0x0061, 0x007A) || // ascii lower
INRANGE(codepoint, 0xFF21, 0xFF3A) || // fullwidth ascii upper
INRANGE(codepoint, 0xFF41, 0xFF5A)) { // fullwidth ascii lower
return ALPHABET;
} else if (w == 0x3005 || // IDEOGRAPHIC ITERATION MARK "々"
INRANGE(w, 0x3400,
} else if (codepoint == 0x3005 || // IDEOGRAPHIC ITERATION MARK "々"
INRANGE(codepoint, 0x3400,
0x4DBF) || // CJK Unified Ideographs Extension A
INRANGE(w, 0x4E00, 0x9FFF) || // CJK Unified Ideographs
INRANGE(w, 0xF900, 0xFAFF) || // CJK Compatibility Ideographs
INRANGE(w, 0x20000,
INRANGE(codepoint, 0x4E00, 0x9FFF) || // CJK Unified Ideographs
INRANGE(codepoint, 0xF900,
0xFAFF) || // CJK Compatibility Ideographs
INRANGE(codepoint, 0x20000,
0x2A6DF) || // CJK Unified Ideographs Extension B
INRANGE(w, 0x2A700,
INRANGE(codepoint, 0x2A700,
0x2B73F) || // CJK Unified Ideographs Extension C
INRANGE(w, 0x2B740,
INRANGE(codepoint, 0x2B740,
0x2B81F) || // CJK Unified Ideographs Extension D
INRANGE(w, 0x2F800, 0x2FA1F)) { // CJK Compatibility Ideographs
INRANGE(codepoint, 0x2F800,
0x2FA1F)) { // CJK Compatibility Ideographs
// As of Unicode 6.0.2, each block has the following characters assigned.
// [U+3400, U+4DB5]: CJK Unified Ideographs Extension A
// [U+4E00, U+9FCB]: CJK Unified Ideographs
Expand All @@ -821,51 +823,53 @@ Util::ScriptType Util::GetScriptType(char32_t w) {
// [U+2B740, U+2B81D]: CJK Unified Ideographs Extension D
// [U+2F800, U+2FA1D]: CJK Compatibility Ideographs
return KANJI;
} else if (INRANGE(w, 0x3041, 0x309F) || // hiragana
w == 0x1B001) { // HIRAGANA LETTER ARCHAIC YE
} else if (INRANGE(codepoint, 0x3041, 0x309F) || // hiragana
codepoint == 0x1B001) { // HIRAGANA LETTER ARCHAIC YE
return HIRAGANA;
} else if (INRANGE(w, 0x30A1, 0x30FF) || // full width katakana
INRANGE(w, 0x31F0,
} else if (INRANGE(codepoint, 0x30A1, 0x30FF) || // full width katakana
INRANGE(codepoint, 0x31F0,
0x31FF) || // Katakana Phonetic Extensions for Ainu
INRANGE(w, 0xFF65, 0xFF9F) || // half width katakana
w == 0x1B000) { // KATAKANA LETTER ARCHAIC E
INRANGE(codepoint, 0xFF65, 0xFF9F) || // half width katakana
codepoint == 0x1B000) { // KATAKANA LETTER ARCHAIC E
return KATAKANA;
} else if (INRANGE(w, 0x02300, 0x023F3) || // Miscellaneous Technical
INRANGE(w, 0x02700, 0x027BF) || // Dingbats
INRANGE(w, 0x1F000, 0x1F02F) || // Mahjong tiles
INRANGE(w, 0x1F030, 0x1F09F) || // Domino tiles
INRANGE(w, 0x1F0A0, 0x1F0FF) || // Playing cards
INRANGE(w, 0x1F100,
} else if (INRANGE(codepoint, 0x02300, 0x023F3) || // Miscellaneous Technical
INRANGE(codepoint, 0x02700, 0x027BF) || // Dingbats
INRANGE(codepoint, 0x1F000, 0x1F02F) || // Mahjong tiles
INRANGE(codepoint, 0x1F030, 0x1F09F) || // Domino tiles
INRANGE(codepoint, 0x1F0A0, 0x1F0FF) || // Playing cards
INRANGE(codepoint, 0x1F100,
0x1F2FF) || // Enclosed Alphanumeric Supplement
INRANGE(w, 0x1F200, 0x1F2FF) || // Enclosed Ideographic Supplement
INRANGE(w, 0x1F300,
INRANGE(codepoint, 0x1F200,
0x1F2FF) || // Enclosed Ideographic Supplement
INRANGE(codepoint, 0x1F300,
0x1F5FF) || // Miscellaneous Symbols And Pictographs
INRANGE(w, 0x1F600, 0x1F64F) || // Emoticons
INRANGE(w, 0x1F680, 0x1F6FF) || // Transport And Map Symbols
INRANGE(w, 0x1F700, 0x1F77F) || // Alchemical Symbols
w == 0x26CE) { // Ophiuchus
INRANGE(codepoint, 0x1F600, 0x1F64F) || // Emoticons
INRANGE(codepoint, 0x1F680,
0x1F6FF) || // Transport And Map Symbols
INRANGE(codepoint, 0x1F700, 0x1F77F) || // Alchemical Symbols
codepoint == 0x26CE) { // Ophiuchus
return EMOJI;
}

return UNKNOWN_SCRIPT;
}

Util::FormType Util::GetFormType(char32_t w) {
Util::FormType Util::GetFormType(char32_t codepoint) {
// 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
// http://www.unicode.org/reports/tr11/

// Characters marked as 'Na' in
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (INRANGE(w, 0x0020, 0x007F) || // ascii
INRANGE(w, 0x27E6, 0x27ED) || // narrow mathematical symbols
INRANGE(w, 0x2985, 0x2986)) { // narrow white parentheses
if (INRANGE(codepoint, 0x0020, 0x007F) || // ascii
INRANGE(codepoint, 0x27E6, 0x27ED) || // narrow mathematical symbols
INRANGE(codepoint, 0x2985, 0x2986)) { // narrow white parentheses
return HALF_WIDTH;
}

// Other characters marked as 'Na' in
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (INRANGE(w, 0x00A2, 0x00AF)) {
switch (w) {
if (INRANGE(codepoint, 0x00A2, 0x00AF)) {
switch (codepoint) {
case 0x00A2: // CENT SIGN
case 0x00A3: // POUND SIGN
case 0x00A5: // YEN SIGN
Expand All @@ -878,13 +882,13 @@ Util::FormType Util::GetFormType(char32_t w) {

// Characters marked as 'H' in
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (w == 0x20A9 || // WON SIGN
INRANGE(w, 0xFF61, 0xFF9F) || // half-width katakana
INRANGE(w, 0xFFA0, 0xFFBE) || // half-width hangul
INRANGE(w, 0xFFC2, 0xFFCF) || // half-width hangul
INRANGE(w, 0xFFD2, 0xFFD7) || // half-width hangul
INRANGE(w, 0xFFDA, 0xFFDC) || // half-width hangul
INRANGE(w, 0xFFE8, 0xFFEE)) { // half-width symbols
if (codepoint == 0x20A9 || // WON SIGN
INRANGE(codepoint, 0xFF61, 0xFF9F) || // half-width katakana
INRANGE(codepoint, 0xFFA0, 0xFFBE) || // half-width hangul
INRANGE(codepoint, 0xFFC2, 0xFFCF) || // half-width hangul
INRANGE(codepoint, 0xFFD2, 0xFFD7) || // half-width hangul
INRANGE(codepoint, 0xFFDA, 0xFFDC) || // half-width hangul
INRANGE(codepoint, 0xFFE8, 0xFFEE)) { // half-width symbols
return HALF_WIDTH;
}

Expand Down Expand Up @@ -970,9 +974,10 @@ Util::ScriptType Util::GetScriptTypeWithoutSymbols(absl::string_view str) {
// return true if all script_type in str is "type"
bool Util::IsScriptType(absl::string_view str, Util::ScriptType type) {
for (ConstChar32Iterator iter(str); !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
const char32_t codepoint = iter.Get();
// Exception: U+30FC (PROLONGED SOUND MARK) is categorized as HIRAGANA as well.
if (type != GetScriptType(w) && (w != 0x30FC || type != HIRAGANA)) {
if (type != GetScriptType(codepoint) &&
(codepoint != 0x30FC || type != HIRAGANA)) {
return false;
}
}
Expand Down
18 changes: 9 additions & 9 deletions src/base/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ class Util {
SCRIPT_TYPE_SIZE,
};

// return script type of w
static ScriptType GetScriptType(char32_t w);
// Returns the script type of `codepoint`.
static ScriptType GetScriptType(char32_t codepoint);

// Returns the script type of the first character in `str`.
// This function finds the first UTF-8 chars and returns its script type.
Expand All @@ -198,7 +198,7 @@ class Util {
static ScriptType GetFirstScriptType(absl::string_view str,
size_t *mblen = nullptr);

// return script type of string. all chars in str must be
// Returns the script type of a string. All chars in str must be
// KATAKANA/HIRAGANA/KANJI/NUMBER or ALPHABET.
// If str has mixed scripts, this function returns UNKNOWN_SCRIPT
static ScriptType GetScriptType(absl::string_view str);
Expand All @@ -207,10 +207,10 @@ class Util {
// in the |str|.
static ScriptType GetScriptTypeWithoutSymbols(absl::string_view str);

// return true if all script_type in str is "type"
// Returns true if all script_type in str is "type"
static bool IsScriptType(absl::string_view str, ScriptType type);

// return true if the string contains script_type char
// Returns true if the string contains script_type char
static bool ContainsScriptType(absl::string_view str, ScriptType type);

// See 'Unicode Standard Annex #11: EAST ASIAN WIDTH'
Expand All @@ -223,12 +223,12 @@ class Util {
FORM_TYPE_SIZE,
};

// return Form type of single character.
// Returns Form type of single character.
// This function never returns UNKNOWN_FORM.
static FormType GetFormType(char32_t w);
static FormType GetFormType(char32_t codepoint);

// return FormType of string.
// return UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
// Returns FormType of string.
// Returns UNKNOWN_FORM if |str| contains both HALF_WIDTH and FULL_WIDTH.
static FormType GetFormType(absl::string_view str);

// Returns true if all characters of `str` are ASCII (U+00 - U+7F).
Expand Down
7 changes: 4 additions & 3 deletions src/converter/converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,11 +177,12 @@ bool ExtractLastTokenWithScriptType(const absl::string_view text,
std::vector<char32_t> reverse_last_token;
Util::ScriptType last_script_type_found = Util::GetScriptType(iter.Get());
for (; !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
if ((w == ' ') || (Util::GetScriptType(w) != last_script_type_found)) {
const char32_t codepoint = iter.Get();
if ((codepoint == ' ') ||
(Util::GetScriptType(codepoint) != last_script_type_found)) {
break;
}
reverse_last_token.push_back(w);
reverse_last_token.push_back(codepoint);
}

*last_script_type = last_script_type_found;
Expand Down
8 changes: 4 additions & 4 deletions src/prediction/dictionary_prediction_aggregator_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,13 @@ void SetUpInputForSuggestionWithHistory(absl::string_view key,
void GenerateKeyEvents(absl::string_view text,
std::vector<commands::KeyEvent> *keys) {
keys->clear();
for (const char32_t w : Util::Utf8ToUtf32(text)) {
for (const char32_t codepoint : Util::Utf8ToUtf32(text)) {
commands::KeyEvent key;
if (w <= 0x7F) { // IsAscii, w is unsigned.
key.set_key_code(w);
if (codepoint <= 0x7F) { // IsAscii, codepoint is unsigned.
key.set_key_code(codepoint);
} else {
key.set_key_code('?');
*key.mutable_key_string() = Util::CodepointToUtf8(w);
*key.mutable_key_string() = Util::CodepointToUtf8(codepoint);
}
keys->push_back(key);
}
Expand Down
6 changes: 3 additions & 3 deletions src/prediction/user_history_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -722,9 +722,9 @@ bool UserHistoryPredictor::MaybeRomanMisspelledKey(
int num_hiragana = 0;
int num_unknown = 0;
for (ConstChar32Iterator iter(key); !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
const Util::ScriptType type = Util::GetScriptType(w);
if (type == Util::HIRAGANA || w == 0x30FC) { // "ー".
const char32_t codepoint = iter.Get();
const Util::ScriptType type = Util::GetScriptType(codepoint);
if (type == Util::HIRAGANA || codepoint == 0x30FC) { // "ー".
++num_hiragana;
continue;
}
Expand Down
6 changes: 3 additions & 3 deletions src/prediction/user_history_predictor_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2821,9 +2821,9 @@ void InitSegmentsFromInputSequence(const absl::string_view text,
DCHECK(segments);
for (const UnicodeChar ch : Utf8AsUnicodeChar(text)) {
commands::KeyEvent key;
const char32_t w = ch.char32();
if (w <= 0x7F) { // IsAscii, w is unsigned.
key.set_key_code(w);
const char32_t codepoint = ch.char32();
if (codepoint <= 0x7F) { // IsAscii, codepoint is unsigned.
key.set_key_code(codepoint);
} else {
key.set_key_code('?');
key.set_key_string(ch.utf8());
Expand Down
5 changes: 2 additions & 3 deletions src/rewriter/collocation_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,8 @@ bool IsNaturalContent(const Segment::Candidate &cand,

// special cases
if (top_content_len == 1) {
const char32_t wchar = Util::Utf8ToCodepoint(top_content);

switch (wchar) {
const char32_t codepoint = Util::Utf8ToCodepoint(top_content);
switch (codepoint) {
case 0x304a: // "お"
case 0x5fa1: // "御"
case 0x3054: // "ご"
Expand Down
16 changes: 8 additions & 8 deletions src/rewriter/collocation_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,14 @@ void CollocationUtil::RemoveExtraCharacters(const absl::string_view input,
bool remove_number,
std::string *output) {
for (ConstChar32Iterator iter(input); !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
if (((Util::GetScriptType(w) != Util::UNKNOWN_SCRIPT) &&
(!remove_number || !IsNumber(w))) ||
w == 0x3005 || // "々"
w == 0x0025 || w == 0xFF05 || // "%", "%"
w == 0x3006 || // "〆"
w == 0x301C || w == 0xFF5E) { // "〜", "~"
Util::CodepointToUtf8Append(w, output);
const char32_t codepoint = iter.Get();
if (((Util::GetScriptType(codepoint) != Util::UNKNOWN_SCRIPT) &&
(!remove_number || !IsNumber(codepoint))) ||
codepoint == 0x3005 || // "々"
codepoint == 0x0025 || codepoint == 0xFF05 || // "%", "%"
codepoint == 0x3006 || // "〆"
codepoint == 0x301C || codepoint == 0xFF5E) { // "〜", "~"
Util::CodepointToUtf8Append(codepoint, output);
}
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/rewriter/usage_rewriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,19 +110,19 @@ std::string UsageRewriter::GetKanjiPrefixAndOneHiragana(
bool has_kanji = false;
bool has_hiragana = false;
for (ConstChar32Iterator iter(word); !iter.Done(); iter.Next()) {
const char32_t w = iter.Get();
const Util::ScriptType s = Util::GetScriptType(w);
const char32_t codepoint = iter.Get();
const Util::ScriptType s = Util::GetScriptType(codepoint);
if (pos == 0 && s != Util::KANJI) {
return "";
} else if (pos >= 0 && pos <= 1 && s == Util::KANJI) {
// length of kanji <= 2.
has_kanji = true;
++pos;
Util::CodepointToUtf8Append(w, &result);
Util::CodepointToUtf8Append(codepoint, &result);
continue;
} else if (pos > 0 && s == Util::HIRAGANA) {
has_hiragana = true;
Util::CodepointToUtf8Append(w, &result);
Util::CodepointToUtf8Append(codepoint, &result);
break;
} else {
return "";
Expand Down

0 comments on commit 7e818ac

Please sign in to comment.