From 1e0356be9ae3c25920e17a65ab6eefd8677f1967 Mon Sep 17 00:00:00 2001
From: Xin Hu
Date: Thu, 31 Dec 2015 01:11:38 -0500
Subject: [PATCH] Use bit vector to store CESU-8 lookup table, to improve
 lit_get_unicode_char_size_by_utf8_first_byte performance.

JerryScript-DCO-1.0-Signed-off-by: Xin Hu Xin.A.Hu@intel.com
---
 jerry-core/lit/lit-strings.cpp | 47 ++++++++++++++++++++++++----------
 jerry-core/lit/lit-strings.h   |  3 +++
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/jerry-core/lit/lit-strings.cpp b/jerry-core/lit/lit-strings.cpp
index 0392181456..5ac0998bf0 100644
--- a/jerry-core/lit/lit-strings.cpp
+++ b/jerry-core/lit/lit-strings.cpp
@@ -757,6 +757,17 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri
   return code_unit;
 } /* lit_utf8_string_code_unit_at */
 
+/* CESU-8 number of bytes occupied lookup table */
+#ifndef __LITTLE_ENDIAN
+const __attribute__ ((aligned (CESU_8_TABLE_MEM_ALIGNMENT))) lit_utf8_byte_t table[]
+{
+  1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0,
+  2, 2,
+  3, 0
+};
+#endif
+
 /**
  * Get CESU-8 encoded size of character
  *
@@ -765,19 +776,29 @@ lit_utf8_string_code_unit_at (const lit_utf8_byte_t *utf8_buf_p, /**< utf-8 stri
 lit_utf8_size_t
 lit_get_unicode_char_size_by_utf8_first_byte (const lit_utf8_byte_t first_byte) /**< buffer with characters */
 {
-  if ((first_byte & LIT_UTF8_1_BYTE_MASK) == LIT_UTF8_1_BYTE_MARKER)
-  {
-    return 1;
-  }
-  else if ((first_byte & LIT_UTF8_2_BYTE_MASK) == LIT_UTF8_2_BYTE_MARKER)
-  {
-    return 2;
-  }
-  else
-  {
-    JERRY_ASSERT ((first_byte & LIT_UTF8_3_BYTE_MASK) == LIT_UTF8_3_BYTE_MARKER);
-    return 3;
-  }
+  JERRY_ASSERT (((first_byte >> 4) <= 7 || (first_byte >> 4) == 12 ||
+                 (first_byte >> 4) == 13 || (first_byte >> 4) == 14));
+
+#ifdef __LITTLE_ENDIAN
+  //compact CESU-8 length lookup table into an uint32_t, every two bits represent one item
+  //const lit_utf8_byte_t table[]
+  //{
+  //  1, 1, 1, 1, 1, 1, 1, 1,
+  //  0, 0, 0, 0,
+  //  2, 2,
+  //  3, 0
+  //};
+  //                                   MSB          --->          LSB
+  // on little endian platform, it is 00 11 10 10 00 00 00 00 01 01 01 01 01 01 01 01
+  //                      table index: 15 14 13 12 11 10 9  8  7  6  5  4  3  2  1  0
+
+  const uint32_t cesu_8_store = 0x3a005555;
+  int shift = (first_byte >> 4) << 1;
+
+  return (cesu_8_store >> shift) & 0x3;
+#else
+  return table[first_byte >> 4];
+#endif
 } /* lit_get_unicode_char_size_by_utf8_first_byte */
 
 /**
diff --git a/jerry-core/lit/lit-strings.h b/jerry-core/lit/lit-strings.h
index f40ca34506..1fa9d4d8d7 100644
--- a/jerry-core/lit/lit-strings.h
+++ b/jerry-core/lit/lit-strings.h
@@ -157,6 +157,9 @@ lit_string_hash_t lit_utf8_string_calc_hash (const lit_utf8_byte_t *, lit_utf8_s
 lit_string_hash_t lit_utf8_string_hash_combine (lit_string_hash_t, const lit_utf8_byte_t *, lit_utf8_size_t);
 
 /* code unit access */
+#ifndef __LITTLE_ENDIAN
+#define CESU_8_TABLE_MEM_ALIGNMENT 16
+#endif
 ecma_char_t lit_utf8_string_code_unit_at (const lit_utf8_byte_t *, lit_utf8_size_t, ecma_length_t);
 lit_utf8_size_t lit_get_unicode_char_size_by_utf8_first_byte (lit_utf8_byte_t);
 