diff --git a/.gitignore b/.gitignore index 8ae6ac11..f5a342d7 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ Gemfile.lock .DS_Store */**/Makefile */**/*.o +*/**/extconf.h */**/*.class */**/*.jar .byebug_history diff --git a/Rakefile b/Rakefile index 1e68d2ae..8f2f97f1 100644 --- a/Rakefile +++ b/Rakefile @@ -86,7 +86,7 @@ end file EXT_GENERATOR_DL => EXT_GENERATOR_SRC do cd EXT_GENERATOR_DIR do - ruby 'extconf.rb' + ruby "extconf.rb #{ENV['JSON_GENERATOR_CONFIGURE_OPTS']}" sh MAKE end cp "#{EXT_GENERATOR_DIR}/generator.#{CONFIG['DLEXT']}", EXT_ROOT_DIR diff --git a/benchmark/encoder-simple.rb b/benchmark/encoder-simple.rb new file mode 100644 index 00000000..cf3d380c --- /dev/null +++ b/benchmark/encoder-simple.rb @@ -0,0 +1,58 @@ +require "benchmark/ips" +require "json" +require "date" +require "oj" + +Oj.default_options = Oj.default_options.merge(mode: :compat) + +if ENV["ONLY"] + RUN = ENV["ONLY"].split(/[,: ]/).map{|x| [x.to_sym, true] }.to_h + RUN.default = false +elsif ENV["EXCEPT"] + RUN = ENV["EXCEPT"].split(/[,: ]/).map{|x| [x.to_sym, false] }.to_h + RUN.default = true +else + RUN = Hash.new(true) +end + +def implementations(ruby_obj) + state = JSON::State.new(JSON.dump_default_options) + { + json: ["json", proc { JSON.generate(ruby_obj) }], + oj: ["oj", proc { Oj.dump(ruby_obj) }], + } +end + +def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: []) + json_output = JSON.dump(ruby_obj) + puts "== Encoding #{benchmark_name} (#{json_output.bytesize} bytes)" + + impls = implementations(ruby_obj).select { |name| RUN[name] } + except.each { |i| impls.delete(i) } + + Benchmark.ips do |x| + expected = ::JSON.dump(ruby_obj) if check_expected + impls.values.each do |name, block| + begin + result = block.call + if check_expected && expected != result + puts "#{name} does not match expected output. 
Skipping" + puts "Expected:" + '-' * 40 + puts expected + puts "Actual:" + '-' * 40 + puts result + puts '-' * 40 + next + end + rescue => error + puts "#{name} unsupported (#{error})" + next + end + x.report(name, &block) + end + x.compare!(order: :baseline) + end + puts +end + +benchmark_encoding "long string", (["this is a test of the emergency broadcast system."*5]*500) \ No newline at end of file diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 078068cf..5ed26fda 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -6,5 +6,55 @@ else append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" + + if enable_config('use-simd', default=true) + if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + # Define ENABLE_SIMD only when arm_neon.h is available, so the NEON path is never built without its header. + if have_header('arm_neon.h') + $defs.push("-DENABLE_SIMD") + # Try to compile a small program using NEON instructions + have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include <arm_neon.h> + int main() { + uint8x16_t test = vdupq_n_u8(32); + return 0; + } + SRC + + have_type('uint8x8_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include <arm_neon.h> + int main() { + uint8x8_t test = vdup_n_u8(32); + return 0; + } + SRC + end + elsif have_header('x86intrin.h') + + if have_type('__m256i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') + #include <x86intrin.h> + int main() { + __m256i test = _mm256_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + end + + if have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-msse4.2') + #include <x86intrin.h> + int main() { + __m128i test = _mm_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") unless $defs.include?('-DENABLE_SIMD') + end + end + + have_header('cpuid.h') + end + + create_header create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 5006b785..745a3ce1 100644 --- a/ext/json/ext/generator/generator.c +++ 
b/ext/json/ext/generator/generator.c @@ -4,6 +4,9 @@ #include #include +#include "extconf.h" +#include "simd.h" + /* ruby api and some helpers */ typedef struct JSON_Generator_StateStruct { @@ -33,6 +36,11 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_e static ID sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_nl, sym_max_nesting, sym_allow_nan, sym_ascii_only, sym_depth, sym_buffer_initial_length, sym_script_safe, sym_escape_slash, sym_strict; +static void (*convert_UTF8_to_JSON_impl)(FBuffer *, VALUE, const unsigned char escape_table[256]); + +#ifdef ENABLE_SIMD +static void (*convert_UTF8_to_JSON_simd_kernel)(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]); +#endif #define GET_STATE_TO(self, state) \ TypedData_Get_Struct(self, JSON_Generator_State, &JSON_Generator_State_type, state) @@ -179,7 +187,57 @@ static const unsigned char script_safe_escape_table[256] = { * Everything else (should be UTF-8) is just passed through and * appended to the result. 
*/ -static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) +#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; +#define PROCESS_BYTE if (RB_UNLIKELY(ch_len)) { \ + switch (ch_len) { \ + case 9: { \ + FLUSH_POS(1); \ + switch (ch) { \ + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; \ + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; \ + case '/': fbuffer_append(out_buffer, "\\/", 2); break; \ + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; \ + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; \ + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; \ + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; \ + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; \ + default: { \ + scratch[2] = '0'; \ + scratch[3] = '0'; \ + scratch[4] = hexdig[(ch >> 4) & 0xf]; \ + scratch[5] = hexdig[ch & 0xf]; \ + fbuffer_append(out_buffer, scratch, 6); \ + break; \ + } \ + } \ + break; \ + } \ + case 11: { \ + unsigned char b2 = ptr[pos + 1]; \ + if (RB_UNLIKELY(b2 == 0x80)) { \ + unsigned char b3 = ptr[pos + 2]; \ + if (b3 == 0xA8) { \ + FLUSH_POS(3); \ + fbuffer_append(out_buffer, "\\u2028", 6); \ + break; \ + } else if (b3 == 0xA9) { \ + FLUSH_POS(3); \ + fbuffer_append(out_buffer, "\\u2029", 6); \ + break; \ + } \ + } \ + ch_len = 3; \ + } \ + default: \ + pos += ch_len; \ + break; \ + } \ + } else { \ + pos++; \ + } + +#ifdef ENABLE_SIMD +static void convert_UTF8_to_JSON_simd(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -189,63 +247,462 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un unsigned long beg = 0, pos = 0; -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; - + 
convert_UTF8_to_JSON_simd_kernel(out_buffer, ptr, len, &beg, &pos, hexdig, scratch, escape_table); + while (pos < len) { unsigned char ch = ptr[pos]; unsigned char ch_len = escape_table[ch]; - /* JSON encoding */ + PROCESS_BYTE; + } - if (RB_UNLIKELY(ch_len)) { - switch (ch_len) { - case 9: { - FLUSH_POS(1); - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: { - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - break; - } - } - break; + if (beg < len) { + fbuffer_append(out_buffer, &ptr[beg], len - beg); + } + + RB_GC_GUARD(str); +} +#endif + +#ifdef HAVE_SIMD_NEON + +void convert_UTF8_to_JSON_simd_kernel_neon(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) { + unsigned long beg = *_beg, pos = *_pos; + + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); + + if (escape_table == script_safe_escape_table) { + /* + * This works almost exactly the same as what is described above. The difference in this case comes after we know + * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, + * some bytes need to be handled differently. 
+ * + * Since we know each byte in chunk can only match a single case, we logical AND each of the too_low, has_backslash, + * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2, 0x4 and 0x8 respectively) and combine + * the results with a logical OR. + * + * Now we loop over the result vector and switch on the particular pattern we just created. If we find a + * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct + * action. + */ + const uint8x16_t upper_bound = vdupq_n_u8('~'); + const uint8x16_t forward_slash = vdupq_n_u8('/'); + + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); + + uint8x16_t needs_escape = vorrq_u8(too_low, too_high); + uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); + needs_escape = vorrq_u8(needs_escape, has_escaped_char); + + if (vmaxvq_u8(needs_escape) == 0) { + pos += 16; + continue; + } + + uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1)); + tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2))); + tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4))); + tmp = vorrq_u8(tmp, vandq_u8(has_forward_slash, vdupq_n_u8(0x8))); + + uint8_t arr[16]; + vst1q_u8(arr, tmp); + + for (int i = 0; i < 16; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; + break; + default: + ch_len = script_safe_escape_table[ch]; + } - case 11: { - unsigned char b2 = ptr[pos + 1]; - if (RB_UNLIKELY(b2 == 0x80)) { - unsigned char b3 = ptr[pos + 2]; - if (b3 == 0xA8) { - FLUSH_POS(3); - fbuffer_append(out_buffer, 
"\\u2028", 6); - break; - } else if (b3 == 0xA9) { - FLUSH_POS(3); - fbuffer_append(out_buffer, "\\u2029", 6); - break; - } - } - ch_len = 3; - // fallthrough + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); + } + } + } else { + /* + * The code below implements a SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanation will be limited to the first 8 bytes of the string for simplicity. However + * the vector instructions may work on larger vectors. + * + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote' in vector registers. + * + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + * + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + * + * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + * + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + * + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + * + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + * + * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of + * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be + * escaped. + * + * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and + * we advance pos by the width of the vector. 
+ * + * To determine how to escape characters, we look at each value in the needs_escape vector and take + * the appropriate action. + */ + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + if (vmaxvq_u8(needs_escape) == 0) { + pos += 16; + continue; + } + + /* + * TODO Consider making another type simd_vec_mask. The reason being on x86 we can use _mm_movemask_epi8 + * to get a mask rather than storing the vector to memory. + * + * We would need another function like simd_vec_mask_position_set(mask, pos) which returns true + * if the bit/byte (implementation defined) at position 'pos' is non-zero. + */ + + uint8_t arr[16]; + vst1q_u8(arr, needs_escape); + + for (int i = 0; i < 16; i++) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + + // This must remain in sync with the array `escape_table`. 
+ if (RB_UNLIKELY(ch_len)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; } - default: - pos += ch_len; - break; } - } else { - pos++; } } -#undef FLUSH_POS + + *_beg = beg; + *_pos = pos; +} + +#endif /* HAVE_SIMD_NEON */ + +#ifdef HAVE_SIMD_X86_64 + +#ifdef HAVE_TYPE___M128I +#ifdef __GNUC__ +#pragma GCC push_options +#pragma GCC target ("sse4") +#endif /* __GNUC__ */ + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +#ifdef __clang__ +__attribute__((target("sse4.2"))) +#endif /* __clang__ */ +void convert_UTF8_to_JSON_simd_kernel_sse42(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) { + unsigned long beg = *_beg, pos = *_pos; + + if (escape_table == script_safe_escape_table) { + /* + * Again, this is basically a straight port of the ARM Neon version. 
+ */ + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i upper_bound = _mm_set1_epi8('~'); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + const __m128i forward_slash = _mm_set1_epi8('/'); + + while (pos+16 < len) { + __m128i chunk = _mm_loadu_si128((__m128i const*)&ptr[pos]); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i too_high = _mm_cmpgt_epu8(chunk, upper_bound); + + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i has_forward_slash = _mm_cmpeq_epi8(chunk, forward_slash); + + __m128i needs_escape = _mm_or_si128(too_low, too_high); + __m128i has_escaped_char = _mm_or_si128(has_forward_slash, _mm_or_si128(has_backslash, has_dblquote)); + needs_escape = _mm_or_si128(needs_escape, has_escaped_char); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + if (needs_escape_mask == 0) { + pos += 16; + continue; + } + + __m128i tmp = _mm_and_si128(too_low, _mm_set1_epi8(0x1)); + tmp = _mm_or_si128(tmp, _mm_and_si128(has_backslash, _mm_set1_epi8(0x2))); + tmp = _mm_or_si128(tmp, _mm_and_si128(has_dblquote, _mm_set1_epi8(0x4))); + tmp = _mm_or_si128(tmp, _mm_and_si128(has_forward_slash, _mm_set1_epi8(0x8))); + + uint8_t arr[16]; + _mm_storeu_si128((__m128i *) arr, tmp); + + for (int i = 0; i < 16; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; + break; + default: + ch_len = script_safe_escape_table[ch]; + } + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); + } + } + } else { + /* + * This is a straight port of the ARM Neon implementation to SSE4. This is + * likely not optimal for this instruction set. There is likely table lookup, + * shuffle, gather, blend, etc. 
instructions that may perform significantly + * better than what is implemented here. + */ + + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + + while (pos+16 < len) { + __m128i chunk = _mm_loadu_si128((__m128i const*)&ptr[pos]); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + pos += 16; + continue; + } + + for (int i = 0; i < 16; i++) { + int bit = needs_escape_mask & (1 << i); + unsigned char ch = ptr[pos]; + unsigned char ch_len = 0; + + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(bit)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; + } + } + } + } + + *_beg = beg; + *_pos = pos; +} + +#ifdef __GNUC__ +#pragma GCC pop_options +#endif /* __GNUC__ */ +#endif /* HAVE_TYPE___M128I */ + +#ifdef HAVE_TYPE___M256I +#ifdef __GNUC__ +#pragma GCC push_options +#pragma GCC target ("avx2") +#endif /* __GNUC__ */ + +#define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a) +#define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a) +#define _mm256_cmpgt_epu8(a, b) _mm256_xor_si256(_mm256_cmple_epu8(a, b), _mm256_set1_epi8(-1)) +#define _mm256_cmplt_epu8(a, b) _mm256_cmpgt_epu8(b, a) + +#ifdef __clang__ +__attribute__((target("avx2"))) +#endif /* __clang__ */ +void convert_UTF8_to_JSON_simd_kernel_avx2(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) { + unsigned long beg = *_beg, pos = *_pos; + + const __m256i lower_bound = _mm256_set1_epi8(' '); + const __m256i backslash = 
_mm256_set1_epi8('\\'); + const __m256i dblquote = _mm256_set1_epi8('\"'); + + if (escape_table == script_safe_escape_table) { + /* + * Again, this is basically a straight port of the ARM Neon version. + */ + const __m256i upper_bound = _mm256_set1_epi8('~'); + const __m256i forward_slash = _mm256_set1_epi8('/'); + + while (pos+32 < len) { + __m256i chunk = _mm256_loadu_si256((__m256i const*)&ptr[pos]); + __m256i too_low = _mm256_cmplt_epu8(chunk, lower_bound); + __m256i too_high = _mm256_cmpgt_epu8(chunk, upper_bound); + + __m256i has_backslash = _mm256_cmpeq_epi8(chunk, backslash); + __m256i has_dblquote = _mm256_cmpeq_epi8(chunk, dblquote); + __m256i has_forward_slash = _mm256_cmpeq_epi8(chunk, forward_slash); + + __m256i needs_escape = _mm256_or_si256(too_low, too_high); + __m256i has_escaped_char = _mm256_or_si256(has_forward_slash, _mm256_or_si256(has_backslash, has_dblquote)); + needs_escape = _mm256_or_si256(needs_escape, has_escaped_char); + + int needs_escape_mask = _mm256_movemask_epi8(needs_escape); + if (needs_escape_mask == 0) { + pos += 32; + continue; + } + + __m256i tmp = _mm256_and_si256(too_low, _mm256_set1_epi8(0x1)); + tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_backslash, _mm256_set1_epi8(0x2))); + tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_dblquote, _mm256_set1_epi8(0x4))); + tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_forward_slash, _mm256_set1_epi8(0x8))); + + uint8_t arr[32]; + _mm256_storeu_si256((__m256i *) arr, tmp); + + for (int i = 0; i < 32; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; + break; + default: + ch_len = script_safe_escape_table[ch]; + } + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); + } + } + } else { + /* + * This is a straight port of the ARM Neon implementation to SSE4. 
This is + * likely not optimal for this instruction set. There is likely table lookup, + * shuffle, gather, blend, etc. instructions that may perform significantly + * better than what is implemented here. + */ + while (pos+32 < len) { + __m256i chunk = _mm256_loadu_si256((__m256i const*)&ptr[pos]); + __m256i too_low = _mm256_cmplt_epu8(chunk, lower_bound); + __m256i has_backslash = _mm256_cmpeq_epi8(chunk, backslash); + __m256i has_dblquote = _mm256_cmpeq_epi8(chunk, dblquote); + __m256i needs_escape = _mm256_or_si256(too_low, _mm256_or_si256(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm256_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + pos += 32; + continue; + } + + for (int i = 0; i < 32; i++) { + unsigned int bit = (unsigned int)needs_escape_mask & (1u << i); /* unsigned shift: 1 << 31 overflows a signed int */ + unsigned char ch = ptr[pos]; + unsigned char ch_len = 0; + + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(bit)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; + } + } + } + } + *_beg = beg; + *_pos = pos; +} + +#ifdef __GNUC__ +#pragma GCC pop_options +#endif /* __GNUC__ */ + +#endif /* HAVE_TYPE___M256I */ + +#endif /* x86_64 support */ + + +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) +{ + const char *hexdig = "0123456789abcdef"; + char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; + + const char *ptr = RSTRING_PTR(str); + unsigned long len = RSTRING_LEN(str); + + unsigned long beg = 0, pos = 0; + + while (pos < len) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = escape_table[ch]; + /* JSON encoding */ + + PROCESS_BYTE; + } if (beg < len) { fbuffer_append(out_buffer, &ptr[beg], len - beg); @@ -254,6 +711,8 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un RB_GC_GUARD(str); } +#undef PROCESS_BYTE + static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = 
"0123456789abcdef"; @@ -264,91 +723,88 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons unsigned long beg = 0, pos = 0; -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; - while (pos < len) { unsigned char ch = ptr[pos]; unsigned char ch_len = escape_table[ch]; - if (RB_UNLIKELY(ch_len)) { - switch (ch_len) { - case 9: { - FLUSH_POS(1); - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: { - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - break; - } - } - break; - } - default: { - uint32_t wchar = 0; - ch_len = ch_len & CHAR_LENGTH_MASK; - - switch(ch_len) { - case 2: - wchar = ptr[pos] & 0x1F; - break; - case 3: - wchar = ptr[pos] & 0x0F; - break; - case 4: - wchar = ptr[pos] & 0x07; - break; - } - - for (short i = 1; i < ch_len; i++) { - wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); - } - - FLUSH_POS(ch_len); - - if (wchar <= 0xFFFF) { - scratch[2] = hexdig[wchar >> 12]; - scratch[3] = hexdig[(wchar >> 8) & 0xf]; - scratch[4] = hexdig[(wchar >> 4) & 0xf]; - scratch[5] = hexdig[wchar & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - } else { - uint16_t hi, lo; - wchar -= 0x10000; - hi = 0xD800 + (uint16_t)(wchar >> 10); - lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); - - scratch[2] = hexdig[hi >> 12]; - scratch[3] = hexdig[(hi >> 8) & 0xf]; - scratch[4] = hexdig[(hi >> 4) & 0xf]; - scratch[5] = hexdig[hi & 0xf]; - - scratch[8] = 
hexdig[lo >> 12]; - scratch[9] = hexdig[(lo >> 8) & 0xf]; - scratch[10] = hexdig[(lo >> 4) & 0xf]; - scratch[11] = hexdig[lo & 0xf]; - - fbuffer_append(out_buffer, scratch, 12); - } - - break; - } - } - } else { - pos++; - } + if (RB_UNLIKELY(ch_len)) { + switch (ch_len) { + case 9: { + FLUSH_POS(1); + switch (ch) { + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; + case '/': fbuffer_append(out_buffer, "\\/", 2); break; + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; + default: { + scratch[2] = '0'; + scratch[3] = '0'; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + break; + } + } + break; + } + default: { + uint32_t wchar = 0; + ch_len = ch_len & CHAR_LENGTH_MASK; + + switch(ch_len) { + case 2: + wchar = ptr[pos] & 0x1F; + break; + case 3: + wchar = ptr[pos] & 0x0F; + break; + case 4: + wchar = ptr[pos] & 0x07; + break; + } + + for (short i = 1; i < ch_len; i++) { + wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); + } + + FLUSH_POS(ch_len); + + if (wchar <= 0xFFFF) { + scratch[2] = hexdig[wchar >> 12]; + scratch[3] = hexdig[(wchar >> 8) & 0xf]; + scratch[4] = hexdig[(wchar >> 4) & 0xf]; + scratch[5] = hexdig[wchar & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + } else { + uint16_t hi, lo; + wchar -= 0x10000; + hi = 0xD800 + (uint16_t)(wchar >> 10); + lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); + + scratch[2] = hexdig[hi >> 12]; + scratch[3] = hexdig[(hi >> 8) & 0xf]; + scratch[4] = hexdig[(hi >> 4) & 0xf]; + scratch[5] = hexdig[hi & 0xf]; + + scratch[8] = hexdig[lo >> 12]; + scratch[9] = hexdig[(lo >> 8) & 0xf]; + scratch[10] = hexdig[(lo >> 4) & 0xf]; + scratch[11] = hexdig[lo & 0xf]; + + 
fbuffer_append(out_buffer, scratch, 12); + } + + break; + } + } + } else { + pos++; + } } -#undef FLUSH_POS if (beg < len) { fbuffer_append(out_buffer, &ptr[beg], len - beg); @@ -357,6 +813,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons RB_GC_GUARD(str); } +#undef FLUSH_POS + /* * Document-module: JSON::Ext::Generator * @@ -912,7 +1370,7 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat if (RB_UNLIKELY(state->ascii_only)) { convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table); } else { - convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); + convert_UTF8_to_JSON_impl(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); } break; default: @@ -1670,4 +2128,28 @@ void Init_generator(void) binary_encindex = rb_ascii8bit_encindex(); rb_require("json/ext/generator/state"); -} + + // TODO ADD RUNTIME CHECKS HERE? 
+ switch(find_simd_implementation()) { +#ifdef HAVE_SIMD_NEON + case SIMD_NEON: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd; + convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_neon; + break; +#endif +#ifdef HAVE_SIMD_X86_64 + case SIMD_SSE42: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd; + convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_sse42; + break; +#ifdef HAVE_TYPE___M256I + case SIMD_AVX2: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd; + convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_avx2; + break; +#endif /* HAVE_TYPE___M256I */ +#endif + default: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON; + } +} \ No newline at end of file diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h new file mode 100644 index 00000000..352c0b6f --- /dev/null +++ b/ext/json/ext/generator/simd.h @@ -0,0 +1,71 @@ +#include "extconf.h" + +typedef enum { + SIMD_NONE, + SIMD_NEON, + SIMD_SSE42, + SIMD_AVX2 +} SIMD_Implementation; + +#ifdef ENABLE_SIMD + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) +#include <arm_neon.h> + +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 +SIMD_Implementation find_simd_implementation(void) { + return SIMD_NEON; +} + +#define HAVE_SIMD_NEON 1 + +#ifdef HAVE_TYPE_UINT8X16_T + +#endif /* HAVE_TYPE_UINT8X16_T */ +#endif /* ARM Neon Support.*/ + +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + +#define HAVE_SIMD_X86_64 1 +#ifdef HAVE_X86INTRIN_H +#include <x86intrin.h> + +#define HAVE_SIMD_X86_64 1 + +/* Always defined here: the function below exists whether or not cpuid.h does, so the SIMD_NONE fallback must be skipped. */ +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 +#ifdef HAVE_CPUID_H +#include <cpuid.h> +#endif + +SIMD_Implementation find_simd_implementation(void) { + +#if defined(__GNUC__ ) || defined(__clang__) +#ifdef __GNUC__ + __builtin_cpu_init(); +#endif /* __GNUC__ */ + +#ifdef HAVE_TYPE___M256I + if(__builtin_cpu_supports("avx2")) { + return SIMD_AVX2; + } +#endif 
/* #ifdef HAVE_TYPE___M256I */ + + // TODO Revisit. I think the SSE version now only uses SSE2 instructions. + if (__builtin_cpu_supports("sse4.2")) { + return SIMD_SSE42; + } +#endif /* __GNUC__ || __clang__*/ + + return SIMD_NONE; +} + +#endif /* HAVE_X86INTRIN_H */ +#endif /* X86_64 Support */ +#endif /* ENABLE_SIMD */ + +#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED +SIMD_Implementation find_simd_implementation(void) { + return SIMD_NONE; +} +#endif \ No newline at end of file diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index 8dd3913d..c2156325 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -424,6 +424,10 @@ def test_backslash json = '["\\\\.(?i:gif|jpe?g|png)$"]' assert_equal json, generate(data) # + data = [ '\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)$' ] + json = '["\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)$"]' + assert_equal json, generate(data) + # data = [ '\\"' ] json = '["\\\\\""]' assert_equal json, generate(data) @@ -432,10 +436,22 @@ def test_backslash json = '["/"]' assert_equal json, generate(data) # + data = [ '////////////////////////////////////////////////////////////////////////////////////' ] + json = '["////////////////////////////////////////////////////////////////////////////////////"]' + assert_equal json, generate(data) + # data = [ '/' ] json = '["\/"]' assert_equal json, generate(data, :script_safe => true) # + data = [ '///////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # + data = [ '///////////////////////////////////////////////////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "\u2028\u2029" ] json = 
'["\u2028\u2029"]' assert_equal json, generate(data, :script_safe => true) @@ -444,10 +460,18 @@ def test_backslash json = '["ABC \u2028 DEF \u2029 GHI"]' assert_equal json, generate(data, :script_safe => true) # + data = [ "ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI" ] + json = '["ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "/\u2028\u2029" ] json = '["\/\u2028\u2029"]' assert_equal json, generate(data, :escape_slash => true) # + data = [ "/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029" ] + json = '["\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029"]' + assert_equal json, generate(data, :escape_slash => true) + # data = ['"'] json = '["\""]' assert_equal json, generate(data) @@ -459,6 +483,14 @@ def test_backslash data = ["倩", "瀨"] json = '["倩","瀨"]' assert_equal json, generate(data, script_safe: true) + # + data = ["倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨"] + json = '["倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨"]' + assert_equal json, generate(data, script_safe: true) + # + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal json, generate(data) end def test_string_subclass