From 8c268cdd78f2846979e88f27f2cdab3109b62be0 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Thu, 16 Jan 2025 21:24:02 -0600 Subject: [PATCH 01/11] WIP ARM Neon SIMD implementation. --- benchmark/encoder-simple.rb | 58 ++++ ext/json/ext/generator/extconf.rb | 12 + ext/json/ext/generator/generator.c | 425 ++++++++++++++++++++--------- 3 files changed, 368 insertions(+), 127 deletions(-) create mode 100644 benchmark/encoder-simple.rb diff --git a/benchmark/encoder-simple.rb b/benchmark/encoder-simple.rb new file mode 100644 index 000000000..cf3d380c3 --- /dev/null +++ b/benchmark/encoder-simple.rb @@ -0,0 +1,58 @@ +require "benchmark/ips" +require "json" +require "date" +require "oj" + +Oj.default_options = Oj.default_options.merge(mode: :compat) + +if ENV["ONLY"] + RUN = ENV["ONLY"].split(/[,: ]/).map{|x| [x.to_sym, true] }.to_h + RUN.default = false +elsif ENV["EXCEPT"] + RUN = ENV["EXCEPT"].split(/[,: ]/).map{|x| [x.to_sym, false] }.to_h + RUN.default = true +else + RUN = Hash.new(true) +end + +def implementations(ruby_obj) + state = JSON::State.new(JSON.dump_default_options) + { + json: ["json", proc { JSON.generate(ruby_obj) }], + oj: ["oj", proc { Oj.dump(ruby_obj) }], + } +end + +def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: []) + json_output = JSON.dump(ruby_obj) + puts "== Encoding #{benchmark_name} (#{json_output.bytesize} bytes)" + + impls = implementations(ruby_obj).select { |name| RUN[name] } + except.each { |i| impls.delete(i) } + + Benchmark.ips do |x| + expected = ::JSON.dump(ruby_obj) if check_expected + impls.values.each do |name, block| + begin + result = block.call + if check_expected && expected != result + puts "#{name} does not match expected output. Skipping" + puts "Expected:" + '-' * 40 + puts expected + puts "Actual:" + '-' * 40 + puts result + puts '-' * 40 + next + end + rescue => error + puts "#{name} unsupported (#{error})" + next + end + x.report(name, &block) + end + x.compare!(order: :baseline) + end + puts +end + +benchmark_encoding "long string", (["this is a test of the emergency broadcast system."*5]*500) \ No newline at end of file diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 078068cf6..cd090ca52 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -6,5 +6,17 @@ else append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" + + if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + # Try to compile a small program using NEON instructions + have_header('arm_neon.h') && try_compile(<<~'END_SRC') + #include + int main() { + uint8x16_t test = vdupq_n_u8(32); + return 0; + } + END_SRC + end + create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 5006b7853..fb1af9e7f 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,6 +4,10 @@ #include #include +#ifdef HAVE_ARM_NEON_H +#include +#endif + /* ruby api and some helpers */ typedef struct JSON_Generator_StateStruct { @@ -179,7 +183,56 @@ static const unsigned char script_safe_escape_table[256] = { * Everything else (should be UTF-8) is just passed through and * appended to the result. */ -static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) +#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; +#define PROCESS_BYTE if (RB_UNLIKELY(ch_len)) { \ + switch (ch_len) { \ + case 9: { \ + FLUSH_POS(1); \ + switch (ch) { \ + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; \ + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; \ + case '/': fbuffer_append(out_buffer, "\\/", 2); break; \ + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; \ + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; \ + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; \ + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; \ + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; \ + default: { \ + scratch[2] = '0'; \ + scratch[3] = '0'; \ + scratch[4] = hexdig[(ch >> 4) & 0xf]; \ + scratch[5] = hexdig[ch & 0xf]; \ + fbuffer_append(out_buffer, scratch, 6); \ + break; \ + } \ + } \ + break; \ + } \ + case 11: { \ + unsigned char b2 = ptr[pos + 1]; \ + if (RB_UNLIKELY(b2 == 0x80)) { \ + unsigned char b3 = ptr[pos + 2]; \ + if (b3 == 0xA8) { \ + FLUSH_POS(3); \ + fbuffer_append(out_buffer, "\\u2028", 6); \ + break; \ + } else if (b3 == 0xA9) { \ + FLUSH_POS(3); \ + fbuffer_append(out_buffer, "\\u2029", 6); \ + break; \ + } \ + } \ + ch_len = 3; \ + } \ + default: \ + pos += ch_len; \ + break; \ + } \ + } else { \ + pos++; \ + } + +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -189,63 +242,142 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un unsigned long beg = 0, pos = 0; -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; +#ifdef HAVE_ARM_NEON_H + const uint8x16_t lower_bound = vdupq_n_u8(32); + const uint8x16_t backslash = vdupq_n_u8(92); + const uint8x16_t dblquote = vdupq_n_u8(34); + + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + + uint8x16_t invalid = too_low; + uint8x16_t has_escaped_char = vorrq_u8(has_backslash, has_dblquote); + + invalid = vorrq_u8(invalid, has_escaped_char); + + if (vmaxvq_u8(invalid) == 0) { + pos += 16; + continue; + } + + uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1)); + tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2))); + tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4))); + + uint8_t arr[16]; + vst1q_u8(arr, tmp); + for (int i = 0; i < 16; i++) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; + } + } + } +#endif while (pos < len) { unsigned char ch = ptr[pos]; unsigned char ch_len = escape_table[ch]; /* JSON encoding */ - if (RB_UNLIKELY(ch_len)) { - switch (ch_len) { - case 9: { - FLUSH_POS(1); - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: { - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - break; - } - } + PROCESS_BYTE + } + + if (beg < len) { + fbuffer_append(out_buffer, &ptr[beg], len - beg); + } + + RB_GC_GUARD(str); +} + +static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) +{ + const char *hexdig = "0123456789abcdef"; + char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; + + const char *ptr = RSTRING_PTR(str); + unsigned long len = RSTRING_LEN(str); + + unsigned long beg = 0, pos = 0; + +#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; + +#ifdef HAVE_ARM_NEON_H + const uint8x16_t lower_bound = vdupq_n_u8(32); + const uint8x16_t upper_bound = vdupq_n_u8(126); + const uint8x16_t backslash = vdupq_n_u8(92); + const uint8x16_t dblquote_slash = vdupq_n_u8(34); + const uint8x16_t forward_slash = vdupq_n_u8(47); + + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote_slash); + uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); + + uint8x16_t invalid = vorrq_u8(too_low, too_high); + uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); + + invalid = vorrq_u8(invalid, has_escaped_char); + + if (vmaxvq_u8(invalid) == 0) { + pos += 16; + continue; + } + + uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1)); + tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2))); + tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4))); + tmp = vorrq_u8(tmp, vandq_u8(has_forward_slash, vdupq_n_u8(0x8))); + + uint8_t arr[16]; + vst1q_u8(arr, tmp); + for (int i = 0; i < 16; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; break; - } - case 11: { - unsigned char b2 = ptr[pos + 1]; - if (RB_UNLIKELY(b2 == 0x80)) { - unsigned char b3 = ptr[pos + 2]; - if (b3 == 0xA8) { - FLUSH_POS(3); - fbuffer_append(out_buffer, "\\u2028", 6); - break; - } else if (b3 == 0xA9) { - FLUSH_POS(3); - fbuffer_append(out_buffer, "\\u2029", 6); - break; - } - } - ch_len = 3; - // fallthrough - } default: - pos += ch_len; - break; + ch_len = script_safe_escape_table[ch]; } - } else { - pos++; + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); } } -#undef FLUSH_POS +#endif + + while (pos < len) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = script_safe_escape_table[ch]; + /* JSON encoding */ + + PROCESS_BYTE; + } if (beg < len) { fbuffer_append(out_buffer, &ptr[beg], len - beg); @@ -254,6 +386,86 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un RB_GC_GUARD(str); } +#undef PROCESS_BYTE + +#define PROCESS_BYTE if (RB_UNLIKELY(ch_len)) { \ + switch (ch_len) { \ + case 9: { \ + FLUSH_POS(1); \ + switch (ch) { \ + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; \ + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; \ + case '/': fbuffer_append(out_buffer, "\\/", 2); break; \ + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; \ + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; \ + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; \ + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; \ + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; \ + default: { \ + scratch[2] = '0'; \ + scratch[3] = '0'; \ + scratch[4] = hexdig[(ch >> 4) & 0xf]; \ + scratch[5] = hexdig[ch & 0xf]; \ + fbuffer_append(out_buffer, scratch, 6); \ + break; \ + } \ + } \ + break; \ + } \ + default: { \ + uint32_t wchar = 0; \ + ch_len = ch_len & CHAR_LENGTH_MASK; \ + \ + switch(ch_len) { \ + case 2: \ + wchar = ptr[pos] & 0x1F; \ + break; \ + case 3: \ + wchar = ptr[pos] & 0x0F; \ + break; \ + case 4: \ + wchar = ptr[pos] & 0x07; \ + break; \ + } \ + \ + for (short i = 1; i < ch_len; i++) { \ + wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); \ + } \ + \ + FLUSH_POS(ch_len); \ + \ + if (wchar <= 0xFFFF) { \ + scratch[2] = hexdig[wchar >> 12]; \ + scratch[3] = hexdig[(wchar >> 8) & 0xf]; \ + scratch[4] = hexdig[(wchar >> 4) & 0xf]; \ + scratch[5] = hexdig[wchar & 0xf]; \ + fbuffer_append(out_buffer, scratch, 6); \ + } else { \ + uint16_t hi, lo; \ + wchar -= 0x10000; \ + hi = 0xD800 + (uint16_t)(wchar >> 10); \ + lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); \ + \ + scratch[2] = hexdig[hi >> 12]; \ + scratch[3] = hexdig[(hi >> 8) & 0xf]; \ + scratch[4] = hexdig[(hi >> 4) & 0xf]; \ + scratch[5] = hexdig[hi & 0xf]; \ + \ + scratch[8] = hexdig[lo >> 12]; \ + scratch[9] = hexdig[(lo >> 8) & 0xf]; \ + scratch[10] = hexdig[(lo >> 4) & 0xf]; \ + scratch[11] = hexdig[lo & 0xf]; \ + \ + fbuffer_append(out_buffer, scratch, 12); \ + } \ + \ + break; \ + } \ + } \ + } else { \ + pos++; \ + } + static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = "0123456789abcdef"; @@ -264,91 +476,43 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons unsigned long beg = 0, pos = 0; -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; +#ifdef HAVE_ARM_NEON_H + const uint8x16_t lower_bound = vdupq_n_u8(32); + const uint8x16_t upper_bound = vdupq_n_u8(126); + const uint8x16_t backslash = vdupq_n_u8(92); // '\\' + const uint8x16_t dblquote = vdupq_n_u8(34); // '"' + + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + uint8x16_t invalid = vorrq_u8(too_low, too_high); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t has_escape_char = vorrq_u8(has_backslash, has_dblquote); + + if (escape_table == script_safe_escape_table) { + uint8x16_t forward_slash = vdupq_n_u8('/'); + uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); + has_escape_char = vorrq_u8(has_escape_char, has_forward_slash); + invalid = vorrq_u8(invalid, has_escape_char); + } + + if (vmaxvq_u8(invalid) != 0) { + break; + } + + pos += 16; + } +#endif while (pos < len) { unsigned char ch = ptr[pos]; unsigned char ch_len = escape_table[ch]; - if (RB_UNLIKELY(ch_len)) { - switch (ch_len) { - case 9: { - FLUSH_POS(1); - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: { - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - break; - } - } - break; - } - default: { - uint32_t wchar = 0; - ch_len = ch_len & CHAR_LENGTH_MASK; - - switch(ch_len) { - case 2: - wchar = ptr[pos] & 0x1F; - break; - case 3: - wchar = ptr[pos] & 0x0F; - break; - case 4: - wchar = ptr[pos] & 0x07; - break; - } - - for (short i = 1; i < ch_len; i++) { - wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); - } - - FLUSH_POS(ch_len); - - if (wchar <= 0xFFFF) { - scratch[2] = hexdig[wchar >> 12]; - scratch[3] = hexdig[(wchar >> 8) & 0xf]; - scratch[4] = hexdig[(wchar >> 4) & 0xf]; - scratch[5] = hexdig[wchar & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - } else { - uint16_t hi, lo; - wchar -= 0x10000; - hi = 0xD800 + (uint16_t)(wchar >> 10); - lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); - - scratch[2] = hexdig[hi >> 12]; - scratch[3] = hexdig[(hi >> 8) & 0xf]; - scratch[4] = hexdig[(hi >> 4) & 0xf]; - scratch[5] = hexdig[hi & 0xf]; - - scratch[8] = hexdig[lo >> 12]; - scratch[9] = hexdig[(lo >> 8) & 0xf]; - scratch[10] = hexdig[(lo >> 4) & 0xf]; - scratch[11] = hexdig[lo & 0xf]; - - fbuffer_append(out_buffer, scratch, 12); - } - - break; - } - } - } else { - pos++; - } + PROCESS_BYTE } -#undef FLUSH_POS if (beg < len) { fbuffer_append(out_buffer, &ptr[beg], len - beg); @@ -357,6 +521,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons RB_GC_GUARD(str); } +#undef FLUSH_POS + /* * Document-module: JSON::Ext::Generator * @@ -912,7 +1078,12 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat if (RB_UNLIKELY(state->ascii_only)) { convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table); } else { - convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); + if (state->script_safe) { + convert_UTF8_to_JSON_script_safe(buffer, obj); + } + else { + convert_UTF8_to_JSON(buffer, obj); + } } break; default: From d8eed56bc5098a7dbc9d91b652f09bc856580ef9 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Fri, 17 Jan 2025 06:55:15 -0600 Subject: [PATCH 02/11] Removed unnecessary instructions from the Neon portion of convert_UTF8_to_JSON. It doesn't matter which case is hit when a byte needs to be escaped. In that case, remove the vorr_q chain and simply use the combined 'needs_escape' vector. --- ext/json/ext/generator/generator.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index fb1af9e7f..698990f24 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -254,22 +254,18 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) uint8x16_t has_backslash = vceqq_u8(chunk, backslash); uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - uint8x16_t invalid = too_low; + uint8x16_t needs_escape = too_low; uint8x16_t has_escaped_char = vorrq_u8(has_backslash, has_dblquote); - invalid = vorrq_u8(invalid, has_escaped_char); + needs_escape = vorrq_u8(needs_escape, has_escaped_char); - if (vmaxvq_u8(invalid) == 0) { + if (vmaxvq_u8(needs_escape) == 0) { pos += 16; continue; } - uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1)); - tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2))); - tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4))); - uint8_t arr[16]; - vst1q_u8(arr, tmp); + vst1q_u8(arr, needs_escape); for (int i = 0; i < 16; i++) { unsigned char ch = ptr[pos]; unsigned char ch_len = arr[i]; From 9beedacf2f7160b161d5faa898bbff47bedc0998 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 09:25:45 -0600 Subject: [PATCH 03/11] Added documentation and a bit of refactoring. --- ext/json/ext/generator/generator.c | 122 ++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 35 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 698990f24..a6890c1d1 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,6 +4,8 @@ #include #include +#include "extconf.h" + #ifdef HAVE_ARM_NEON_H #include #endif @@ -243,21 +245,59 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) unsigned long beg = 0, pos = 0; #ifdef HAVE_ARM_NEON_H - const uint8x16_t lower_bound = vdupq_n_u8(32); - const uint8x16_t backslash = vdupq_n_u8(92); - const uint8x16_t dblquote = vdupq_n_u8(34); + /* + * The code below implements an SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanination will be limited to the first 8 bytes of the string for simplicity. However + * the vector insructions may work on larger vectors. + * + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote" in vector registers. + * + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + * + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + * + * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + * + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + * + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + * + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + * + * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of + * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be + * escaped. + * + * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and + * we advance pos by the width of the vector. + * + * To determine how to escape characters, we look at each value in the needs_escape vector and take + * the appropriate action. + */ + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); while (pos+16 < len) { - uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - - uint8x16_t needs_escape = too_low; - uint8x16_t has_escaped_char = vorrq_u8(has_backslash, has_dblquote); - - needs_escape = vorrq_u8(needs_escape, has_escaped_char); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); if (vmaxvq_u8(needs_escape) == 0) { pos += 16; @@ -309,25 +349,37 @@ static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) #define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; #ifdef HAVE_ARM_NEON_H - const uint8x16_t lower_bound = vdupq_n_u8(32); - const uint8x16_t upper_bound = vdupq_n_u8(126); - const uint8x16_t backslash = vdupq_n_u8(92); - const uint8x16_t dblquote_slash = vdupq_n_u8(34); - const uint8x16_t forward_slash = vdupq_n_u8(47); + /* + * This works almost exactly the same as what is described above. The difference in this case comes after we know + * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, + * some bytes need to be handled differently. + * + * Since we know each byte in chunk can only match a single case, we logical AND each of the has_backslash, + * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2 and 0x4 respectively) and combine + * the results with a logical OR. + * + * Now we loop over the result vector and switch on the particular pattern we just created. If we find a + * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct + * action. + */ + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t upper_bound = vdupq_n_u8('~'); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); + const uint8x16_t forward_slash = vdupq_n_u8('/'); while (pos+16 < len) { - uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); - uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote_slash); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); - uint8x16_t invalid = vorrq_u8(too_low, too_high); - uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); - - invalid = vorrq_u8(invalid, has_escaped_char); + uint8x16_t invalid = vorrq_u8(too_low, too_high); + uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); + invalid = vorrq_u8(invalid, has_escaped_char); if (vmaxvq_u8(invalid) == 0) { pos += 16; @@ -473,19 +525,19 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons unsigned long beg = 0, pos = 0; #ifdef HAVE_ARM_NEON_H - const uint8x16_t lower_bound = vdupq_n_u8(32); - const uint8x16_t upper_bound = vdupq_n_u8(126); - const uint8x16_t backslash = vdupq_n_u8(92); // '\\' - const uint8x16_t dblquote = vdupq_n_u8(34); // '"' + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t upper_bound = vdupq_n_u8('~'); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); while (pos+16 < len) { - uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); - uint8x16_t invalid = vorrq_u8(too_low, too_high); + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + uint8x16_t invalid = vorrq_u8(too_low, too_high); uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); uint8x16_t has_escape_char = vorrq_u8(has_backslash, has_dblquote); if (escape_table == script_safe_escape_table) { From 6b6ff88ee38a5b29281d6e8f07532409d9e62d3a Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sat, 18 Jan 2025 15:35:21 +0000 Subject: [PATCH 04/11] No extconf.h --- ext/json/ext/generator/generator.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index a6890c1d1..74bbd439c 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,8 +4,6 @@ #include #include -#include "extconf.h" - #ifdef HAVE_ARM_NEON_H #include #endif From edf90b84396d80852b752acf3e14fb01e4341280 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 11:07:57 -0600 Subject: [PATCH 05/11] Make SIMD configurable via the --enable-use-simd or --disable-use-simd flags. These can be set with the JSON_GENERATOR_CONFIGURE_OPTS environment variable prior to running rake. Additionally, set the stage for different SIMD implementations. --- Rakefile | 2 +- ext/json/ext/generator/extconf.h | 8 ++ ext/json/ext/generator/extconf.rb | 33 +++++--- ext/json/ext/generator/generator.c | 126 ++++++++++++++--------------- ext/json/ext/generator/simd.h | 40 +++++++++ 5 files changed, 136 insertions(+), 73 deletions(-) create mode 100644 ext/json/ext/generator/extconf.h create mode 100644 ext/json/ext/generator/simd.h diff --git a/Rakefile b/Rakefile index 1e68d2aed..8f2f97f1e 100644 --- a/Rakefile +++ b/Rakefile @@ -86,7 +86,7 @@ end file EXT_GENERATOR_DL => EXT_GENERATOR_SRC do cd EXT_GENERATOR_DIR do - ruby 'extconf.rb' + ruby "extconf.rb #{ENV['JSON_GENERATOR_CONFIGURE_OPTS']}" sh MAKE end cp "#{EXT_GENERATOR_DIR}/generator.#{CONFIG['DLEXT']}", EXT_ROOT_DIR diff --git a/ext/json/ext/generator/extconf.h b/ext/json/ext/generator/extconf.h new file mode 100644 index 000000000..576a3ee85 --- /dev/null +++ b/ext/json/ext/generator/extconf.h @@ -0,0 +1,8 @@ +#ifndef EXTCONF_H +#define EXTCONF_H +#define JSON_GENERATOR 1 +#define ENABLE_SIMD 1 +#define HAVE_ARM_NEON_H 1 +#define HAVE_TYPE_UINT8X16_T 1 +#define HAVE_TYPE_UINT8X8_T 1 +#endif diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index cd090ca52..ce5163be2 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -7,16 +7,31 @@ append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" - if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ - # Try to compile a small program using NEON instructions - have_header('arm_neon.h') && try_compile(<<~'END_SRC') - #include - int main() { - uint8x16_t test = vdupq_n_u8(32); - return 0; - } - END_SRC + if enable_config('use-simd', default=true) + $defs.push("-DENABLE_SIMD") + + if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + # Try to compile a small program using NEON instructions + if have_header('arm_neon.h') + have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include + int main() { + uint8x16_t test = vdupq_n_u8(32); + return 0; + } + SRC + + have_type('uint8x8_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') + #include + int main() { + uint8x8_t test = vdup_n_u8(32); + return 0; + } + SRC + end + end end + create_header create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 74bbd439c..5441099e0 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,9 +4,7 @@ #include #include -#ifdef HAVE_ARM_NEON_H -#include -#endif +#include "simd.h" /* ruby api and some helpers */ @@ -286,24 +284,25 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) * To determine how to escape characters, we look at each value in the needs_escape vector and take * the appropriate action. */ - const uint8x16_t lower_bound = vdupq_n_u8(' '); - const uint8x16_t backslash = vdupq_n_u8('\\'); - const uint8x16_t dblquote = vdupq_n_u8('\"'); - - while (pos+16 < len) { - uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); - - if (vmaxvq_u8(needs_escape) == 0) { - pos += 16; + + const simd_vec_type lower_bound = simd_vec_from_byte(' '); + const simd_vec_type backslash = simd_vec_from_byte('\\'); + const simd_vec_type dblquote = simd_vec_from_byte('\"'); + + while (pos+SIMD_VEC_STRIDE < len) { + simd_vec_type chunk = simd_vec_load_from_mem((const uint8_t*)&ptr[pos]); + simd_vec_type too_low = simd_vec_lt(chunk, lower_bound); + simd_vec_type has_backslash = simd_vec_eq(chunk, backslash); + simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); + simd_vec_type needs_escape = simd_vec_or(too_low, simd_vec_or(has_backslash, has_dblquote)); + + if (simd_vec_max(needs_escape) == 0) { + pos += SIMD_VEC_STRIDE; continue; } - uint8_t arr[16]; - vst1q_u8(arr, needs_escape); + uint8_t arr[SIMD_VEC_STRIDE]; + simd_vec_to_memory(arr, needs_escape); for (int i = 0; i < 16; i++) { unsigned char ch = ptr[pos]; unsigned char ch_len = arr[i]; @@ -317,6 +316,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) } } } + #endif while (pos < len) { @@ -360,37 +360,37 @@ static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct * action. */ - const uint8x16_t lower_bound = vdupq_n_u8(' '); - const uint8x16_t upper_bound = vdupq_n_u8('~'); - const uint8x16_t backslash = vdupq_n_u8('\\'); - const uint8x16_t dblquote = vdupq_n_u8('\"'); - const uint8x16_t forward_slash = vdupq_n_u8('/'); - - while (pos+16 < len) { - uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); - - uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); - - uint8x16_t invalid = vorrq_u8(too_low, too_high); - uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); - invalid = vorrq_u8(invalid, has_escaped_char); - - if (vmaxvq_u8(invalid) == 0) { - pos += 16; + const simd_vec_type lower_bound = simd_vec_from_byte(' '); + const simd_vec_type upper_bound = simd_vec_from_byte('~'); + const simd_vec_type backslash = simd_vec_from_byte('\\'); + const simd_vec_type dblquote = simd_vec_from_byte('\"'); + const simd_vec_type forward_slash = simd_vec_from_byte('/'); + + while (pos+SIMD_VEC_STRIDE < len) { + simd_vec_type chunk = simd_vec_load_from_mem((const uint8_t*)&ptr[pos]); + simd_vec_type too_low = simd_vec_lt(chunk, lower_bound); + simd_vec_type too_high = simd_vec_gt(chunk, upper_bound); + + simd_vec_type has_backslash = simd_vec_eq(chunk, backslash); + simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); + simd_vec_type has_forward_slash = simd_vec_eq(chunk, forward_slash); + + simd_vec_type invalid = simd_vec_or(too_low, too_high); + simd_vec_type has_escaped_char = simd_vec_or(has_forward_slash, simd_vec_or(has_backslash, has_dblquote)); + invalid = simd_vec_or(invalid, has_escaped_char); + + if (simd_vec_max(invalid) == 0) { + pos += SIMD_VEC_STRIDE; continue; } - uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1)); - tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2))); - tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4))); - tmp = vorrq_u8(tmp, vandq_u8(has_forward_slash, vdupq_n_u8(0x8))); + simd_vec_type tmp = simd_vec_and(too_low, simd_vec_from_byte(0x1)); + tmp = simd_vec_or(tmp, simd_vec_and(has_backslash, simd_vec_from_byte(0x2))); + tmp = simd_vec_or(tmp, simd_vec_and(has_dblquote, simd_vec_from_byte(0x4))); + tmp = simd_vec_or(tmp, simd_vec_and(has_forward_slash, simd_vec_from_byte(0x8))); - uint8_t arr[16]; - vst1q_u8(arr, tmp); + uint8_t arr[SIMD_VEC_STRIDE]; + simd_vec_to_memory(arr, tmp); for (int i = 0; i < 16; ) { unsigned long start = pos; unsigned char ch = ptr[pos]; @@ -523,33 +523,33 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons unsigned long beg = 0, pos = 0; #ifdef HAVE_ARM_NEON_H - const uint8x16_t lower_bound = vdupq_n_u8(' '); - const uint8x16_t upper_bound = vdupq_n_u8('~'); - const uint8x16_t backslash = vdupq_n_u8('\\'); - const uint8x16_t dblquote = vdupq_n_u8('\"'); + const simd_vec_type lower_bound = simd_vec_from_byte(' '); + const simd_vec_type upper_bound = simd_vec_from_byte('~'); + const simd_vec_type backslash = simd_vec_from_byte('\\'); + const simd_vec_type dblquote = simd_vec_from_byte('\"'); - while (pos+16 < len) { - uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); - uint8x16_t too_low = vcltq_u8(chunk, lower_bound); - uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); - uint8x16_t invalid = vorrq_u8(too_low, too_high); + while (pos+SIMD_VEC_STRIDE < len) { + simd_vec_type chunk = simd_vec_load_from_mem((const uint8_t*)&ptr[pos]); + simd_vec_type too_low = simd_vec_lt(chunk, lower_bound); + simd_vec_type too_high = simd_vec_gt(chunk, upper_bound); + simd_vec_type invalid = simd_vec_or(too_low, too_high); - uint8x16_t has_backslash = vceqq_u8(chunk, backslash); - uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); - uint8x16_t has_escape_char = vorrq_u8(has_backslash, has_dblquote); + simd_vec_type has_backslash = simd_vec_eq(chunk, backslash); + simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); + simd_vec_type has_escape_char = simd_vec_or(has_backslash, has_dblquote); if (escape_table == script_safe_escape_table) { - uint8x16_t forward_slash = vdupq_n_u8('/'); - uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); - has_escape_char = vorrq_u8(has_escape_char, has_forward_slash); - invalid = vorrq_u8(invalid, has_escape_char); + simd_vec_type forward_slash = simd_vec_from_byte('/'); + simd_vec_type has_forward_slash = simd_vec_eq(chunk, forward_slash); + has_escape_char = simd_vec_or(has_escape_char, has_forward_slash); + invalid = simd_vec_or(invalid, has_escape_char); } - if (vmaxvq_u8(invalid) != 0) { + if (simd_vec_max(invalid) != 0) { break; } - pos += 16; + pos += SIMD_VEC_STRIDE; } #endif diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h new file mode 100644 index 000000000..becbd66b8 --- /dev/null +++ b/ext/json/ext/generator/simd.h @@ -0,0 +1,40 @@ +#include "extconf.h" + +#ifdef ENABLE_SIMD + +#ifdef HAVE_ARM_NEON_H +#include + +#ifdef HAVE_TYPE_UINT8X16_T + +#define SIMD_VEC_STRIDE 16 + +#define simd_vec_type uint8x16_t +#define simd_vec_from_byte vdupq_n_u8 +#define simd_vec_load_from_mem vld1q_u8 +#define simd_vec_to_memory vst1q_u8 +#define simd_vec_eq vceqq_u8 +#define simd_vec_lt vcltq_u8 +#define simd_vec_gt vcgtq_u8 +#define simd_vec_or vorrq_u8 +#define simd_vec_and vandq_u8 +#define simd_vec_max vmaxvq_u8 + +#elif HAVE_TYPE_UINT8X8_T + +#define SIMD_VEC_STRIDE 8 +#define simd_vec_type uint8x8_t +#define simd_vec_from_byte vdup_n_u8 +#define simd_vec_load_from_mem vld1_u8 +#define simd_vec_to_memory vst1_u8 +#define simd_vec_eq vceq_u8 +#define simd_vec_lt vclt_u8 +#define simd_vec_gt vcgt_u8 +#define simd_vec_or vorr_u8 +#define simd_vec_and vand_u8 +#define simd_vec_max vmaxv_u8 + +#endif /* HAVE_TYPE_UINT8X16_T */ +#endif /* HAVE_ARM_NEON_H */ + +#endif /* ENABLE_SIMD */ \ No newline at end of file From 21d52323610bd320e31ee770c97afb80ad4bb85b Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 11:35:32 -0600 Subject: [PATCH 06/11] Replace HAVE_ARM_NEON_H with ENABLE_SIMD. --- ext/json/ext/generator/generator.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 5441099e0..94cb12bd5 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -240,7 +240,7 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) unsigned long beg = 0, pos = 0; -#ifdef HAVE_ARM_NEON_H +#ifdef ENABLE_SIMD /* * The code below implements an SIMD-based algorithm to determine if N bytes at a time * need to be escaped. @@ -346,7 +346,7 @@ static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) #define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; -#ifdef HAVE_ARM_NEON_H +#ifdef ENABLE_SIMD /* * This works almost exactly the same as what is described above. The difference in this case comes after we know * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, @@ -522,7 +522,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons unsigned long beg = 0, pos = 0; -#ifdef HAVE_ARM_NEON_H +#ifdef ENABLE_SIMD const simd_vec_type lower_bound = simd_vec_from_byte(' '); const simd_vec_type upper_bound = simd_vec_from_byte('~'); const simd_vec_type backslash = simd_vec_from_byte('\\'); From 4a04a911843a98e2c36aac7905cb4120c5f1c0a1 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 23:37:20 -0600 Subject: [PATCH 07/11] Initial x86 support. --- ext/json/ext/generator/extconf.h | 5 +- ext/json/ext/generator/extconf.rb | 29 ++++++- ext/json/ext/generator/generator.c | 20 +++-- ext/json/ext/generator/simd.h | 135 ++++++++++++++++++++++++++++- 4 files changed, 177 insertions(+), 12 deletions(-) diff --git a/ext/json/ext/generator/extconf.h b/ext/json/ext/generator/extconf.h index 576a3ee85..4bb9c6cca 100644 --- a/ext/json/ext/generator/extconf.h +++ b/ext/json/ext/generator/extconf.h @@ -1,8 +1,7 @@ #ifndef EXTCONF_H #define EXTCONF_H #define JSON_GENERATOR 1 +#define HAVE_X86INTRIN_H 1 +#define HAVE_TYPE___M128I 1 #define ENABLE_SIMD 1 -#define HAVE_ARM_NEON_H 1 -#define HAVE_TYPE_UINT8X16_T 1 -#define HAVE_TYPE_UINT8X8_T 1 #endif diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index ce5163be2..8c2737f04 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -8,9 +8,9 @@ $defs << "-DJSON_GENERATOR" if enable_config('use-simd', default=true) - $defs.push("-DENABLE_SIMD") - if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/ + $defs.push("-DENABLE_SIMD") + # Try to compile a small program using NEON instructions if have_header('arm_neon.h') have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC') @@ -29,7 +29,30 @@ } SRC end - end + elsif have_header('x86intrin.h') + + # This is currently hardcoded to false as using m256 seems significantly slower on my machine. + # TODO make this configurable + if false && have_type('__m256i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') + #include + int main() { + __m256i test = _mm256_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + append_cflags('-mavx2') + elsif have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') + #include + int main() { + __m128i test = _mm_set1_epi8(32); + return 0; + } + SRC + $defs.push("-DENABLE_SIMD") + append_cflags('-mavx2') + end + end end create_header diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 94cb12bd5..bad83f722 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -296,13 +296,22 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); simd_vec_type needs_escape = simd_vec_or(too_low, simd_vec_or(has_backslash, has_dblquote)); - if (simd_vec_max(needs_escape) == 0) { + if (simd_vec_all_zero(needs_escape)) { pos += SIMD_VEC_STRIDE; continue; } + /* + * TODO Consider making another type simd_vec_mask. The reason being on x86 we can use _mm_movemask_epi8 + * to get a mask rather than storing the vector to memory. + * + * We would need another function like simd_vec_mask_position_set(mask, pos) which returns true + * if the bit/byte (implementation defined) at position 'pos' is non-zero. + */ + uint8_t arr[SIMD_VEC_STRIDE]; simd_vec_to_memory(arr, needs_escape); + for (int i = 0; i < 16; i++) { unsigned char ch = ptr[pos]; unsigned char ch_len = arr[i]; @@ -375,11 +384,11 @@ static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); simd_vec_type has_forward_slash = simd_vec_eq(chunk, forward_slash); - simd_vec_type invalid = simd_vec_or(too_low, too_high); + simd_vec_type needs_escape = simd_vec_or(too_low, too_high); simd_vec_type has_escaped_char = simd_vec_or(has_forward_slash, simd_vec_or(has_backslash, has_dblquote)); - invalid = simd_vec_or(invalid, has_escaped_char); + needs_escape = simd_vec_or(needs_escape, has_escaped_char); - if (simd_vec_max(invalid) == 0) { + if (simd_vec_all_zero(needs_escape)) { pos += SIMD_VEC_STRIDE; continue; } @@ -391,6 +400,7 @@ static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) uint8_t arr[SIMD_VEC_STRIDE]; simd_vec_to_memory(arr, tmp); + for (int i = 0; i < 16; ) { unsigned long start = pos; unsigned char ch = ptr[pos]; @@ -545,7 +555,7 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons invalid = simd_vec_or(invalid, has_escape_char); } - if (simd_vec_max(invalid) != 0) { + if (simd_vec_any_set(invalid)) { break; } diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index becbd66b8..bda40ea30 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -20,6 +20,14 @@ #define simd_vec_and vandq_u8 #define simd_vec_max vmaxvq_u8 +inline int smd_vec_any_set(uint8x16_t vec) { + return vmaxvq_u8(vec) != 0; +} + +inline int smd_vec_all_zero(uint8x16_t vec) { + return vmaxvq_u8(vec) == 0; +} + #elif HAVE_TYPE_UINT8X8_T #define SIMD_VEC_STRIDE 8 @@ -34,7 +42,132 @@ #define simd_vec_and vand_u8 #define simd_vec_max vmaxv_u8 +inline int smd_vec_any_set(uint8x8_t vec) { + return vmaxv_u8(vec) != 0; +} + +inline int smd_vec_all_zero(uint8x8_t vec) { + return vmaxv_u8(vec) == 0; +} + #endif /* HAVE_TYPE_UINT8X16_T */ #endif /* HAVE_ARM_NEON_H */ -#endif /* ENABLE_SIMD */ \ No newline at end of file +#ifdef HAVE_X86INTRIN_H +#include + +#ifdef HAVE_TYPE___M256I + +#define SIMD_VEC_STRIDE 32 + +#define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a) +#define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a) +#define _mm256_cmpgt_epu8(a, b) _mm256_xor_si256(_mm256_cmple_epu8(a, b), _mm256_set1_epi8(-1)) +#define _mm256_cmplt_epu8(a, b) _mm256_cmpgt_epu8(b, a) + +#define simd_vec_type __m256i +#define simd_vec_from_byte _mm256_set1_epi8 +#define simd_vec_load_from_mem(x) _mm256_loadu_si256((__m256i const*) x) +#define simd_vec_to_memory(mem, vec) _mm256_storeu_si256((__m256i *) mem, (__m256i) vec) +#define simd_vec_eq _mm256_cmpeq_epi8 +#define simd_vec_lt(a,b) _mm256_cmplt_epu8(a, b) +#define simd_vec_gt(a,b) _mm256_cmpgt_epu8(a, b) +#define simd_vec_or _mm256_or_si256 +#define simd_vec_and _mm256_and_si256 +#define simd_vec_max _mm256_max_epu8 + +void print_simd_vec(simd_vec_type vec) { + alignas(32) unsigned char bytes[32]; + _mm256_storeu_si256((__m256i *) bytes, vec); + printf("SIMD vector:\n\t["); + for(int i=0; i< 32; i++) { + printf(" %02x ", bytes[i]); + } + printf("]\n"); +} + +void print_simd_vec1(const char *prefix, simd_vec_type vec) { + alignas(32) unsigned char bytes[32]; + _mm256_storeu_si256((__m256i *) bytes, vec); + printf("%s:\n\t[", prefix); + for(int i=0; i< 32; i++) { + printf(" %02x ", bytes[i]); + } + printf("]\n"); +} + +int simd_vec_any_set(__m256i vec) { + // print_simd_vec1("simd_vec_any_set vec", vec); + __m256i zero = _mm256_setzero_si256(); + __m256i cmp = _mm256_cmpeq_epi8(vec, zero); + int mask = _mm256_movemask_epi8(cmp); + return mask != 0xFFFF; +} + +int simd_vec_all_zero(__m256i vec) { + // print_simd_vec1("simd_vec_any_set vec", vec); + __m256i zero = _mm256_setzero_si256(); + __m256i cmp = _mm256_cmpeq_epi8(vec, zero); + int mask = _mm256_movemask_epi8(cmp); + return mask == 0xFFFF; +} + +#elif HAVE_TYPE___M128I +#define SIMD_VEC_STRIDE 16 + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +#define simd_vec_type __m128i +#define simd_vec_from_byte _mm_set1_epi8 +#define simd_vec_load_from_mem(x) _mm_lddqu_si128((__m128i const*) x) +#define simd_vec_to_memory(mem, vec) _mm_storeu_si128((__m128i *) mem, (__m128i) vec) +#define simd_vec_eq _mm_cmpeq_epi8 +#define simd_vec_lt(a,b) _mm_cmplt_epu8(a, b) +#define simd_vec_gt(a,b) _mm_cmpgt_epu8(a, b) +#define simd_vec_or _mm_or_si128 +#define simd_vec_and _mm_and_si128 +#define simd_vec_max _mm_max_epi8 + + + +void print_simd_vec(simd_vec_type vec) { + alignas(16) unsigned char bytes[16]; + _mm_store_si128((__m128i *) bytes, vec); + printf("SIMD vector:\n\t["); + for(int i=0; i< 16; i++) { + printf(" %02x ", bytes[i]); + } + printf("]\n"); +} + +void print_simd_vec1(const char *prefix, simd_vec_type vec) { + alignas(16) unsigned char bytes[16]; + _mm_store_si128((__m128i *) bytes, vec); + printf("%s:\n\t[", prefix); + for(int i=0; i< 16; i++) { + printf(" %02x ", bytes[i]); + } + printf("]\n"); +} + +int simd_vec_any_set(__m128i vec) { + // print_simd_vec1("simd_vec_any_set vec", vec); + __m128i zero = _mm_setzero_si128(); + __m128i cmp = _mm_cmpeq_epi8(vec, zero); + int mask = _mm_movemask_epi8(cmp); + return mask != 0xFFFF; +} + +int simd_vec_all_zero(__m128i vec) { + __m128i zero = _mm_setzero_si128(); + __m128i cmp = _mm_cmpeq_epi8(vec, zero); + int mask = _mm_movemask_epi8(cmp); + return mask == 0xFFFF; +} + +#endif /* HAVE_TYPE___M256 */ +#endif /* HAVE_X86INTRIN_H */ +#endif /* ENABLE_SIMD */ From 6ee867af50313e9d0c2f94460c29799e676e983a Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 23:40:33 -0600 Subject: [PATCH 08/11] Fix a typo. --- ext/json/ext/generator/extconf.h | 5 +++-- ext/json/ext/generator/simd.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ext/json/ext/generator/extconf.h b/ext/json/ext/generator/extconf.h index 4bb9c6cca..576a3ee85 100644 --- a/ext/json/ext/generator/extconf.h +++ b/ext/json/ext/generator/extconf.h @@ -1,7 +1,8 @@ #ifndef EXTCONF_H #define EXTCONF_H #define JSON_GENERATOR 1 -#define HAVE_X86INTRIN_H 1 -#define HAVE_TYPE___M128I 1 #define ENABLE_SIMD 1 +#define HAVE_ARM_NEON_H 1 +#define HAVE_TYPE_UINT8X16_T 1 +#define HAVE_TYPE_UINT8X8_T 1 #endif diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index bda40ea30..0de809ad8 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -20,11 +20,11 @@ #define simd_vec_and vandq_u8 #define simd_vec_max vmaxvq_u8 -inline int smd_vec_any_set(uint8x16_t vec) { +inline int simd_vec_any_set(uint8x16_t vec) { return vmaxvq_u8(vec) != 0; } -inline int smd_vec_all_zero(uint8x16_t vec) { +inline int simd_vec_all_zero(uint8x16_t vec) { return vmaxvq_u8(vec) == 0; } From ded09f8c9da33f50138f6f8691f27a12d4f837dd Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 23:51:21 -0600 Subject: [PATCH 09/11] Removed the extconf.h --- ext/json/ext/generator/extconf.h | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 ext/json/ext/generator/extconf.h diff --git a/ext/json/ext/generator/extconf.h b/ext/json/ext/generator/extconf.h deleted file mode 100644 index 576a3ee85..000000000 --- a/ext/json/ext/generator/extconf.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef EXTCONF_H -#define EXTCONF_H -#define JSON_GENERATOR 1 -#define ENABLE_SIMD 1 -#define HAVE_ARM_NEON_H 1 -#define HAVE_TYPE_UINT8X16_T 1 -#define HAVE_TYPE_UINT8X8_T 1 -#endif From 8b281a8c7342925a85e83f2fd141b4cf01c2d025 Mon Sep 17 00:00:00 2001 From: Scott Myron Date: Sat, 18 Jan 2025 23:53:04 -0600 Subject: [PATCH 10/11] Added the extconf.h to .gitignore. --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8ae6ac119..f5a342d7e 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ Gemfile.lock .DS_Store */**/Makefile */**/*.o +*/**/extconf.h */**/*.class */**/*.jar .byebug_history From b98ab4037e65aae9773f336e6f4fb9061be68fc6 Mon Sep 17 00:00:00 2001 From: samyron Date: Tue, 21 Jan 2025 07:33:22 -0600 Subject: [PATCH 11/11] SIMD runtime detection refactor (#1) * Initial support of runtime SIMD detection. * Work in progress implementation of dynamic dispatch. * More clean up. * Added support for AVX2 and quite a bit of refactoring. * __GNU_C__ => __GNUC__ * Major refactoring to reduce the amount of duplicate code between all of the SIMD implementations. Also added a few tests with more characters to ensure the SIMD implementations are tested. * Moved the definition of convert_UTF8_to_JSON_simd. --- ext/json/ext/generator/extconf.rb | 14 +- ext/json/ext/generator/generator.c | 807 +++++++++++++++++++---------- ext/json/ext/generator/simd.h | 188 ++----- test/json/json_generator_test.rb | 32 ++ 4 files changed, 613 insertions(+), 428 deletions(-) diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 8c2737f04..5ed26fda1 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -31,9 +31,7 @@ end elsif have_header('x86intrin.h') - # This is currently hardcoded to false as using m256 seems significantly slower on my machine. - # TODO make this configurable - if false && have_type('__m256i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') + if have_type('__m256i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') #include int main() { __m256i test = _mm256_set1_epi8(32); @@ -41,18 +39,20 @@ } SRC $defs.push("-DENABLE_SIMD") - append_cflags('-mavx2') - elsif have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') + end + + if have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2') #include int main() { __m128i test = _mm_set1_epi8(32); return 0; } SRC - $defs.push("-DENABLE_SIMD") - append_cflags('-mavx2') + $defs.push("-DENABLE_SIMD") unless $defs.include?('-DENABLE_SIMD') end end + + have_header('cpuid.h') end create_header diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index bad83f722..745a3ce15 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -4,6 +4,7 @@ #include #include +#include "extconf.h" #include "simd.h" /* ruby api and some helpers */ @@ -35,6 +36,11 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_e static ID sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_nl, sym_max_nesting, sym_allow_nan, sym_ascii_only, sym_depth, sym_buffer_initial_length, sym_script_safe, sym_escape_slash, sym_strict; +static void (*convert_UTF8_to_JSON_impl)(FBuffer *, VALUE, const unsigned char escape_table[256]); + +#ifdef ENABLE_SIMD +static void (*convert_UTF8_to_JSON_simd_kernel)(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]); +#endif #define GET_STATE_TO(self, state) \ TypedData_Get_Struct(self, JSON_Generator_State, &JSON_Generator_State_type, state) @@ -230,7 +236,8 @@ static const unsigned char script_safe_escape_table[256] = { pos++; \ } -static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) +#ifdef ENABLE_SIMD +static void convert_UTF8_to_JSON_simd(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -240,196 +247,458 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str) unsigned long beg = 0, pos = 0; -#ifdef ENABLE_SIMD - /* - * The code below implements an SIMD-based algorithm to determine if N bytes at a time - * need to be escaped. - * - * Assume the ptr = "Te\sting!" (the double quotes are included in the string) - * - * The explanination will be limited to the first 8 bytes of the string for simplicity. However - * the vector insructions may work on larger vectors. - * - * First, we load three constants 'lower_bound', 'backslash' and 'dblquote" in vector registers. - * - * lower_bound: [20 20 20 20 20 20 20 20] - * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] - * dblquote: [22 22 22 22 22 22 22 22] - * - * Next we load the first chunk of the ptr: - * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) - * - * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector - * as no bytes are less than 32 (0x20): - * [0 0 0 0 0 0 0 0] - * - * Next, we check if any byte in chunk is equal to a backslash: - * [0 0 0 FF 0 0 0 0] - * - * Finally we check if any byte in chunk is equal to a double quote: - * [FF 0 0 0 0 0 0 0] - * - * Now we have three vectors where each byte indicates if the corresponding byte in chunk - * needs to be escaped. We combine these vectors with a series of logical OR instructions. - * This is the needs_escape vector and it is equal to: - * [FF 0 0 FF 0 0 0 0] - * - * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of - * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be - * escaped. - * - * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and - * we advance pos by the width of the vector. - * - * To determine how to escape characters, we look at each value in the needs_escape vector and take - * the appropriate action. - */ - - const simd_vec_type lower_bound = simd_vec_from_byte(' '); - const simd_vec_type backslash = simd_vec_from_byte('\\'); - const simd_vec_type dblquote = simd_vec_from_byte('\"'); - - while (pos+SIMD_VEC_STRIDE < len) { - simd_vec_type chunk = simd_vec_load_from_mem((const uint8_t*)&ptr[pos]); - simd_vec_type too_low = simd_vec_lt(chunk, lower_bound); - simd_vec_type has_backslash = simd_vec_eq(chunk, backslash); - simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); - simd_vec_type needs_escape = simd_vec_or(too_low, simd_vec_or(has_backslash, has_dblquote)); - - if (simd_vec_all_zero(needs_escape)) { - pos += SIMD_VEC_STRIDE; - continue; - } + convert_UTF8_to_JSON_simd_kernel(out_buffer, ptr, len, &beg, &pos, hexdig, scratch, escape_table); + + while (pos < len) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = escape_table[ch]; + PROCESS_BYTE; + } + + if (beg < len) { + fbuffer_append(out_buffer, &ptr[beg], len - beg); + } + + RB_GC_GUARD(str); +} +#endif + +#ifdef HAVE_SIMD_NEON + +void convert_UTF8_to_JSON_simd_kernel_neon(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) { + unsigned long beg = *_beg, pos = *_pos; + + const uint8x16_t lower_bound = vdupq_n_u8(' '); + const uint8x16_t backslash = vdupq_n_u8('\\'); + const uint8x16_t dblquote = vdupq_n_u8('\"'); + if (escape_table == script_safe_escape_table) { /* - * TODO Consider making another type simd_vec_mask. The reason being on x86 we can use _mm_movemask_epi8 - * to get a mask rather than storing the vector to memory. + * This works almost exactly the same as what is described above. The difference in this case comes after we know + * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, + * some bytes need to be handled differently. * - * We would need another function like simd_vec_mask_position_set(mask, pos) which returns true - * if the bit/byte (implementation defined) at position 'pos' is non-zero. + * Since we know each byte in chunk can only match a single case, we logical AND each of the has_backslash, + * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2 and 0x4 respectively) and combine + * the results with a logical OR. + * + * Now we loop over the result vector and switch on the particular pattern we just created. If we find a + * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct + * action. */ + const uint8x16_t upper_bound = vdupq_n_u8('~'); + const uint8x16_t forward_slash = vdupq_n_u8('/'); + + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t too_high = vcgtq_u8(chunk, upper_bound); + + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash); + + uint8x16_t needs_escape = vorrq_u8(too_low, too_high); + uint8x16_t has_escaped_char = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote)); + needs_escape = vorrq_u8(needs_escape, has_escaped_char); + + if (vmaxvq_u8(needs_escape) == 0) { + pos += 16; + continue; + } - uint8_t arr[SIMD_VEC_STRIDE]; - simd_vec_to_memory(arr, needs_escape); + uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1)); + tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2))); + tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4))); + tmp = vorrq_u8(tmp, vandq_u8(has_forward_slash, vdupq_n_u8(0x8))); - for (int i = 0; i < 16; i++) { - unsigned char ch = ptr[pos]; - unsigned char ch_len = arr[i]; + uint8_t arr[16]; + vst1q_u8(arr, tmp); - // This must remain in sync with the array `escape_table`. - if (RB_UNLIKELY(ch_len)) { - ch_len = 9; - PROCESS_BYTE; - } else { - pos++; + for (int i = 0; i < 16; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; + break; + default: + ch_len = script_safe_escape_table[ch]; + } + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); + } + } + } else { + /* + * The code below implements an SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanination will be limited to the first 8 bytes of the string for simplicity. However + * the vector insructions may work on larger vectors. + * + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote" in vector registers. + * + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + * + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + * + * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + * + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + * + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + * + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + * + * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of + * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be + * escaped. + * + * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and + * we advance pos by the width of the vector. + * + * To determine how to escape characters, we look at each value in the needs_escape vector and take + * the appropriate action. + */ + while (pos+16 < len) { + uint8x16_t chunk = vld1q_u8((const uint8_t*)&ptr[pos]); + uint8x16_t too_low = vcltq_u8(chunk, lower_bound); + uint8x16_t has_backslash = vceqq_u8(chunk, backslash); + uint8x16_t has_dblquote = vceqq_u8(chunk, dblquote); + uint8x16_t needs_escape = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote)); + + if (vmaxvq_u8(needs_escape) == 0) { + pos += 16; + continue; + } + + /* + * TODO Consider making another type simd_vec_mask. The reason being on x86 we can use _mm_movemask_epi8 + * to get a mask rather than storing the vector to memory. + * + * We would need another function like simd_vec_mask_position_set(mask, pos) which returns true + * if the bit/byte (implementation defined) at position 'pos' is non-zero. + */ + + uint8_t arr[16]; + vst1q_u8(arr, needs_escape); + + for (int i = 0; i < 16; i++) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; + } } } } -#endif + *_beg = beg; + *_pos = pos; +} - while (pos < len) { - unsigned char ch = ptr[pos]; - unsigned char ch_len = escape_table[ch]; - /* JSON encoding */ +#endif /* HAVE_SIMD_NEON */ - PROCESS_BYTE - } +#ifdef HAVE_SIMD_X86_64 - if (beg < len) { - fbuffer_append(out_buffer, &ptr[beg], len - beg); +#ifdef HAVE_TYPE___M128I +#ifdef __GNUC__ +#pragma GCC push_options +#pragma GCC target ("sse4") +#endif /* __GNUC__ */ + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +#ifdef __clang__ +__attribute__((target("sse4.2"))) +#endif /* __clang__ */ +void convert_UTF8_to_JSON_simd_kernel_sse42(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) { + unsigned long beg = *_beg, pos = *_pos; + + if (escape_table == script_safe_escape_table) { + /* + * Again, this is basically a straight port of the ARM Neon version. + */ + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i upper_bound = _mm_set1_epi8('~'); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + const __m128i forward_slash = _mm_set1_epi8('/'); + + while (pos+16 < len) { + __m128i chunk = _mm_loadu_si128((__m128i const*)&ptr[pos]); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i too_high = _mm_cmpgt_epu8(chunk, upper_bound); + + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i has_forward_slash = _mm_cmpeq_epi8(chunk, forward_slash); + + __m128i needs_escape = _mm_or_si128(too_low, too_high); + __m128i has_escaped_char = _mm_or_si128(has_forward_slash, _mm_or_si128(has_backslash, has_dblquote)); + needs_escape = _mm_or_si128(needs_escape, has_escaped_char); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + if (needs_escape_mask == 0) { + pos += 16; + continue; + } + + __m128i tmp = _mm_and_si128(too_low, _mm_set1_epi8(0x1)); + tmp = _mm_or_si128(tmp, _mm_and_si128(has_backslash, _mm_set1_epi8(0x2))); + tmp = _mm_or_si128(tmp, _mm_and_si128(has_dblquote, _mm_set1_epi8(0x4))); + tmp = _mm_or_si128(tmp, _mm_and_si128(has_forward_slash, _mm_set1_epi8(0x8))); + + uint8_t arr[16]; + _mm_storeu_si128((__m128i *) arr, tmp); + + for (int i = 0; i < 16; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; + break; + default: + ch_len = script_safe_escape_table[ch]; + } + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); + } + } + } else { + /* + * This is a straight port of the ARM Neon implementation to SSE4. This is + * likely not optimal for this instruction set. There is likely table lookup, + * shuffle, gather, blend, etc. instructions that may perform significantly + * better than what is implemented here. + */ + + const __m128i lower_bound = _mm_set1_epi8(' '); + const __m128i backslash = _mm_set1_epi8('\\'); + const __m128i dblquote = _mm_set1_epi8('\"'); + + while (pos+16 < len) { + __m128i chunk = _mm_loadu_si128((__m128i const*)&ptr[pos]); + __m128i too_low = _mm_cmplt_epu8(chunk, lower_bound); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash); + __m128i has_dblquote = _mm_cmpeq_epi8(chunk, dblquote); + __m128i needs_escape = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + pos += 16; + continue; + } + + for (int i = 0; i < 16; i++) { + int bit = needs_escape_mask & (1 << i); + unsigned char ch = ptr[pos]; + unsigned char ch_len = 0; + + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(bit)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; + } + } + } } - RB_GC_GUARD(str); + *_beg = beg; + *_pos = pos; } -static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) -{ - const char *hexdig = "0123456789abcdef"; - char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; +#ifdef __GNUC__ +#pragma GCC pop_options +#endif /* __GNUC__ */ +#endif /* HAVE_TYPE___M128I */ - const char *ptr = RSTRING_PTR(str); - unsigned long len = RSTRING_LEN(str); +#ifdef HAVE_TYPE___M256I +#ifdef __GNUC__ +#pragma GCC push_options +#pragma GCC target ("avx2") +#endif /* __GNUC__ */ - unsigned long beg = 0, pos = 0; +#define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a) +#define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a) +#define _mm256_cmpgt_epu8(a, b) _mm256_xor_si256(_mm256_cmple_epu8(a, b), _mm256_set1_epi8(-1)) +#define _mm256_cmplt_epu8(a, b) _mm256_cmpgt_epu8(b, a) -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; +#ifdef __clang__ +__attribute__((target("avx2"))) +#endif /* __clang__ */ +void convert_UTF8_to_JSON_simd_kernel_avx2(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) { + unsigned long beg = *_beg, pos = *_pos; -#ifdef ENABLE_SIMD - /* - * This works almost exactly the same as what is described above. The difference in this case comes after we know - * there is a byte to be escaped. In the previous case, all bytes were handled the same way. In this case, however, - * some bytes need to be handled differently. - * - * Since we know each byte in chunk can only match a single case, we logical AND each of the has_backslash, - * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2 and 0x4 respectively) and combine - * the results with a logical OR. - * - * Now we loop over the result vector and switch on the particular pattern we just created. If we find a - * case we don't know, we simply lookup the byte in the script_safe_escape_table to determine the correct - * action. - */ - const simd_vec_type lower_bound = simd_vec_from_byte(' '); - const simd_vec_type upper_bound = simd_vec_from_byte('~'); - const simd_vec_type backslash = simd_vec_from_byte('\\'); - const simd_vec_type dblquote = simd_vec_from_byte('\"'); - const simd_vec_type forward_slash = simd_vec_from_byte('/'); - - while (pos+SIMD_VEC_STRIDE < len) { - simd_vec_type chunk = simd_vec_load_from_mem((const uint8_t*)&ptr[pos]); - simd_vec_type too_low = simd_vec_lt(chunk, lower_bound); - simd_vec_type too_high = simd_vec_gt(chunk, upper_bound); - - simd_vec_type has_backslash = simd_vec_eq(chunk, backslash); - simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); - simd_vec_type has_forward_slash = simd_vec_eq(chunk, forward_slash); - - simd_vec_type needs_escape = simd_vec_or(too_low, too_high); - simd_vec_type has_escaped_char = simd_vec_or(has_forward_slash, simd_vec_or(has_backslash, has_dblquote)); - needs_escape = simd_vec_or(needs_escape, has_escaped_char); - - if (simd_vec_all_zero(needs_escape)) { - pos += SIMD_VEC_STRIDE; - continue; - } + const __m256i lower_bound = _mm256_set1_epi8(' '); + const __m256i backslash = _mm256_set1_epi8('\\'); + const __m256i dblquote = _mm256_set1_epi8('\"'); - simd_vec_type tmp = simd_vec_and(too_low, simd_vec_from_byte(0x1)); - tmp = simd_vec_or(tmp, simd_vec_and(has_backslash, simd_vec_from_byte(0x2))); - tmp = simd_vec_or(tmp, simd_vec_and(has_dblquote, simd_vec_from_byte(0x4))); - tmp = simd_vec_or(tmp, simd_vec_and(has_forward_slash, simd_vec_from_byte(0x8))); + if (escape_table == script_safe_escape_table) { + /* + * Again, this is basically a straight port of the ARM Neon version. + */ + const __m256i upper_bound = _mm256_set1_epi8('~'); + const __m256i forward_slash = _mm256_set1_epi8('/'); + + while (pos+32 < len) { + __m256i chunk = _mm256_loadu_si256((__m256i const*)&ptr[pos]); + __m256i too_low = _mm256_cmplt_epu8(chunk, lower_bound); + __m256i too_high = _mm256_cmpgt_epu8(chunk, upper_bound); + + __m256i has_backslash = _mm256_cmpeq_epi8(chunk, backslash); + __m256i has_dblquote = _mm256_cmpeq_epi8(chunk, dblquote); + __m256i has_forward_slash = _mm256_cmpeq_epi8(chunk, forward_slash); + + __m256i needs_escape = _mm256_or_si256(too_low, too_high); + __m256i has_escaped_char = _mm256_or_si256(has_forward_slash, _mm256_or_si256(has_backslash, has_dblquote)); + needs_escape = _mm256_or_si256(needs_escape, has_escaped_char); + + int needs_escape_mask = _mm256_movemask_epi8(needs_escape); + if (needs_escape_mask == 0) { + pos += 32; + continue; + } - uint8_t arr[SIMD_VEC_STRIDE]; - simd_vec_to_memory(arr, tmp); - - for (int i = 0; i < 16; ) { - unsigned long start = pos; - unsigned char ch = ptr[pos]; - unsigned char ch_len = arr[i]; - switch(ch_len) { - case 0x1: - case 0x2: - case 0x4: - case 0x8: - ch_len = 9; - break; - default: - ch_len = script_safe_escape_table[ch]; + __m256i tmp = _mm256_and_si256(too_low, _mm256_set1_epi8(0x1)); + tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_backslash, _mm256_set1_epi8(0x2))); + tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_dblquote, _mm256_set1_epi8(0x4))); + tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_forward_slash, _mm256_set1_epi8(0x8))); + + uint8_t arr[32]; + _mm256_storeu_si256((__m256i *) arr, tmp); + + for (int i = 0; i < 32; ) { + unsigned long start = pos; + unsigned char ch = ptr[pos]; + unsigned char ch_len = arr[i]; + switch(ch_len) { + case 0x1: + case 0x2: + case 0x4: + case 0x8: + ch_len = 9; + break; + default: + ch_len = script_safe_escape_table[ch]; + } + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(ch_len)) { + PROCESS_BYTE; + } else { + pos++; + } + + i += (pos - start); } - // This must remain in sync with the array `escape_table`. - if (RB_UNLIKELY(ch_len)) { - PROCESS_BYTE; - } else { - pos++; + } + } else { + /* + * This is a straight port of the ARM Neon implementation to SSE4. This is + * likely not optimal for this instruction set. There is likely table lookup, + * shuffle, gather, blend, etc. instructions that may perform significantly + * better than what is implemented here. + */ + while (pos+32 < len) { + __m256i chunk = _mm256_loadu_si256((__m256i const*)&ptr[pos]); + __m256i too_low = _mm256_cmplt_epu8(chunk, lower_bound); + __m256i has_backslash = _mm256_cmpeq_epi8(chunk, backslash); + __m256i has_dblquote = _mm256_cmpeq_epi8(chunk, dblquote); + __m256i needs_escape = _mm256_or_si256(too_low, _mm256_or_si256(has_backslash, has_dblquote)); + + int needs_escape_mask = _mm256_movemask_epi8(needs_escape); + + if (needs_escape_mask == 0) { + pos += 32; + continue; } - i += (pos - start); + for (int i = 0; i < 32; i++) { + int bit = needs_escape_mask & (1 << i); + unsigned char ch = ptr[pos]; + unsigned char ch_len = 0; + + // This must remain in sync with the array `escape_table`. + if (RB_UNLIKELY(bit)) { + ch_len = 9; + PROCESS_BYTE; + } else { + pos++; + } + } } } -#endif + *_beg = beg; + *_pos = pos; +} + +#ifdef __GNUC__ +#pragma GCC pop_options +#endif /* __GNUC__ */ + +#endif /* HAVE_TYPE___M256I */ + +#endif /* x86_64 support */ + + +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) +{ + const char *hexdig = "0123456789abcdef"; + char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; + + const char *ptr = RSTRING_PTR(str); + unsigned long len = RSTRING_LEN(str); + + unsigned long beg = 0, pos = 0; while (pos < len) { unsigned char ch = ptr[pos]; - unsigned char ch_len = script_safe_escape_table[ch]; + unsigned char ch_len = escape_table[ch]; /* JSON encoding */ PROCESS_BYTE; @@ -444,84 +713,6 @@ static void convert_UTF8_to_JSON_script_safe(FBuffer *out_buffer, VALUE str) #undef PROCESS_BYTE -#define PROCESS_BYTE if (RB_UNLIKELY(ch_len)) { \ - switch (ch_len) { \ - case 9: { \ - FLUSH_POS(1); \ - switch (ch) { \ - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; \ - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; \ - case '/': fbuffer_append(out_buffer, "\\/", 2); break; \ - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; \ - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; \ - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; \ - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; \ - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; \ - default: { \ - scratch[2] = '0'; \ - scratch[3] = '0'; \ - scratch[4] = hexdig[(ch >> 4) & 0xf]; \ - scratch[5] = hexdig[ch & 0xf]; \ - fbuffer_append(out_buffer, scratch, 6); \ - break; \ - } \ - } \ - break; \ - } \ - default: { \ - uint32_t wchar = 0; \ - ch_len = ch_len & CHAR_LENGTH_MASK; \ - \ - switch(ch_len) { \ - case 2: \ - wchar = ptr[pos] & 0x1F; \ - break; \ - case 3: \ - wchar = ptr[pos] & 0x0F; \ - break; \ - case 4: \ - wchar = ptr[pos] & 0x07; \ - break; \ - } \ - \ - for (short i = 1; i < ch_len; i++) { \ - wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); \ - } \ - \ - FLUSH_POS(ch_len); \ - \ - if (wchar <= 0xFFFF) { \ - scratch[2] = hexdig[wchar >> 12]; \ - scratch[3] = hexdig[(wchar >> 8) & 0xf]; \ - scratch[4] = hexdig[(wchar >> 4) & 0xf]; \ - scratch[5] = hexdig[wchar & 0xf]; \ - fbuffer_append(out_buffer, scratch, 6); \ - } else { \ - uint16_t hi, lo; \ - wchar -= 0x10000; \ - hi = 0xD800 + (uint16_t)(wchar >> 10); \ - lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); \ - \ - scratch[2] = hexdig[hi >> 12]; \ - scratch[3] = hexdig[(hi >> 8) & 0xf]; \ - scratch[4] = hexdig[(hi >> 4) & 0xf]; \ - scratch[5] = hexdig[hi & 0xf]; \ - \ - scratch[8] = hexdig[lo >> 12]; \ - scratch[9] = hexdig[(lo >> 8) & 0xf]; \ - scratch[10] = hexdig[(lo >> 4) & 0xf]; \ - scratch[11] = hexdig[lo & 0xf]; \ - \ - fbuffer_append(out_buffer, scratch, 12); \ - } \ - \ - break; \ - } \ - } \ - } else { \ - pos++; \ - } - static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { const char *hexdig = "0123456789abcdef"; @@ -532,42 +723,87 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons unsigned long beg = 0, pos = 0; -#ifdef ENABLE_SIMD - const simd_vec_type lower_bound = simd_vec_from_byte(' '); - const simd_vec_type upper_bound = simd_vec_from_byte('~'); - const simd_vec_type backslash = simd_vec_from_byte('\\'); - const simd_vec_type dblquote = simd_vec_from_byte('\"'); - - while (pos+SIMD_VEC_STRIDE < len) { - simd_vec_type chunk = simd_vec_load_from_mem((const uint8_t*)&ptr[pos]); - simd_vec_type too_low = simd_vec_lt(chunk, lower_bound); - simd_vec_type too_high = simd_vec_gt(chunk, upper_bound); - simd_vec_type invalid = simd_vec_or(too_low, too_high); - - simd_vec_type has_backslash = simd_vec_eq(chunk, backslash); - simd_vec_type has_dblquote = simd_vec_eq(chunk, dblquote); - simd_vec_type has_escape_char = simd_vec_or(has_backslash, has_dblquote); - - if (escape_table == script_safe_escape_table) { - simd_vec_type forward_slash = simd_vec_from_byte('/'); - simd_vec_type has_forward_slash = simd_vec_eq(chunk, forward_slash); - has_escape_char = simd_vec_or(has_escape_char, has_forward_slash); - invalid = simd_vec_or(invalid, has_escape_char); - } - - if (simd_vec_any_set(invalid)) { - break; - } - - pos += SIMD_VEC_STRIDE; - } -#endif - while (pos < len) { unsigned char ch = ptr[pos]; unsigned char ch_len = escape_table[ch]; - PROCESS_BYTE + if (RB_UNLIKELY(ch_len)) { + switch (ch_len) { + case 9: { + FLUSH_POS(1); + switch (ch) { + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; + case '/': fbuffer_append(out_buffer, "\\/", 2); break; + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; + default: { + scratch[2] = '0'; + scratch[3] = '0'; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + break; + } + } + break; + } + default: { + uint32_t wchar = 0; + ch_len = ch_len & CHAR_LENGTH_MASK; + + switch(ch_len) { + case 2: + wchar = ptr[pos] & 0x1F; + break; + case 3: + wchar = ptr[pos] & 0x0F; + break; + case 4: + wchar = ptr[pos] & 0x07; + break; + } + + for (short i = 1; i < ch_len; i++) { + wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); + } + + FLUSH_POS(ch_len); + + if (wchar <= 0xFFFF) { + scratch[2] = hexdig[wchar >> 12]; + scratch[3] = hexdig[(wchar >> 8) & 0xf]; + scratch[4] = hexdig[(wchar >> 4) & 0xf]; + scratch[5] = hexdig[wchar & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + } else { + uint16_t hi, lo; + wchar -= 0x10000; + hi = 0xD800 + (uint16_t)(wchar >> 10); + lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); + + scratch[2] = hexdig[hi >> 12]; + scratch[3] = hexdig[(hi >> 8) & 0xf]; + scratch[4] = hexdig[(hi >> 4) & 0xf]; + scratch[5] = hexdig[hi & 0xf]; + + scratch[8] = hexdig[lo >> 12]; + scratch[9] = hexdig[(lo >> 8) & 0xf]; + scratch[10] = hexdig[(lo >> 4) & 0xf]; + scratch[11] = hexdig[lo & 0xf]; + + fbuffer_append(out_buffer, scratch, 12); + } + + break; + } + } + } else { + pos++; + } } if (beg < len) { @@ -1134,12 +1370,7 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat if (RB_UNLIKELY(state->ascii_only)) { convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table); } else { - if (state->script_safe) { - convert_UTF8_to_JSON_script_safe(buffer, obj); - } - else { - convert_UTF8_to_JSON(buffer, obj); - } + convert_UTF8_to_JSON_impl(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); } break; default: @@ -1897,4 +2128,28 @@ void Init_generator(void) binary_encindex = rb_ascii8bit_encindex(); rb_require("json/ext/generator/state"); -} + + // TODO ADD RUNTIME CHECKS HERE? + switch(find_simd_implementation()) { +#ifdef HAVE_SIMD_NEON + case SIMD_NEON: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd; + convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_neon; + break; +#endif +#ifdef HAVE_SIMD_X86_64 + case SIMD_SSE42: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd; + convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_sse42; + break; +#ifdef HAVE_TYPE___M256I + case SIMD_AVX2: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd; + convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_avx2; + break; +#endif /* HAVE_TYPE___M256I */ +#endif + default: + convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON; + } +} \ No newline at end of file diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h index 0de809ad8..352c0b6fc 100644 --- a/ext/json/ext/generator/simd.h +++ b/ext/json/ext/generator/simd.h @@ -1,173 +1,71 @@ #include "extconf.h" +typedef enum { + SIMD_NONE, + SIMD_NEON, + SIMD_SSE42, + SIMD_AVX2 +} SIMD_Implementation; + #ifdef ENABLE_SIMD -#ifdef HAVE_ARM_NEON_H +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) #include -#ifdef HAVE_TYPE_UINT8X16_T - -#define SIMD_VEC_STRIDE 16 - -#define simd_vec_type uint8x16_t -#define simd_vec_from_byte vdupq_n_u8 -#define simd_vec_load_from_mem vld1q_u8 -#define simd_vec_to_memory vst1q_u8 -#define simd_vec_eq vceqq_u8 -#define simd_vec_lt vcltq_u8 -#define simd_vec_gt vcgtq_u8 -#define simd_vec_or vorrq_u8 -#define simd_vec_and vandq_u8 -#define simd_vec_max vmaxvq_u8 - -inline int simd_vec_any_set(uint8x16_t vec) { - return vmaxvq_u8(vec) != 0; +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 +SIMD_Implementation find_simd_implementation() { + return SIMD_NEON; } -inline int simd_vec_all_zero(uint8x16_t vec) { - return vmaxvq_u8(vec) == 0; -} - -#elif HAVE_TYPE_UINT8X8_T - -#define SIMD_VEC_STRIDE 8 -#define simd_vec_type uint8x8_t -#define simd_vec_from_byte vdup_n_u8 -#define simd_vec_load_from_mem vld1_u8 -#define simd_vec_to_memory vst1_u8 -#define simd_vec_eq vceq_u8 -#define simd_vec_lt vclt_u8 -#define simd_vec_gt vcgt_u8 -#define simd_vec_or vorr_u8 -#define simd_vec_and vand_u8 -#define simd_vec_max vmaxv_u8 - -inline int smd_vec_any_set(uint8x8_t vec) { - return vmaxv_u8(vec) != 0; -} +#define HAVE_SIMD_NEON 1 -inline int smd_vec_all_zero(uint8x8_t vec) { - return vmaxv_u8(vec) == 0; -} +#ifdef HAVE_TYPE_UINT8X16_T #endif /* HAVE_TYPE_UINT8X16_T */ -#endif /* HAVE_ARM_NEON_H */ +#endif /* ARM Neon Support.*/ + +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) +#define HAVE_SIMD_X86_64 1 #ifdef HAVE_X86INTRIN_H #include -#ifdef HAVE_TYPE___M256I +#define HAVE_SIMD_X86_64 1 -#define SIMD_VEC_STRIDE 32 - -#define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a) -#define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a) -#define _mm256_cmpgt_epu8(a, b) _mm256_xor_si256(_mm256_cmple_epu8(a, b), _mm256_set1_epi8(-1)) -#define _mm256_cmplt_epu8(a, b) _mm256_cmpgt_epu8(b, a) - -#define simd_vec_type __m256i -#define simd_vec_from_byte _mm256_set1_epi8 -#define simd_vec_load_from_mem(x) _mm256_loadu_si256((__m256i const*) x) -#define simd_vec_to_memory(mem, vec) _mm256_storeu_si256((__m256i *) mem, (__m256i) vec) -#define simd_vec_eq _mm256_cmpeq_epi8 -#define simd_vec_lt(a,b) _mm256_cmplt_epu8(a, b) -#define simd_vec_gt(a,b) _mm256_cmpgt_epu8(a, b) -#define simd_vec_or _mm256_or_si256 -#define simd_vec_and _mm256_and_si256 -#define simd_vec_max _mm256_max_epu8 - -void print_simd_vec(simd_vec_type vec) { - alignas(32) unsigned char bytes[32]; - _mm256_storeu_si256((__m256i *) bytes, vec); - printf("SIMD vector:\n\t["); - for(int i=0; i< 32; i++) { - printf(" %02x ", bytes[i]); - } - printf("]\n"); -} +#ifdef HAVE_CPUID_H +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 -void print_simd_vec1(const char *prefix, simd_vec_type vec) { - alignas(32) unsigned char bytes[32]; - _mm256_storeu_si256((__m256i *) bytes, vec); - printf("%s:\n\t[", prefix); - for(int i=0; i< 32; i++) { - printf(" %02x ", bytes[i]); - } - printf("]\n"); -} +#include +#endif -int simd_vec_any_set(__m256i vec) { - // print_simd_vec1("simd_vec_any_set vec", vec); - __m256i zero = _mm256_setzero_si256(); - __m256i cmp = _mm256_cmpeq_epi8(vec, zero); - int mask = _mm256_movemask_epi8(cmp); - return mask != 0xFFFF; -} +SIMD_Implementation find_simd_implementation(void) { -int simd_vec_all_zero(__m256i vec) { - // print_simd_vec1("simd_vec_any_set vec", vec); - __m256i zero = _mm256_setzero_si256(); - __m256i cmp = _mm256_cmpeq_epi8(vec, zero); - int mask = _mm256_movemask_epi8(cmp); - return mask == 0xFFFF; -} +#if defined(__GNUC__ ) || defined(__clang__) +#ifdef __GNUC__ + __builtin_cpu_init(); +#endif /* __GNUC__ */ -#elif HAVE_TYPE___M128I -#define SIMD_VEC_STRIDE 16 - -#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) -#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) -#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) -#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) - -#define simd_vec_type __m128i -#define simd_vec_from_byte _mm_set1_epi8 -#define simd_vec_load_from_mem(x) _mm_lddqu_si128((__m128i const*) x) -#define simd_vec_to_memory(mem, vec) _mm_storeu_si128((__m128i *) mem, (__m128i) vec) -#define simd_vec_eq _mm_cmpeq_epi8 -#define simd_vec_lt(a,b) _mm_cmplt_epu8(a, b) -#define simd_vec_gt(a,b) _mm_cmpgt_epu8(a, b) -#define simd_vec_or _mm_or_si128 -#define simd_vec_and _mm_and_si128 -#define simd_vec_max _mm_max_epi8 - - - -void print_simd_vec(simd_vec_type vec) { - alignas(16) unsigned char bytes[16]; - _mm_store_si128((__m128i *) bytes, vec); - printf("SIMD vector:\n\t["); - for(int i=0; i< 16; i++) { - printf(" %02x ", bytes[i]); +#ifdef HAVE_TYPE___M256I + if(__builtin_cpu_supports("avx2")) { + return SIMD_AVX2; } - printf("]\n"); -} +#endif /* #ifdef HAVE_TYPE___M256I */ -void print_simd_vec1(const char *prefix, simd_vec_type vec) { - alignas(16) unsigned char bytes[16]; - _mm_store_si128((__m128i *) bytes, vec); - printf("%s:\n\t[", prefix); - for(int i=0; i< 16; i++) { - printf(" %02x ", bytes[i]); + // TODO Revisit. I think the SSE version now only uses SSE2 instructions. + if (__builtin_cpu_supports("sse4.2")) { + return SIMD_SSE42; } - printf("]\n"); -} - -int simd_vec_any_set(__m128i vec) { - // print_simd_vec1("simd_vec_any_set vec", vec); - __m128i zero = _mm_setzero_si128(); - __m128i cmp = _mm_cmpeq_epi8(vec, zero); - int mask = _mm_movemask_epi8(cmp); - return mask != 0xFFFF; -} +#endif /* __GNUC__ || __clang__*/ -int simd_vec_all_zero(__m128i vec) { - __m128i zero = _mm_setzero_si128(); - __m128i cmp = _mm_cmpeq_epi8(vec, zero); - int mask = _mm_movemask_epi8(cmp); - return mask == 0xFFFF; + return SIMD_NONE; } -#endif /* HAVE_TYPE___M256 */ #endif /* HAVE_X86INTRIN_H */ +#endif /* X86_64 Support */ #endif /* ENABLE_SIMD */ + +#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED +SIMD_Implementation find_simd_implementation(void) { + return SIMD_NONE; +} +#endif \ No newline at end of file diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index 8dd3913d6..c2156325f 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -424,6 +424,10 @@ def test_backslash json = '["\\\\.(?i:gif|jpe?g|png)$"]' assert_equal json, generate(data) # + data = [ '\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)$' ] + json = '["\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)$"]' + assert_equal json, generate(data) + # data = [ '\\"' ] json = '["\\\\\""]' assert_equal json, generate(data) @@ -432,10 +436,22 @@ def test_backslash json = '["/"]' assert_equal json, generate(data) # + data = [ '////////////////////////////////////////////////////////////////////////////////////' ] + json = '["////////////////////////////////////////////////////////////////////////////////////"]' + assert_equal json, generate(data) + # data = [ '/' ] json = '["\/"]' assert_equal json, generate(data, :script_safe => true) # + data = [ '///////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # + data = [ '///////////////////////////////////////////////////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "\u2028\u2029" ] json = '["\u2028\u2029"]' assert_equal json, generate(data, :script_safe => true) @@ -444,10 +460,18 @@ def test_backslash json = '["ABC \u2028 DEF \u2029 GHI"]' assert_equal json, generate(data, :script_safe => true) # + data = [ "ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI" ] + json = '["ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "/\u2028\u2029" ] json = '["\/\u2028\u2029"]' assert_equal json, generate(data, :escape_slash => true) # + data = [ "/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029" ] + json = '["\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029"]' + assert_equal json, generate(data, :escape_slash => true) + # data = ['"'] json = '["\""]' assert_equal json, generate(data) @@ -459,6 +483,14 @@ def test_backslash data = ["倩", "瀨"] json = '["倩","瀨"]' assert_equal json, generate(data, script_safe: true) + # + data = ["倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨"] + json = '["倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨"]' + assert_equal json, generate(data, script_safe: true) + # + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal json, generate(data) end def test_string_subclass