diff --git a/.gitignore b/.gitignore
index 8ae6ac11..f5a342d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ Gemfile.lock
 .DS_Store
 */**/Makefile
 */**/*.o
+*/**/extconf.h
 */**/*.class
 */**/*.jar
 .byebug_history
diff --git a/Rakefile b/Rakefile
index 1e68d2ae..8f2f97f1 100644
--- a/Rakefile
+++ b/Rakefile
@@ -86,7 +86,7 @@ end
 
 file EXT_GENERATOR_DL => EXT_GENERATOR_SRC do
   cd EXT_GENERATOR_DIR do
-    ruby 'extconf.rb'
+    ruby "extconf.rb #{ENV['JSON_GENERATOR_CONFIGURE_OPTS']}"
     sh MAKE
   end
   cp "#{EXT_GENERATOR_DIR}/generator.#{CONFIG['DLEXT']}", EXT_ROOT_DIR
diff --git a/benchmark/encoder-simple.rb b/benchmark/encoder-simple.rb
new file mode 100644
index 00000000..cf3d380c
--- /dev/null
+++ b/benchmark/encoder-simple.rb
@@ -0,0 +1,58 @@
+require "benchmark/ips"
+require "json"
+require "date"
+require "oj"
+
+Oj.default_options = Oj.default_options.merge(mode: :compat)
+
+if ENV["ONLY"]
+  RUN = ENV["ONLY"].split(/[,: ]/).map{|x| [x.to_sym, true] }.to_h
+  RUN.default = false
+elsif ENV["EXCEPT"]
+  RUN = ENV["EXCEPT"].split(/[,: ]/).map{|x| [x.to_sym, false] }.to_h
+  RUN.default = true
+else
+  RUN = Hash.new(true)
+end
+
+def implementations(ruby_obj)
+  state = JSON::State.new(JSON.dump_default_options)
+  {
+    json: ["json", proc { JSON.generate(ruby_obj) }],
+    oj: ["oj", proc { Oj.dump(ruby_obj) }],
+  }
+end
+
+# Benchmarks each enabled implementation encoding ruby_obj and prints a comparison.
+# Keys in `except` or disabled via RUN are skipped, as is any mismatching/raising encoder.
+def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [])
+  expected = JSON.dump(ruby_obj) # encoded once: sizes the banner and serves as the reference
+  puts "== Encoding #{benchmark_name} (#{expected.bytesize} bytes)"
+
+  impls = implementations(ruby_obj).select { |key, _| RUN[key] && !except.include?(key) }
+
+  Benchmark.ips do |x|
+    impls.each_value do |name, block|
+      begin
+        result = block.call
+        if check_expected && expected != result
+          puts "#{name} does not match expected output. Skipping"
+          puts "Expected:" + '-' * 40
+          puts expected
+          puts "Actual:" + '-' * 40
+          puts result
+          puts '-' * 40
+          next
+        end
+      rescue => error
+        puts "#{name} unsupported (#{error})"
+        next
+      end
+      x.report(name, &block)
+    end
+    x.compare!(order: :baseline)
+  end
+  puts
+end
+
+benchmark_encoding "long string", (["this is a test of the emergency broadcast system."*5]*500)
\ No newline at end of file
diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb
index 078068cf..5ed26fda 100644
--- a/ext/json/ext/generator/extconf.rb
+++ b/ext/json/ext/generator/extconf.rb
@@ -6,5 +6,55 @@
 else
   append_cflags("-std=c99")
   $defs << "-DJSON_GENERATOR"
+
+  if enable_config('use-simd', default=true)
+    if RbConfig::CONFIG['host_cpu'] =~ /^(arm.*|aarch64.*)/
+      $defs.push("-DENABLE_SIMD")
+
+      # Try to compile a small program using NEON instructions
+      if have_header('arm_neon.h')
+        have_type('uint8x16_t', headers=['arm_neon.h']) && try_compile(<<~'SRC')
+          #include <arm_neon.h>
+          int main() {
+              uint8x16_t test = vdupq_n_u8(32);
+              return 0;
+          }
+        SRC
+
+        have_type('uint8x8_t', headers=['arm_neon.h']) && try_compile(<<~'SRC')
+            #include <arm_neon.h>
+            int main() {
+                uint8x8_t test = vdup_n_u8(32);
+                return 0;
+            }
+        SRC
+        end
+      elsif have_header('x86intrin.h')
+        
+        if have_type('__m256i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2')
+          #include <x86intrin.h>
+          int main() {
+              __m256i test = _mm256_set1_epi8(32);
+              return 0;
+          }
+          SRC
+          $defs.push("-DENABLE_SIMD")
+        end
+        
+        if have_type('__m128i', headers=['x86intrin.h']) && try_compile(<<~'SRC', opt='-mavx2')
+          #include <x86intrin.h>
+          int main() {
+              __m128i test = _mm_set1_epi8(32);
+              return 0;
+          }
+          SRC
+            $defs.push("-DENABLE_SIMD") unless $defs.include?('-DENABLE_SIMD')
+        end
+      end
+
+      have_header('cpuid.h')
+  end
+
+  create_header
   create_makefile 'json/ext/generator'
 end
diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index 5006b785..745a3ce1 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -4,6 +4,9 @@
 #include <math.h>
 #include <ctype.h>
 
+#include "extconf.h"
+#include "simd.h"
+
 /* ruby api and some helpers */
 
 typedef struct JSON_Generator_StateStruct {
@@ -33,6 +36,11 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_e
 static ID sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_nl, sym_max_nesting, sym_allow_nan,
           sym_ascii_only, sym_depth, sym_buffer_initial_length, sym_script_safe, sym_escape_slash, sym_strict;
 
+static void (*convert_UTF8_to_JSON_impl)(FBuffer *, VALUE, const unsigned char escape_table[256]);
+
+#ifdef ENABLE_SIMD
+static void (*convert_UTF8_to_JSON_simd_kernel)(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]);
+#endif
 
 #define GET_STATE_TO(self, state) \
     TypedData_Get_Struct(self, JSON_Generator_State, &JSON_Generator_State_type, state)
@@ -179,7 +187,57 @@ static const unsigned char script_safe_escape_table[256] = {
  * Everything else (should be UTF-8) is just passed through and
  * appended to the result.
  */
-static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
+#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; /* append pending run ptr[beg..pos), then skip `bytes` input bytes */
+#define PROCESS_BYTE if (RB_UNLIKELY(ch_len)) { /* uses locals: ch, ch_len, ptr, pos, beg, out_buffer, scratch, hexdig */ \
+                switch (ch_len) { \
+                    case 9: { /* single byte that must be escaped */ \
+                        FLUSH_POS(1); \
+                        switch (ch) { \
+                            case '"':  fbuffer_append(out_buffer, "\\\"", 2); break; \
+                            case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; \
+                            case '/':  fbuffer_append(out_buffer, "\\/", 2); break; \
+                            case '\b': fbuffer_append(out_buffer, "\\b", 2); break; \
+                            case '\f': fbuffer_append(out_buffer, "\\f", 2); break; \
+                            case '\n': fbuffer_append(out_buffer, "\\n", 2); break; \
+                            case '\r': fbuffer_append(out_buffer, "\\r", 2); break; \
+                            case '\t': fbuffer_append(out_buffer, "\\t", 2); break; \
+                            default: { /* no short form: emit a six-character u00XX escape built in scratch */ \
+                                scratch[2] = '0'; \
+                                scratch[3] = '0'; \
+                                scratch[4] = hexdig[(ch >> 4) & 0xf]; \
+                                scratch[5] = hexdig[ch & 0xf]; \
+                                fbuffer_append(out_buffer, scratch, 6); \
+                                break; \
+                            } \
+                        } \
+                        break; \
+                    } \
+                    case 11: { /* 3-byte sequence that may be U+2028 / U+2029 */ \
+                        unsigned char b2 = ptr[pos + 1]; \
+                        if (RB_UNLIKELY(b2 == 0x80)) { \
+                            unsigned char b3 = ptr[pos + 2]; \
+                            if (b3 == 0xA8) { \
+                                FLUSH_POS(3); \
+                                fbuffer_append(out_buffer, "\\u2028", 6); \
+                                break; \
+                            } else if (b3 == 0xA9) { \
+                                FLUSH_POS(3); \
+                                fbuffer_append(out_buffer, "\\u2029", 6); \
+                                break; \
+                            } \
+                        } \
+                        ch_len = 3; /* fallthrough: any other 3-byte sequence is skipped like a plain multi-byte character */ \
+                    } \
+                    default: /* multi-byte character: stays in the pending run, just advance past it */ \
+                        pos += ch_len; \
+                        break; \
+                } \
+            } else { \
+                pos++; \
+            }
+
+#ifdef ENABLE_SIMD
+static void convert_UTF8_to_JSON_simd(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
 {
     const char *hexdig = "0123456789abcdef";
     char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -189,63 +247,462 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un
 
     unsigned long beg = 0, pos = 0;
 
-#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
-
+    convert_UTF8_to_JSON_simd_kernel(out_buffer, ptr, len, &beg, &pos, hexdig, scratch, escape_table);
+    
     while (pos < len) {
         unsigned char ch = ptr[pos];
         unsigned char ch_len = escape_table[ch];
-        /* JSON encoding */
+        PROCESS_BYTE;
+    }
 
-        if (RB_UNLIKELY(ch_len)) {
-            switch (ch_len) {
-                case 9: {
-                    FLUSH_POS(1);
-                    switch (ch) {
-                        case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
-                        case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
-                        case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
-                        case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
-                        case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
-                        case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
-                        case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
-                        case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
-                        default: {
-                            scratch[2] = '0';
-                            scratch[3] = '0';
-                            scratch[4] = hexdig[(ch >> 4) & 0xf];
-                            scratch[5] = hexdig[ch & 0xf];
-                            fbuffer_append(out_buffer, scratch, 6);
-                            break;
-                        }
-                    }
-                    break;
+    if (beg < len) {
+        fbuffer_append(out_buffer, &ptr[beg], len - beg);
+    }
+
+    RB_GC_GUARD(str);
+}
+#endif 
+
+#ifdef HAVE_SIMD_NEON
+
+void convert_UTF8_to_JSON_simd_kernel_neon(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) {
+    unsigned long beg = *_beg, pos = *_pos;
+        
+    const uint8x16_t lower_bound = vdupq_n_u8(' '); 
+    const uint8x16_t backslash   = vdupq_n_u8('\\');
+    const uint8x16_t dblquote    = vdupq_n_u8('\"');
+
+    if (escape_table == script_safe_escape_table) {
+        /*
+        * This works almost exactly the same as the non-script-safe branch described below. The difference comes
+        * after we know there is a byte to be escaped. In the other branch, all such bytes are handled the same way.
+        * In this case, however, some bytes need to be handled differently.
+        *
+        * Since each byte in chunk can only match a single case, we logical AND each of too_low, has_backslash,
+        * has_dblquote, and has_forward_slash with a different bit (0x1, 0x2, 0x4 and 0x8 respectively) and combine
+        * the results with a logical OR.
+        *
+        * Now we loop over the result vector and switch on the particular pattern we just created. If we find a
+        * case we don't know, we simply look up the byte in the script_safe_escape_table to determine the correct
+        * action.
+        */
+        const uint8x16_t upper_bound     = vdupq_n_u8('~');
+        const uint8x16_t forward_slash   = vdupq_n_u8('/');
+
+        while (pos+16 < len) {
+            uint8x16_t chunk             = vld1q_u8((const uint8_t*)&ptr[pos]);
+            uint8x16_t too_low           = vcltq_u8(chunk, lower_bound);
+            uint8x16_t too_high          = vcgtq_u8(chunk, upper_bound);
+
+            uint8x16_t has_backslash     = vceqq_u8(chunk, backslash);
+            uint8x16_t has_dblquote      = vceqq_u8(chunk, dblquote);
+            uint8x16_t has_forward_slash = vceqq_u8(chunk, forward_slash);
+
+            uint8x16_t needs_escape      = vorrq_u8(too_low, too_high);
+            uint8x16_t has_escaped_char  = vorrq_u8(has_forward_slash, vorrq_u8(has_backslash, has_dblquote));
+            needs_escape                 = vorrq_u8(needs_escape, has_escaped_char);
+
+            if (vmaxvq_u8(needs_escape) == 0) {
+                pos += 16;
+                continue;
+            }
+
+            uint8x16_t tmp = vandq_u8(too_low, vdupq_n_u8(0x1));
+            tmp = vorrq_u8(tmp, vandq_u8(has_backslash, vdupq_n_u8(0x2)));
+            tmp = vorrq_u8(tmp, vandq_u8(has_dblquote, vdupq_n_u8(0x4)));
+            tmp = vorrq_u8(tmp, vandq_u8(has_forward_slash, vdupq_n_u8(0x8)));
+
+            uint8_t arr[16];
+            vst1q_u8(arr, tmp);
+            
+            for (int i = 0; i < 16; ) {
+                unsigned long start = pos;
+                unsigned char ch = ptr[pos];
+                unsigned char ch_len = arr[i];
+                switch(ch_len) {
+                    case 0x1:
+                    case 0x2:
+                    case 0x4:
+                    case 0x8:
+                        ch_len = 9;
+                        break;
+                    default:
+                        ch_len = script_safe_escape_table[ch];
                 }
-                case 11: {
-                    unsigned char b2 = ptr[pos + 1];
-                    if (RB_UNLIKELY(b2 == 0x80)) {
-                        unsigned char b3 = ptr[pos + 2];
-                        if (b3 == 0xA8) {
-                            FLUSH_POS(3);
-                            fbuffer_append(out_buffer, "\\u2028", 6);
-                            break;
-                        } else if (b3 == 0xA9) {
-                            FLUSH_POS(3);
-                            fbuffer_append(out_buffer, "\\u2029", 6);
-                            break;
-                        }
-                    }
-                    ch_len = 3;
-                    // fallthrough
+                // This must remain in sync with the array `escape_table`.
+                if (RB_UNLIKELY(ch_len)) {
+                    PROCESS_BYTE;
+                } else {
+                    pos++;
+                }
+
+                i += (pos - start);
+            }
+        }
+    } else {
+        /*
+        * The code below implements an SIMD-based algorithm to determine if N bytes at a time
+        * need to be escaped. 
+        * 
+        * Assume the ptr = "Te\sting!" (the double quotes are included in the string)
+        * 
+        * The explanation will be limited to the first 8 bytes of the string for simplicity. However
+        * the vector instructions may work on larger vectors.
+        *
+        * First, we load three constants 'lower_bound', 'backslash' and 'dblquote' in vector registers.
+        * 
+        * lower_bound: [20 20 20 20 20 20 20 20] 
+        * backslash:   [5C 5C 5C 5C 5C 5C 5C 5C] 
+        * dblquote:    [22 22 22 22 22 22 22 22] 
+        * 
+        * Next we load the first chunk of the ptr: 
+        * [22 54 65 5C 73 74 69 6E] ("  T  e  \  s  t  i  n)
+        * 
+        * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector
+        * as no bytes are less than 32 (0x20):
+        * [0 0 0 0 0 0 0 0]
+        * 
+        * Next, we check if any byte in chunk is equal to a backslash:
+        * [0 0 0 FF 0 0 0 0]
+        * 
+        * Finally we check if any byte in chunk is equal to a double quote:
+        * [FF 0 0 0 0 0 0 0] 
+        * 
+        * Now we have three vectors where each byte indicates if the corresponding byte in chunk
+        * needs to be escaped. We combine these vectors with a series of logical OR instructions.
+        * This is the needs_escape vector and it is equal to:
+        * [FF 0 0 FF 0 0 0 0] 
+        * 
+        * For ARM Neon specifically, we check if the maximum number in the vector is 0. The maximum of
+        * the needs_escape vector is FF. Therefore, we know there is at least one byte that needs to be
+        * escaped.
+        * 
+        * If the maximum of the needs_escape vector is 0, none of the bytes need to be escaped and
+        * we advance pos by the width of the vector.
+        * 
+        * To determine how to escape characters, we look at each value in the needs_escape vector and take
+        * the appropriate action.
+        */
+        while (pos+16 < len) {
+            uint8x16_t chunk         = vld1q_u8((const uint8_t*)&ptr[pos]);
+            uint8x16_t too_low       = vcltq_u8(chunk, lower_bound);
+            uint8x16_t has_backslash = vceqq_u8(chunk, backslash);
+            uint8x16_t has_dblquote  = vceqq_u8(chunk, dblquote);
+            uint8x16_t needs_escape  = vorrq_u8(too_low, vorrq_u8(has_backslash, has_dblquote));
+
+            if (vmaxvq_u8(needs_escape) == 0) {
+                pos += 16;
+                continue;
+            }
+
+            /*
+            * TODO Consider making another type simd_vec_mask. The reason being on x86 we can use _mm_movemask_epi8
+            * to get a mask rather than storing the vector to memory. 
+            * 
+            * We would need another function like simd_vec_mask_position_set(mask, pos) which returns true
+            * if the bit/byte (implementation defined) at position 'pos' is non-zero.
+            */
+
+            uint8_t arr[16];
+            vst1q_u8(arr, needs_escape);
+
+            for (int i = 0; i < 16; i++) {
+                unsigned char ch = ptr[pos];
+                unsigned char ch_len = arr[i];
+                
+                // This must remain in sync with the array `escape_table`.
+                if (RB_UNLIKELY(ch_len)) {
+                    ch_len = 9;
+                    PROCESS_BYTE;
+                } else {
+                    pos++;
                 }
-                default:
-                    pos += ch_len;
-                    break;
             }
-        } else {
-            pos++;
         }
     }
-#undef FLUSH_POS
+
+    *_beg = beg;
+    *_pos = pos;
+}
+
+#endif /* HAVE_SIMD_NEON */
+
+#ifdef HAVE_SIMD_X86_64
+
+#ifdef HAVE_TYPE___M128I
+#ifdef __GNUC__
+#pragma GCC push_options
+#pragma GCC target ("sse4")
+#endif /* __GNUC__ */
+
+#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
+#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
+#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
+#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
+
+#ifdef __clang__
+__attribute__((target("sse4.2")))
+#endif /* __clang__ */
+void convert_UTF8_to_JSON_simd_kernel_sse42(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) {
+    unsigned long beg = *_beg, pos = *_pos;
+    /* Consumes 16-byte chunks while more than 16 bytes remain; beg/pos are written back through _beg/_pos at the end. */
+    if (escape_table == script_safe_escape_table) {
+        /*
+        * Script-safe variant, a straight port of the ARM Neon version: '/' and bytes above '~' also force the slow path.
+        */
+        const __m128i lower_bound     = _mm_set1_epi8(' ');
+        const __m128i upper_bound     = _mm_set1_epi8('~');
+        const __m128i backslash       = _mm_set1_epi8('\\');
+        const __m128i dblquote        = _mm_set1_epi8('\"');
+        const __m128i forward_slash   = _mm_set1_epi8('/');
+
+        while (pos+16 < len) {
+            __m128i chunk             = _mm_loadu_si128((__m128i const*)&ptr[pos]);
+            __m128i too_low           = _mm_cmplt_epu8(chunk, lower_bound);
+            __m128i too_high          = _mm_cmpgt_epu8(chunk, upper_bound);
+
+            __m128i has_backslash     = _mm_cmpeq_epi8(chunk, backslash);
+            __m128i has_dblquote      = _mm_cmpeq_epi8(chunk, dblquote);
+            __m128i has_forward_slash = _mm_cmpeq_epi8(chunk, forward_slash);
+
+            __m128i needs_escape      = _mm_or_si128(too_low, too_high);
+            __m128i has_escaped_char  = _mm_or_si128(has_forward_slash, _mm_or_si128(has_backslash, has_dblquote));
+            needs_escape              = _mm_or_si128(needs_escape, has_escaped_char);
+
+            int needs_escape_mask     = _mm_movemask_epi8(needs_escape);
+            if (needs_escape_mask == 0) { // no byte in this chunk matched: skip it whole
+                pos += 16;
+                continue;
+            }
+            /* Tag each byte with the class it matched: 0x1 below ' ', 0x2 backslash, 0x4 double quote, 0x8 forward slash. */
+            __m128i tmp = _mm_and_si128(too_low, _mm_set1_epi8(0x1));
+            tmp = _mm_or_si128(tmp, _mm_and_si128(has_backslash, _mm_set1_epi8(0x2)));
+            tmp = _mm_or_si128(tmp, _mm_and_si128(has_dblquote, _mm_set1_epi8(0x4)));
+            tmp = _mm_or_si128(tmp, _mm_and_si128(has_forward_slash, _mm_set1_epi8(0x8)));
+
+            uint8_t arr[16];
+            _mm_storeu_si128((__m128i *) arr, tmp);
+
+            for (int i = 0; i < 16; ) { // i advances by however many bytes each step consumed
+                unsigned long start = pos;
+                unsigned char ch = ptr[pos];
+                unsigned char ch_len = arr[i];
+                switch(ch_len) {
+                    case 0x1:
+                    case 0x2:
+                    case 0x4:
+                    case 0x8:
+                        ch_len = 9;
+                        break;
+                    default: // untagged byte (including bytes above '~'): let the table decide
+                        ch_len = script_safe_escape_table[ch];
+                }
+                // This must remain in sync with the array `escape_table`.
+                if (RB_UNLIKELY(ch_len)) {
+                    PROCESS_BYTE;
+                } else {
+                    pos++;
+                }
+
+                i += (pos - start);
+            }
+        }
+    } else {
+        /*
+        * This is a straight port of the ARM Neon implementation to SSE4. This is
+        * likely not optimal for this instruction set. There are likely table lookup,
+        * shuffle, gather, blend, etc. instructions that may perform significantly
+        * better than what is implemented here.
+        */
+
+        const __m128i lower_bound = _mm_set1_epi8(' ');
+        const __m128i backslash   = _mm_set1_epi8('\\');
+        const __m128i dblquote    = _mm_set1_epi8('\"');
+
+        while (pos+16 < len) {
+            __m128i chunk         = _mm_loadu_si128((__m128i const*)&ptr[pos]);
+            __m128i too_low       = _mm_cmplt_epu8(chunk, lower_bound);
+            __m128i has_backslash = _mm_cmpeq_epi8(chunk, backslash);
+            __m128i has_dblquote  = _mm_cmpeq_epi8(chunk, dblquote);
+            __m128i needs_escape  = _mm_or_si128(too_low, _mm_or_si128(has_backslash, has_dblquote));
+
+            int needs_escape_mask = _mm_movemask_epi8(needs_escape);
+
+            if (needs_escape_mask == 0) {
+                pos += 16;
+                continue;
+            }
+
+            for (int i = 0; i < 16; i++) {
+                int bit = needs_escape_mask & (1 << i); // non-zero when byte i of the chunk matched
+                unsigned char ch = ptr[pos];
+                unsigned char ch_len = 0;
+
+                // This must remain in sync with the array `escape_table`.
+                if (RB_UNLIKELY(bit)) {
+                    ch_len = 9;
+                    PROCESS_BYTE;
+                } else {
+                    pos++;
+                }
+            }
+        }
+    }
+
+    *_beg = beg;
+    *_pos = pos;
+}
+
+#ifdef __GNUC__
+#pragma GCC pop_options
+#endif /* __GNUC__ */
+#endif /* HAVE_TYPE___M128I */
+
+#ifdef HAVE_TYPE___M256I
+#ifdef __GNUC__
+#pragma GCC push_options
+#pragma GCC target ("avx2")
+#endif /* __GNUC__ */
+
+#define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a)
+#define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a)
+#define _mm256_cmpgt_epu8(a, b) _mm256_xor_si256(_mm256_cmple_epu8(a, b), _mm256_set1_epi8(-1))
+#define _mm256_cmplt_epu8(a, b) _mm256_cmpgt_epu8(b, a)
+
+#ifdef __clang__
+__attribute__((target("avx2")))
+#endif /* __clang__ */
+/* AVX2 kernel: scans ptr from *_pos in 32-byte chunks while more than 32 bytes remain.
+ * Clean chunks are skipped whole; others go byte-by-byte through PROCESS_BYTE. The updated
+ * beg/pos are written back through _beg/_pos so the caller can finish the remaining tail.
+ */
+void convert_UTF8_to_JSON_simd_kernel_avx2(FBuffer *out_buffer, const char * ptr, unsigned long len, unsigned long *_beg, unsigned long *_pos, const char *hexdig, char scratch[12], const unsigned char escape_table[256]) {
+    unsigned long beg = *_beg, pos = *_pos;
+
+    const __m256i lower_bound = _mm256_set1_epi8(' ');
+    const __m256i backslash   = _mm256_set1_epi8('\\');
+    const __m256i dblquote    = _mm256_set1_epi8('\"');
+
+    if (escape_table == script_safe_escape_table) {
+        /*
+        * Script-safe variant (port of the ARM Neon version): '/' and bytes above '~' also trigger the slow path.
+        */
+        const __m256i upper_bound     = _mm256_set1_epi8('~');
+        const __m256i forward_slash   = _mm256_set1_epi8('/');
+
+        while (pos+32 < len) {
+            __m256i chunk             = _mm256_loadu_si256((__m256i const*)&ptr[pos]);
+            __m256i too_low           = _mm256_cmplt_epu8(chunk, lower_bound);
+            __m256i too_high          = _mm256_cmpgt_epu8(chunk, upper_bound);
+
+            __m256i has_backslash     = _mm256_cmpeq_epi8(chunk, backslash);
+            __m256i has_dblquote      = _mm256_cmpeq_epi8(chunk, dblquote);
+            __m256i has_forward_slash = _mm256_cmpeq_epi8(chunk, forward_slash);
+
+            __m256i needs_escape      = _mm256_or_si256(too_low, too_high);
+            __m256i has_escaped_char  = _mm256_or_si256(has_forward_slash, _mm256_or_si256(has_backslash, has_dblquote));
+            needs_escape              = _mm256_or_si256(needs_escape, has_escaped_char);
+
+            int needs_escape_mask     = _mm256_movemask_epi8(needs_escape);
+            if (needs_escape_mask == 0) {
+                pos += 32;
+                continue;
+            }
+
+            /* Tag each byte with the class it matched: 0x1 below ' ', 0x2 backslash, 0x4 double quote, 0x8 forward slash. */
+            __m256i tmp = _mm256_and_si256(too_low, _mm256_set1_epi8(0x1));
+            tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_backslash, _mm256_set1_epi8(0x2)));
+            tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_dblquote, _mm256_set1_epi8(0x4)));
+            tmp = _mm256_or_si256(tmp, _mm256_and_si256(has_forward_slash, _mm256_set1_epi8(0x8)));
+            uint8_t arr[32];
+            _mm256_storeu_si256((__m256i *) arr, tmp);
+
+            for (int i = 0; i < 32; ) { /* i advances by however many bytes each step consumed */
+                unsigned long start = pos;
+                unsigned char ch = ptr[pos];
+                unsigned char ch_len = arr[i];
+                switch(ch_len) {
+                    case 0x1:
+                    case 0x2:
+                    case 0x4:
+                    case 0x8:
+                        ch_len = 9;
+                        break;
+                    default: /* untagged byte (including bytes above '~'): let the table decide */
+                        ch_len = script_safe_escape_table[ch];
+                }
+                // This must remain in sync with the array `escape_table`.
+                if (RB_UNLIKELY(ch_len)) {
+                    PROCESS_BYTE;
+                } else {
+                    pos++;
+                }
+
+                i += (pos - start);
+            }
+        }
+    } else {
+        /*
+        * Straight port of the ARM Neon implementation to AVX2; table lookup, shuffle or
+        * blend instructions may well perform better than this on x86.
+        */
+        while (pos+32 < len) {
+            __m256i chunk         = _mm256_loadu_si256((__m256i const*)&ptr[pos]);
+            __m256i too_low       = _mm256_cmplt_epu8(chunk, lower_bound);
+            __m256i has_backslash = _mm256_cmpeq_epi8(chunk, backslash);
+            __m256i has_dblquote  = _mm256_cmpeq_epi8(chunk, dblquote);
+            __m256i needs_escape  = _mm256_or_si256(too_low, _mm256_or_si256(has_backslash, has_dblquote));
+            /* Keep the mask unsigned: bit 31 is tested below and 1 << 31 overflows a signed int. */
+            unsigned int needs_escape_mask = (unsigned int)_mm256_movemask_epi8(needs_escape);
+
+            if (needs_escape_mask == 0) {
+                pos += 32;
+                continue;
+            }
+
+            for (int i = 0; i < 32; i++) {
+                unsigned int bit = needs_escape_mask & (1u << i); /* non-zero when byte i of the chunk matched */
+                unsigned char ch = ptr[pos];
+                unsigned char ch_len = 0;
+                // This must remain in sync with the array `escape_table`.
+                if (RB_UNLIKELY(bit)) {
+                    ch_len = 9;
+                    PROCESS_BYTE;
+                } else {
+                    pos++;
+                }
+            }
+        }
+    }
+    *_beg = beg;
+    *_pos = pos;
+}
+
+#ifdef __GNUC__
+#pragma GCC pop_options
+#endif /* __GNUC__ */
+
+#endif /* HAVE_TYPE___M256I */
+
+#endif /* x86_64 support */
+
+
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
+{
+    const char *hexdig = "0123456789abcdef";
+    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
+
+    const char *ptr = RSTRING_PTR(str);
+    unsigned long len = RSTRING_LEN(str);
+
+    unsigned long beg = 0, pos = 0;
+
+    while (pos < len) {
+        unsigned char ch = ptr[pos];
+        unsigned char ch_len = escape_table[ch];
+        /* JSON encoding */
+
+        PROCESS_BYTE;
+    }
 
     if (beg < len) {
         fbuffer_append(out_buffer, &ptr[beg], len - beg);
@@ -254,6 +711,8 @@ static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const un
     RB_GC_GUARD(str);
 }
 
+#undef PROCESS_BYTE
+
 static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
 {
     const char *hexdig = "0123456789abcdef";
@@ -264,91 +723,88 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
 
     unsigned long beg = 0, pos = 0;
 
-#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
-
     while (pos < len) {
         unsigned char ch = ptr[pos];
         unsigned char ch_len = escape_table[ch];
 
-        if (RB_UNLIKELY(ch_len)) {
-            switch (ch_len) {
-                case 9: {
-                    FLUSH_POS(1);
-                    switch (ch) {
-                        case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
-                        case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
-                        case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
-                        case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
-                        case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
-                        case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
-                        case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
-                        case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
-                        default: {
-                            scratch[2] = '0';
-                            scratch[3] = '0';
-                            scratch[4] = hexdig[(ch >> 4) & 0xf];
-                            scratch[5] = hexdig[ch & 0xf];
-                            fbuffer_append(out_buffer, scratch, 6);
-                            break;
-                        }
-                    }
-                    break;
-                }
-                default: {
-                    uint32_t wchar = 0;
-                    ch_len = ch_len & CHAR_LENGTH_MASK;
-
-                    switch(ch_len) {
-                        case 2:
-                            wchar = ptr[pos] & 0x1F;
-                            break;
-                        case 3:
-                            wchar = ptr[pos] & 0x0F;
-                            break;
-                        case 4:
-                            wchar = ptr[pos] & 0x07;
-                            break;
-                    }
-
-                    for (short i = 1; i < ch_len; i++) {
-                        wchar = (wchar << 6) | (ptr[pos+i] & 0x3F);
-                    }
-
-                    FLUSH_POS(ch_len);
-
-                    if (wchar <= 0xFFFF) {
-                        scratch[2] = hexdig[wchar >> 12];
-                        scratch[3] = hexdig[(wchar >> 8) & 0xf];
-                        scratch[4] = hexdig[(wchar >> 4) & 0xf];
-                        scratch[5] = hexdig[wchar & 0xf];
-                        fbuffer_append(out_buffer, scratch, 6);
-                    } else {
-                        uint16_t hi, lo;
-                        wchar -= 0x10000;
-                        hi = 0xD800 + (uint16_t)(wchar >> 10);
-                        lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
-
-                        scratch[2] = hexdig[hi >> 12];
-                        scratch[3] = hexdig[(hi >> 8) & 0xf];
-                        scratch[4] = hexdig[(hi >> 4) & 0xf];
-                        scratch[5] = hexdig[hi & 0xf];
-
-                        scratch[8] = hexdig[lo >> 12];
-                        scratch[9] = hexdig[(lo >> 8) & 0xf];
-                        scratch[10] = hexdig[(lo >> 4) & 0xf];
-                        scratch[11] = hexdig[lo & 0xf];
-
-                        fbuffer_append(out_buffer, scratch, 12);
-                    }
-
-                    break;
-                }
-            }
-        } else {
-            pos++;
-        }
+        if (RB_UNLIKELY(ch_len)) { /* non-zero table entry: this byte cannot be copied through as-is */
+            switch (ch_len) {
+                case 9: { /* NOTE(review): 9 looks like the table's marker for "escape this single byte" -- confirm against the escape tables */
+                    FLUSH_POS(1); /* defined outside this hunk; the definition removed above appended ptr[beg..pos) and then moved pos and beg past the byte */
+                    switch (ch) {
+                        case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
+                        case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+                        case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
+                        case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+                        case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+                        case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+                        case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+                        case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
+                        default: { /* no short form: emit a 6-character unicode escape with "00" as the high digits */
+                            scratch[2] = '0'; /* scratch[0..1] are assumed to hold the escape prefix already; its initializer is outside this hunk */
+                            scratch[3] = '0';
+                            scratch[4] = hexdig[(ch >> 4) & 0xf];
+                            scratch[5] = hexdig[ch & 0xf];
+                            fbuffer_append(out_buffer, scratch, 6);
+                            break;
+                        }
+                    }
+                    break;
+                }
+                default: { /* multi-byte sequence: decode the code point, then write it as unicode escape(s) */
+                    uint32_t wchar = 0;
+                    ch_len = ch_len & CHAR_LENGTH_MASK; /* keep only the bits selected by CHAR_LENGTH_MASK (defined elsewhere) */
+                    /* Lead byte payload: low 5, 4 or 3 bits for a 2-, 3- or 4-byte sequence. */
+                    switch(ch_len) {
+                        case 2:
+                            wchar = ptr[pos] & 0x1F;
+                            break;
+                        case 3:
+                            wchar = ptr[pos] & 0x0F;
+                            break;
+                        case 4:
+                            wchar = ptr[pos] & 0x07;
+                            break;
+                    }
+                    /* Each following byte contributes its low 6 bits. */
+                    for (short i = 1; i < ch_len; i++) {
+                        wchar = (wchar << 6) | (ptr[pos+i] & 0x3F);
+                    }
+
+                    FLUSH_POS(ch_len);
+
+                    if (wchar <= 0xFFFF) { /* fits in four hex digits: one 6-character escape */
+                        scratch[2] = hexdig[wchar >> 12];
+                        scratch[3] = hexdig[(wchar >> 8) & 0xf];
+                        scratch[4] = hexdig[(wchar >> 4) & 0xf];
+                        scratch[5] = hexdig[wchar & 0xf];
+                        fbuffer_append(out_buffer, scratch, 6);
+                    } else { /* above 0xFFFF: split into a UTF-16 surrogate pair and emit two escapes (12 characters) */
+                        uint16_t hi, lo;
+                        wchar -= 0x10000;
+                        hi = 0xD800 + (uint16_t)(wchar >> 10);
+                        lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
+
+                        scratch[2] = hexdig[hi >> 12];
+                        scratch[3] = hexdig[(hi >> 8) & 0xf];
+                        scratch[4] = hexdig[(hi >> 4) & 0xf];
+                        scratch[5] = hexdig[hi & 0xf];
+                        /* scratch[6..7] are assumed to hold the second escape prefix (initializer outside this hunk). */
+                        scratch[8] = hexdig[lo >> 12];
+                        scratch[9] = hexdig[(lo >> 8) & 0xf];
+                        scratch[10] = hexdig[(lo >> 4) & 0xf];
+                        scratch[11] = hexdig[lo & 0xf];
+
+                        fbuffer_append(out_buffer, scratch, 12);
+                    }
+
+                    break;
+                }
+            }
+        } else { /* ordinary byte: leave it in the pending run that a later flush copies out */
+            pos++;
+        }
     }
-#undef FLUSH_POS
 
     if (beg < len) {
         fbuffer_append(out_buffer, &ptr[beg], len - beg);
@@ -357,6 +813,8 @@ static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, cons
     RB_GC_GUARD(str);
 }
 
+#undef FLUSH_POS
+
 /*
  * Document-module: JSON::Ext::Generator
  *
@@ -912,7 +1370,7 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
             if (RB_UNLIKELY(state->ascii_only)) {
                 convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
             } else {
-                convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
+                convert_UTF8_to_JSON_impl(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
             }
             break;
         default:
@@ -1670,4 +2128,28 @@ void Init_generator(void)
     binary_encindex = rb_ascii8bit_encindex();
 
     rb_require("json/ext/generator/state");
-}
+
+    /* TODO: add runtime checks here? find_simd_implementation() (simd.h) already queries CPU features on x86-64. */
+    switch(find_simd_implementation()) {
+#ifdef HAVE_SIMD_NEON
+        case SIMD_NEON:
+            convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd;
+            convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_neon;
+            break;
+#endif
+#ifdef HAVE_SIMD_X86_64
+        case SIMD_SSE42:
+            convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd;
+            convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_sse42;
+            break;
+#ifdef HAVE_TYPE___M256I
+        case SIMD_AVX2:
+            convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON_simd;
+            convert_UTF8_to_JSON_simd_kernel = convert_UTF8_to_JSON_simd_kernel_avx2;
+            break;
+#endif /* HAVE_TYPE___M256I */
+#endif
+        default:
+            convert_UTF8_to_JSON_impl = convert_UTF8_to_JSON;
+    }
+}
\ No newline at end of file
diff --git a/ext/json/ext/generator/simd.h b/ext/json/ext/generator/simd.h
new file mode 100644
index 00000000..352c0b6f
--- /dev/null
+++ b/ext/json/ext/generator/simd.h
@@ -0,0 +1,102 @@
+#ifndef JSON_EXT_GENERATOR_SIMD_H
+#define JSON_EXT_GENERATOR_SIMD_H
+
+/*
+ * SIMD capability detection for the JSON generator.
+ *
+ * Provides the SIMD_Implementation enum and find_simd_implementation(),
+ * and defines HAVE_SIMD_NEON / HAVE_SIMD_X86_64 for the matching target so
+ * that including code can guard its architecture-specific parts.
+ *
+ * NOTE(review): find_simd_implementation() is defined in this header with
+ * external linkage, so the header is presumably meant to be included from
+ * a single translation unit -- confirm before including it elsewhere.
+ */
+
+#include "extconf.h"
+
+/* Implementations that find_simd_implementation() can report. */
+typedef enum {
+    SIMD_NONE,
+    SIMD_NEON,
+    SIMD_SSE42,
+    SIMD_AVX2
+} SIMD_Implementation;
+
+#ifdef ENABLE_SIMD
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+
+#define HAVE_SIMD_NEON 1
+#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
+
+/* ARM targets: NEON is chosen at compile time; nothing is probed at runtime. */
+SIMD_Implementation find_simd_implementation(void) {
+    return SIMD_NEON;
+}
+
+#endif /* ARM Neon Support.*/
+
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+
+/*
+ * NOTE(review): this is defined even when <x86intrin.h> is unavailable, in
+ * which case find_simd_implementation() falls back to SIMD_NONE. Confirm
+ * that code guarded by HAVE_SIMD_X86_64 still compiles in that case.
+ */
+#define HAVE_SIMD_X86_64 1
+
+#ifdef HAVE_X86INTRIN_H
+#include <x86intrin.h>
+
+#ifdef HAVE_CPUID_H
+#include <cpuid.h>
+#endif /* HAVE_CPUID_H */
+
+/*
+ * Set whenever the x86-64 definition below is compiled (not only when
+ * <cpuid.h> exists); otherwise the SIMD_NONE fallback at the end of this
+ * file would define find_simd_implementation() a second time.
+ */
+#define FIND_SIMD_IMPLEMENTATION_DEFINED 1
+
+/*
+ * x86-64 targets: with GCC or clang, asks the running CPU what it supports
+ * and returns SIMD_AVX2 (only when HAVE_TYPE___M256I is defined), else
+ * SIMD_SSE42, else SIMD_NONE. Other compilers always get SIMD_NONE.
+ */
+SIMD_Implementation find_simd_implementation(void) {
+
+#if defined(__GNUC__ ) || defined(__clang__)
+#ifdef __GNUC__
+    __builtin_cpu_init();
+#endif /* __GNUC__  */
+
+#ifdef HAVE_TYPE___M256I
+    if(__builtin_cpu_supports("avx2")) {
+        return SIMD_AVX2;
+    }
+#endif /* #ifdef HAVE_TYPE___M256I */
+
+    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
+    if (__builtin_cpu_supports("sse4.2")) {
+        return SIMD_SSE42;
+    }
+#endif /* __GNUC__ || __clang__*/
+
+    return SIMD_NONE;
+}
+
+#endif /* HAVE_X86INTRIN_H */
+#endif /* X86_64 Support */
+#endif /* ENABLE_SIMD */
+
+/* Fallback when no architecture-specific definition was compiled above. */
+#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
+SIMD_Implementation find_simd_implementation(void) {
+    return SIMD_NONE;
+}
+#endif
+
+#endif /* JSON_EXT_GENERATOR_SIMD_H */
diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb
index 8dd3913d..c2156325 100755
--- a/test/json/json_generator_test.rb
+++ b/test/json/json_generator_test.rb
@@ -424,6 +424,10 @@ def test_backslash
     json = '["\\\\.(?i:gif|jpe?g|png)$"]'
     assert_equal json, generate(data)
     #
+    data = [ '\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)\\.(?i:gif|jpe?g|png)$' ]
+    json = '["\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)\\\\.(?i:gif|jpe?g|png)$"]'
+    assert_equal json, generate(data)
+    #
     data = [ '\\"' ]
     json = '["\\\\\""]'
     assert_equal json, generate(data)
@@ -432,10 +436,22 @@ def test_backslash
     json = '["/"]'
     assert_equal json, generate(data)
     #
+    data = [ '////////////////////////////////////////////////////////////////////////////////////' ]
+    json = '["////////////////////////////////////////////////////////////////////////////////////"]'
+    assert_equal json, generate(data)
+    #
     data = [ '/' ]
     json = '["\/"]'
     assert_equal json, generate(data, :script_safe => true)
     #
+    data = [ '///////////' ]
+    json = '["\/\/\/\/\/\/\/\/\/\/\/"]'
+    assert_equal json, generate(data, :script_safe => true)
+    #
+    data = [ '///////////////////////////////////////////////////////' ]
+    json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]'
+    assert_equal json, generate(data, :script_safe => true)
+    #
     data = [ "\u2028\u2029" ]
     json = '["\u2028\u2029"]'
     assert_equal json, generate(data, :script_safe => true)
@@ -444,10 +460,18 @@ def test_backslash
     json = '["ABC \u2028 DEF \u2029 GHI"]'
     assert_equal json, generate(data, :script_safe => true)
     #
+    data = [ "ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI" ]
+    json = '["ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI ABC \u2028 DEF \u2029 GHI"]'
+    assert_equal json, generate(data, :script_safe => true)
+    #
     data = [ "/\u2028\u2029" ]
     json = '["\/\u2028\u2029"]'
     assert_equal json, generate(data, :escape_slash => true)
     #
+    data = [ "/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029/\u2028\u2029" ]
+    json = '["\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029\/\u2028\u2029"]'
+    assert_equal json, generate(data, :escape_slash => true)
+    #
     data = ['"']
     json = '["\""]'
     assert_equal json, generate(data)
@@ -459,6 +483,14 @@ def test_backslash
     data = ["倩", "瀨"]
     json = '["倩","瀨"]'
     assert_equal json, generate(data, script_safe: true)
+    #
+    data = ["倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨", "倩", "瀨"]
+    json = '["倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨","倩","瀨"]'
+    assert_equal json, generate(data, script_safe: true)
+    #
+    data = '["This is a "test" of the emergency broadcast system."]'
+    json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\""
+    assert_equal json, generate(data)
   end
 
   def test_string_subclass