diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/lib/luajit/doc/changes.html b/lib/luajit/doc/changes.html
index 125b58b4ca..826cd2436b 100644
--- a/lib/luajit/doc/changes.html
+++ b/lib/luajit/doc/changes.html
@@ -113,6 +113,7 @@ <h2 id="LuaJIT-2.1.0-beta1">LuaJIT 2.1.0-beta1 &mdash; 2015-08-25</h2>
 <li>x64: Add separate port of the interpreter to <tt>LJ_GC64</tt> mode.</li>
 <li>x86/x64: Drop internal x87 math functions. Use libm functions.</li>
 <li>x86: Remove x87 support from interpreter. SSE2 is mandatory now.</li>
+<li>x86/x64: Add support for AES-NI, AVX and AVX2 to DynASM.</li>
 <li>PPC/e500: Drop support for this architecture.</li>
 </ul></li>
 <li>FFI library:
@@ -123,6 +124,7 @@ <h2 id="LuaJIT-2.1.0-beta1">LuaJIT 2.1.0-beta1 &mdash; 2015-08-25</h2>
 <li>FFI: Compile lightuserdata to <tt>void *</tt> conversion.</li>
 <li>FFI: Compile <tt>ffi.gc(cdata, nil)</tt>, too.</li>
 <li>FFI: Add <tt>ffi.typeinfo()</tt>.</li>
+<li>FFI: Add <tt>ssize_t</tt> declaration.</li>
 </ul></li>
 </ul>
 </div>
diff --git a/lib/luajit/doc/ext_ffi_semantics.html b/lib/luajit/doc/ext_ffi_semantics.html
index 889d44d823..f65fe8f36d 100644
--- a/lib/luajit/doc/ext_ffi_semantics.html
+++ b/lib/luajit/doc/ext_ffi_semantics.html
@@ -185,6 +185,8 @@ <h2 id="clang">C Language Support</h2>
 <tt>uint16_t</tt>, <tt>uint32_t</tt>, <tt>uint64_t</tt>,
 <tt>intptr_t</tt>, <tt>uintptr_t</tt>.</li>
 
+<li>From <tt>&lt;unistd.h&gt;</tt> (POSIX): <tt>ssize_t</tt>.</li>
+
 </ul>
 <p>
 You're encouraged to use these types in preference to
diff --git a/lib/luajit/doc/install.html b/lib/luajit/doc/install.html
index b5df697b67..a4cc721512 100644
--- a/lib/luajit/doc/install.html
+++ b/lib/luajit/doc/install.html
@@ -114,30 +114,30 @@ <h1>Installation</h1>
 </tr>
 <tr class="odd separate">
 <td class="compatcpu">x86 (32 bit)</td>
-<td class="compatos">GCC 4.x<br>GCC 3.4</td>
-<td class="compatos">GCC 4.x<br>GCC 3.4</td>
-<td class="compatos">GCC 4.x<br>GCC 3.4</td>
+<td class="compatos">GCC 4.2+</td>
+<td class="compatos">GCC 4.2+</td>
+<td class="compatos">Xcode 5.0+<br>Clang</td>
 <td class="compatos">MSVC, MSVC/EE<br>WinSDK<br>MinGW, Cygwin</td>
 </tr>
 <tr class="even">
 <td class="compatcpu">x64 (64 bit)</td>
-<td class="compatos">GCC 4.x</td>
+<td class="compatos">GCC 4.2+</td>
 <td class="compatos">ORBIS (<a href="#ps4">PS4</a>)</td>
-<td class="compatos">GCC 4.x</td>
+<td class="compatos">Xcode 5.0+<br>Clang</td>
 <td class="compatos">MSVC + SDK v7.0<br>WinSDK v7.0<br>Durango (<a href="#xboxone">Xbox One</a>)</td>
 </tr>
 <tr class="odd">
 <td class="compatcpu"><a href="#cross2">ARMv5+<br>ARM9E+</a></td>
 <td class="compatos">GCC 4.2+</td>
 <td class="compatos">GCC 4.2+<br>PSP2 (<a href="#psvita">PS VITA</a>)</td>
-<td class="compatos">GCC 4.2+</td>
+<td class="compatos">Xcode 5.0+<br>Clang</td>
 <td class="compatos compatno">&nbsp;</td>
 </tr>
 <tr class="even">
 <td class="compatcpu"><a href="#cross2">ARM64</a></td>
 <td class="compatos">GCC 4.8+</td>
 <td class="compatos compatno">&nbsp;</td>
-<td class="compatos">Clang 3.5+</td>
+<td class="compatos">Xcode 6.0+<br>Clang 3.5+</td>
 <td class="compatos compatno">&nbsp;</td>
 </tr>
 <tr class="odd">
@@ -442,8 +442,7 @@ <h2 id="cross">Cross-compiling LuaJIT</h2>
 make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF"
 </pre>
 <p>
-You can cross-compile for <b id="ios">iOS 3.0+</b> (iPhone/iPad) using the <a href="http://developer.apple.com/devcenter/ios/index.action"><span class="ext">&raquo;</span>&nbsp;iOS SDK</a>.
-The environment variables need to match the iOS SDK version:
+You can cross-compile for <b id="ios">iOS 3.0+</b> (iPhone/iPad) using the <a href="http://developer.apple.com/devcenter/ios/index.action"><span class="ext">&raquo;</span>&nbsp;iOS SDK</a>:
 </p>
 <p style="font-size: 8pt;">
 Note: <b>the JIT compiler is disabled for iOS</b>, because regular iOS Apps
@@ -453,13 +452,18 @@ <h2 id="cross">Cross-compiling LuaJIT</h2>
 Or use Android. :-p
 </p>
 <pre class="code">
-IXCODE=`xcode-select -print-path`
-ISDK=$IXCODE/Platforms/iPhoneOS.platform/Developer
-ISDKVER=iPhoneOS6.0.sdk
-ISDKP=$ISDK/usr/bin/
-ISDKF="-arch armv7 -isysroot $ISDK/SDKs/$ISDKVER"
-make HOST_CC="gcc -m32 -arch i386" CROSS=$ISDKP TARGET_FLAGS="$ISDKF" \
-     TARGET_SYS=iOS
+# iOS/ARM (32 bit)
+ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
+ICC=$(xcrun --sdk iphoneos --find clang)
+ISDKF="-arch armv7 -isysroot $ISDKP"
+make HOST_CC="clang -m32 -arch i386" CROSS="$(dirname $ICC)/" \
+     TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
+
+# iOS/ARM64
+ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
+ICC=$(xcrun --sdk iphoneos --find clang)
+ISDKF="-arch arm64 -isysroot $ISDKP"
+make CROSS="$(dirname $ICC)/" TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
 </pre>
 
 <h3 id="consoles">Cross-compiling for consoles</h3>
diff --git a/lib/luajit/dynasm/dasm_arm.lua b/lib/luajit/dynasm/dasm_arm.lua
index 90a259c5c3..6a1d1d5195 100644
--- a/lib/luajit/dynasm/dasm_arm.lua
+++ b/lib/luajit/dynasm/dasm_arm.lua
@@ -9,9 +9,9 @@
 local _info = {
   arch =	"arm",
   description =	"DynASM ARM module",
-  version =	"1.3.0",
-  vernum =	 10300,
-  release =	"2011-05-05",
+  version =	"1.4.0",
+  vernum =	 10400,
+  release =	"2015-10-18",
   author =	"Mike Pall",
   license =	"MIT",
 }
diff --git a/lib/luajit/dynasm/dasm_arm64.lua b/lib/luajit/dynasm/dasm_arm64.lua
index 9766e475b0..c1e3a81b11 100644
--- a/lib/luajit/dynasm/dasm_arm64.lua
+++ b/lib/luajit/dynasm/dasm_arm64.lua
@@ -9,9 +9,9 @@
 local _info = {
   arch =	"arm",
   description =	"DynASM ARM64 module",
-  version =	"1.3.0",
-  vernum =	 10300,
-  release =	"2014-12-03",
+  version =	"1.4.0",
+  vernum =	 10400,
+  release =	"2015-10-18",
   author =	"Mike Pall",
   license =	"MIT",
 }
diff --git a/lib/luajit/dynasm/dasm_mips.lua b/lib/luajit/dynasm/dasm_mips.lua
index ae0dbd7a9b..ef383431cd 100644
--- a/lib/luajit/dynasm/dasm_mips.lua
+++ b/lib/luajit/dynasm/dasm_mips.lua
@@ -9,9 +9,9 @@
 local _info = {
   arch =	"mips",
   description =	"DynASM MIPS module",
-  version =	"1.3.0",
-  vernum =	 10300,
-  release =	"2012-01-23",
+  version =	"1.4.0",
+  vernum =	 10400,
+  release =	"2015-10-18",
   author =	"Mike Pall",
   license =	"MIT",
 }
diff --git a/lib/luajit/dynasm/dasm_ppc.lua b/lib/luajit/dynasm/dasm_ppc.lua
index 278f09526d..1e9bccaeb8 100644
--- a/lib/luajit/dynasm/dasm_ppc.lua
+++ b/lib/luajit/dynasm/dasm_ppc.lua
@@ -11,9 +11,9 @@
 local _info = {
   arch =	"ppc",
   description =	"DynASM PPC module",
-  version =	"1.3.0",
-  vernum =	 10300,
-  release =	"2015-01-14",
+  version =	"1.4.0",
+  vernum =	 10400,
+  release =	"2015-10-18",
   author =	"Mike Pall",
   license =	"MIT",
 }
diff --git a/lib/luajit/dynasm/dasm_proto.h b/lib/luajit/dynasm/dasm_proto.h
index a8bc6fd285..93ca06533c 100644
--- a/lib/luajit/dynasm/dasm_proto.h
+++ b/lib/luajit/dynasm/dasm_proto.h
@@ -10,8 +10,8 @@
 #include <stddef.h>
 #include <stdarg.h>
 
-#define DASM_IDENT	"DynASM 1.3.0"
-#define DASM_VERSION	10300	/* 1.3.0 */
+#define DASM_IDENT	"DynASM 1.4.0"
+#define DASM_VERSION	10400	/* 1.4.0 */
 
 #ifndef Dst_DECL
 #define Dst_DECL	dasm_State **Dst
diff --git a/lib/luajit/dynasm/dasm_x86.h b/lib/luajit/dynasm/dasm_x86.h
deleted file mode 100644
index 652e8c99b0..0000000000
--- a/lib/luajit/dynasm/dasm_x86.h
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
-** DynASM x86 encoding engine.
-** Copyright (C) 2005-2015 Mike Pall. All rights reserved.
-** Released under the MIT license. See dynasm.lua for full copyright notice.
-*/
-
-#include <stddef.h>
-#include <stdarg.h>
-#include <string.h>
-#include <stdlib.h>
-
-#define DASM_ARCH		"x86"
-
-#ifndef DASM_EXTERN
-#define DASM_EXTERN(a,b,c,d)	0
-#endif
-
-/* Action definitions. DASM_STOP must be 255. */
-enum {
-  DASM_DISP = 233,
-  DASM_IMM_S, DASM_IMM_B, DASM_IMM_W, DASM_IMM_D, DASM_IMM_WB, DASM_IMM_DB,
-  DASM_VREG, DASM_SPACE, DASM_SETLABEL, DASM_REL_A, DASM_REL_LG, DASM_REL_PC,
-  DASM_IMM_LG, DASM_IMM_PC, DASM_LABEL_LG, DASM_LABEL_PC, DASM_ALIGN,
-  DASM_EXTERN, DASM_ESC, DASM_MARK, DASM_SECTION, DASM_STOP
-};
-
-/* Maximum number of section buffer positions for a single dasm_put() call. */
-#define DASM_MAXSECPOS		25
-
-/* DynASM encoder status codes. Action list offset or number are or'ed in. */
-#define DASM_S_OK		0x00000000
-#define DASM_S_NOMEM		0x01000000
-#define DASM_S_PHASE		0x02000000
-#define DASM_S_MATCH_SEC	0x03000000
-#define DASM_S_RANGE_I		0x11000000
-#define DASM_S_RANGE_SEC	0x12000000
-#define DASM_S_RANGE_LG		0x13000000
-#define DASM_S_RANGE_PC		0x14000000
-#define DASM_S_RANGE_VREG	0x15000000
-#define DASM_S_UNDEF_L		0x21000000
-#define DASM_S_UNDEF_PC		0x22000000
-
-/* Macros to convert positions (8 bit section + 24 bit index). */
-#define DASM_POS2IDX(pos)	((pos)&0x00ffffff)
-#define DASM_POS2BIAS(pos)	((pos)&0xff000000)
-#define DASM_SEC2POS(sec)	((sec)<<24)
-#define DASM_POS2SEC(pos)	((pos)>>24)
-#define DASM_POS2PTR(D, pos)	(D->sections[DASM_POS2SEC(pos)].rbuf + (pos))
-
-/* Action list type. */
-typedef const unsigned char *dasm_ActList;
-
-/* Per-section structure. */
-typedef struct dasm_Section {
-  int *rbuf;		/* Biased buffer pointer (negative section bias). */
-  int *buf;		/* True buffer pointer. */
-  size_t bsize;		/* Buffer size in bytes. */
-  int pos;		/* Biased buffer position. */
-  int epos;		/* End of biased buffer position - max single put. */
-  int ofs;		/* Byte offset into section. */
-} dasm_Section;
-
-/* Core structure holding the DynASM encoding state. */
-struct dasm_State {
-  size_t psize;			/* Allocated size of this structure. */
-  dasm_ActList actionlist;	/* Current actionlist pointer. */
-  int *lglabels;		/* Local/global chain/pos ptrs. */
-  size_t lgsize;
-  int *pclabels;		/* PC label chains/pos ptrs. */
-  size_t pcsize;
-  void **globals;		/* Array of globals (bias -10). */
-  dasm_Section *section;	/* Pointer to active section. */
-  size_t codesize;		/* Total size of all code sections. */
-  int maxsection;		/* 0 <= sectionidx < maxsection. */
-  int status;			/* Status code. */
-  dasm_Section sections[1];	/* All sections. Alloc-extended. */
-};
-
-/* The size of the core structure depends on the max. number of sections. */
-#define DASM_PSZ(ms)	(sizeof(dasm_State)+(ms-1)*sizeof(dasm_Section))
-
-
-/* Initialize DynASM state. */
-void dasm_init(Dst_DECL, int maxsection)
-{
-  dasm_State *D;
-  size_t psz = 0;
-  int i;
-  Dst_REF = NULL;
-  DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
-  D = Dst_REF;
-  D->psize = psz;
-  D->lglabels = NULL;
-  D->lgsize = 0;
-  D->pclabels = NULL;
-  D->pcsize = 0;
-  D->globals = NULL;
-  D->maxsection = maxsection;
-  for (i = 0; i < maxsection; i++) {
-    D->sections[i].buf = NULL;  /* Need this for pass3. */
-    D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
-    D->sections[i].bsize = 0;
-    D->sections[i].epos = 0;  /* Wrong, but is recalculated after resize. */
-  }
-}
-
-/* Free DynASM state. */
-void dasm_free(Dst_DECL)
-{
-  dasm_State *D = Dst_REF;
-  int i;
-  for (i = 0; i < D->maxsection; i++)
-    if (D->sections[i].buf)
-      DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize);
-  if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize);
-  if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize);
-  DASM_M_FREE(Dst, D, D->psize);
-}
-
-/* Setup global label array. Must be called before dasm_setup(). */
-void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
-{
-  dasm_State *D = Dst_REF;
-  D->globals = gl - 10;  /* Negative bias to compensate for locals. */
-  DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
-}
-
-/* Grow PC label array. Can be called after dasm_setup(), too. */
-void dasm_growpc(Dst_DECL, unsigned int maxpc)
-{
-  dasm_State *D = Dst_REF;
-  size_t osz = D->pcsize;
-  DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc*sizeof(int));
-  memset((void *)(((unsigned char *)D->pclabels)+osz), 0, D->pcsize-osz);
-}
-
-/* Setup encoder. */
-void dasm_setup(Dst_DECL, const void *actionlist)
-{
-  dasm_State *D = Dst_REF;
-  int i;
-  D->actionlist = (dasm_ActList)actionlist;
-  D->status = DASM_S_OK;
-  D->section = &D->sections[0];
-  memset((void *)D->lglabels, 0, D->lgsize);
-  if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
-  for (i = 0; i < D->maxsection; i++) {
-    D->sections[i].pos = DASM_SEC2POS(i);
-    D->sections[i].ofs = 0;
-  }
-}
-
-
-#ifdef DASM_CHECKS
-#define CK(x, st) \
-  do { if (!(x)) { \
-    D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0)
-#define CKPL(kind, st) \
-  do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
-    D->status=DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0)
-#else
-#define CK(x, st)	((void)0)
-#define CKPL(kind, st)	((void)0)
-#endif
-
-/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */
-void dasm_put(Dst_DECL, int start, ...)
-{
-  va_list ap;
-  dasm_State *D = Dst_REF;
-  dasm_ActList p = D->actionlist + start;
-  dasm_Section *sec = D->section;
-  int pos = sec->pos, ofs = sec->ofs, mrm = 4;
-  int *b;
-
-  if (pos >= sec->epos) {
-    DASM_M_GROW(Dst, int, sec->buf, sec->bsize,
-      sec->bsize + 2*DASM_MAXSECPOS*sizeof(int));
-    sec->rbuf = sec->buf - DASM_POS2BIAS(pos);
-    sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos);
-  }
-
-  b = sec->rbuf;
-  b[pos++] = start;
-
-  va_start(ap, start);
-  while (1) {
-    int action = *p++;
-    if (action < DASM_DISP) {
-      ofs++;
-    } else if (action <= DASM_REL_A) {
-      int n = va_arg(ap, int);
-      b[pos++] = n;
-      switch (action) {
-      case DASM_DISP:
-	if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; }
-      case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob;
-      case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */
-      case DASM_IMM_D: ofs += 4; break;
-      case DASM_IMM_S: CK(((n+128)&-256) == 0, RANGE_I); goto ob;
-      case DASM_IMM_B: CK((n&-256) == 0, RANGE_I); ob: ofs++; break;
-      case DASM_IMM_WB: if (((n+128)&-256) == 0) goto ob;
-      case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break;
-      case DASM_SPACE: p++; ofs += n; break;
-      case DASM_SETLABEL: b[pos-2] = -0x40000000; break;  /* Neg. label ofs. */
-      case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG);
-	if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue;
-      }
-      mrm = 4;
-    } else {
-      int *pl, n;
-      switch (action) {
-      case DASM_REL_LG:
-      case DASM_IMM_LG:
-	n = *p++; pl = D->lglabels + n;
-	/* Bkwd rel or global. */
-	if (n <= 246) { CK(n>=10||*pl<0, RANGE_LG); CKPL(lg, LG); goto putrel; }
-	pl -= 246; n = *pl;
-	if (n < 0) n = 0;  /* Start new chain for fwd rel if label exists. */
-	goto linkrel;
-      case DASM_REL_PC:
-      case DASM_IMM_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
-      putrel:
-	n = *pl;
-	if (n < 0) {  /* Label exists. Get label pos and store it. */
-	  b[pos] = -n;
-	} else {
-      linkrel:
-	  b[pos] = n;  /* Else link to rel chain, anchored at label. */
-	  *pl = pos;
-	}
-	pos++;
-	ofs += 4;  /* Maximum offset needed. */
-	if (action == DASM_REL_LG || action == DASM_REL_PC)
-	  b[pos++] = ofs;  /* Store pass1 offset estimate. */
-	break;
-      case DASM_LABEL_LG: pl = D->lglabels + *p++; CKPL(lg, LG); goto putlabel;
-      case DASM_LABEL_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
-      putlabel:
-	n = *pl;  /* n > 0: Collapse rel chain and replace with label pos. */
-	while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos; }
-	*pl = -pos;  /* Label exists now. */
-	b[pos++] = ofs;  /* Store pass1 offset estimate. */
-	break;
-      case DASM_ALIGN:
-	ofs += *p++;  /* Maximum alignment needed (arg is 2**n-1). */
-	b[pos++] = ofs;  /* Store pass1 offset estimate. */
-	break;
-      case DASM_EXTERN: p += 2; ofs += 4; break;
-      case DASM_ESC: p++; ofs++; break;
-      case DASM_MARK: mrm = p[-2]; break;
-      case DASM_SECTION:
-	n = *p; CK(n < D->maxsection, RANGE_SEC); D->section = &D->sections[n];
-      case DASM_STOP: goto stop;
-      }
-    }
-  }
-stop:
-  va_end(ap);
-  sec->pos = pos;
-  sec->ofs = ofs;
-}
-#undef CK
-
-/* Pass 2: Link sections, shrink branches/aligns, fix label offsets. */
-int dasm_link(Dst_DECL, size_t *szp)
-{
-  dasm_State *D = Dst_REF;
-  int secnum;
-  int ofs = 0;
-
-#ifdef DASM_CHECKS
-  *szp = 0;
-  if (D->status != DASM_S_OK) return D->status;
-  {
-    int pc;
-    for (pc = 0; pc*sizeof(int) < D->pcsize; pc++)
-      if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc;
-  }
-#endif
-
-  { /* Handle globals not defined in this translation unit. */
-    int idx;
-    for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
-      int n = D->lglabels[idx];
-      /* Undefined label: Collapse rel chain and replace with marker (< 0). */
-      while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
-    }
-  }
-
-  /* Combine all code sections. No support for data sections (yet). */
-  for (secnum = 0; secnum < D->maxsection; secnum++) {
-    dasm_Section *sec = D->sections + secnum;
-    int *b = sec->rbuf;
-    int pos = DASM_SEC2POS(secnum);
-    int lastpos = sec->pos;
-
-    while (pos != lastpos) {
-      dasm_ActList p = D->actionlist + b[pos++];
-      while (1) {
-	int op, action = *p++;
-	switch (action) {
-	case DASM_REL_LG: p++; op = p[-3]; goto rel_pc;
-	case DASM_REL_PC: op = p[-2]; rel_pc: {
-	  int shrink = op == 0xe9 ? 3 : ((op&0xf0) == 0x80 ? 4 : 0);
-	  if (shrink) {  /* Shrinkable branch opcode? */
-	    int lofs, lpos = b[pos];
-	    if (lpos < 0) goto noshrink;  /* Ext global? */
-	    lofs = *DASM_POS2PTR(D, lpos);
-	    if (lpos > pos) {  /* Fwd label: add cumulative section offsets. */
-	      int i;
-	      for (i = secnum; i < DASM_POS2SEC(lpos); i++)
-		lofs += D->sections[i].ofs;
-	    } else {
-	      lofs -= ofs;  /* Bkwd label: unfix offset. */
-	    }
-	    lofs -= b[pos+1];  /* Short branch ok? */
-	    if (lofs >= -128-shrink && lofs <= 127) ofs -= shrink;  /* Yes. */
-	    else { noshrink: shrink = 0; }  /* No, cannot shrink op. */
-	  }
-	  b[pos+1] = shrink;
-	  pos += 2;
-	  break;
-	}
-	case DASM_SPACE: case DASM_IMM_LG: case DASM_VREG: p++;
-	case DASM_DISP: case DASM_IMM_S: case DASM_IMM_B: case DASM_IMM_W:
-	case DASM_IMM_D: case DASM_IMM_WB: case DASM_IMM_DB:
-	case DASM_SETLABEL: case DASM_REL_A: case DASM_IMM_PC: pos++; break;
-	case DASM_LABEL_LG: p++;
-	case DASM_LABEL_PC: b[pos++] += ofs; break; /* Fix label offset. */
-	case DASM_ALIGN: ofs -= (b[pos++]+ofs)&*p++; break; /* Adjust ofs. */
-	case DASM_EXTERN: p += 2; break;
-	case DASM_ESC: p++; break;
-	case DASM_MARK: break;
-	case DASM_SECTION: case DASM_STOP: goto stop;
-	}
-      }
-      stop: (void)0;
-    }
-    ofs += sec->ofs;  /* Next section starts right after current section. */
-  }
-
-  D->codesize = ofs;  /* Total size of all code sections */
-  *szp = ofs;
-  return DASM_S_OK;
-}
-
-#define dasmb(x)	*cp++ = (unsigned char)(x)
-#ifndef DASM_ALIGNED_WRITES
-#define dasmw(x) \
-  do { *((unsigned short *)cp) = (unsigned short)(x); cp+=2; } while (0)
-#define dasmd(x) \
-  do { *((unsigned int *)cp) = (unsigned int)(x); cp+=4; } while (0)
-#else
-#define dasmw(x)	do { dasmb(x); dasmb((x)>>8); } while (0)
-#define dasmd(x)	do { dasmw(x); dasmw((x)>>16); } while (0)
-#endif
-
-/* Pass 3: Encode sections. */
-int dasm_encode(Dst_DECL, void *buffer)
-{
-  dasm_State *D = Dst_REF;
-  unsigned char *base = (unsigned char *)buffer;
-  unsigned char *cp = base;
-  int secnum;
-
-  /* Encode all code sections. No support for data sections (yet). */
-  for (secnum = 0; secnum < D->maxsection; secnum++) {
-    dasm_Section *sec = D->sections + secnum;
-    int *b = sec->buf;
-    int *endb = sec->rbuf + sec->pos;
-
-    while (b != endb) {
-      dasm_ActList p = D->actionlist + *b++;
-      unsigned char *mark = NULL;
-      while (1) {
-	int action = *p++;
-	int n = (action >= DASM_DISP && action <= DASM_ALIGN) ? *b++ : 0;
-	switch (action) {
-	case DASM_DISP: if (!mark) mark = cp; {
-	  unsigned char *mm = mark;
-	  if (*p != DASM_IMM_DB && *p != DASM_IMM_WB) mark = NULL;
-	  if (n == 0) { int mrm = mm[-1]&7; if (mrm == 4) mrm = mm[0]&7;
-	    if (mrm != 5) { mm[-1] -= 0x80; break; } }
-	  if (((n+128) & -256) != 0) goto wd; else mm[-1] -= 0x40;
-	}
-	case DASM_IMM_S: case DASM_IMM_B: wb: dasmb(n); break;
-	case DASM_IMM_DB: if (((n+128)&-256) == 0) {
-	    db: if (!mark) mark = cp; mark[-2] += 2; mark = NULL; goto wb;
-	  } else mark = NULL;
-	case DASM_IMM_D: wd: dasmd(n); break;
-	case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL;
-	case DASM_IMM_W: dasmw(n); break;
-	case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; }
-	case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
-	  b++; n = (int)(ptrdiff_t)D->globals[-n];
-	case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */
-	case DASM_REL_PC: rel_pc: {
-	  int shrink = *b++;
-	  int *pb = DASM_POS2PTR(D, n); if (*pb < 0) { n = pb[1]; goto rel_a; }
-	  n = *pb - ((int)(cp-base) + 4-shrink);
-	  if (shrink == 0) goto wd;
-	  if (shrink == 4) { cp--; cp[-1] = *cp-0x10; } else cp[-1] = 0xeb;
-	  goto wb;
-	}
-	case DASM_IMM_LG:
-	  p++; if (n < 0) { n = (int)(ptrdiff_t)D->globals[-n]; goto wd; }
-	case DASM_IMM_PC: {
-	  int *pb = DASM_POS2PTR(D, n);
-	  n = *pb < 0 ? pb[1] : (*pb + (int)(ptrdiff_t)base);
-	  goto wd;
-	}
-	case DASM_LABEL_LG: {
-	  int idx = *p++;
-	  if (idx >= 10)
-	    D->globals[idx] = (void *)(base + (*p == DASM_SETLABEL ? *b : n));
-	  break;
-	}
-	case DASM_LABEL_PC: case DASM_SETLABEL: break;
-	case DASM_SPACE: { int fill = *p++; while (n--) *cp++ = fill; break; }
-	case DASM_ALIGN:
-	  n = *p++;
-	  while (((cp-base) & n)) *cp++ = 0x90; /* nop */
-	  break;
-	case DASM_EXTERN: n = DASM_EXTERN(Dst, cp, p[1], *p); p += 2; goto wd;
-	case DASM_MARK: mark = cp; break;
-	case DASM_ESC: action = *p++;
-	default: *cp++ = action; break;
-	case DASM_SECTION: case DASM_STOP: goto stop;
-	}
-      }
-      stop: (void)0;
-    }
-  }
-
-  if (base + D->codesize != cp)  /* Check for phase errors. */
-    return DASM_S_PHASE;
-  return DASM_S_OK;
-}
-
-/* Get PC label offset. */
-int dasm_getpclabel(Dst_DECL, unsigned int pc)
-{
-  dasm_State *D = Dst_REF;
-  if (pc*sizeof(int) < D->pcsize) {
-    int pos = D->pclabels[pc];
-    if (pos < 0) return *DASM_POS2PTR(D, -pos);
-    if (pos > 0) return -1;  /* Undefined. */
-  }
-  return -2;  /* Unused or out of range. */
-}
-
-#ifdef DASM_CHECKS
-/* Optional sanity checker to call between isolated encoding steps. */
-int dasm_checkstep(Dst_DECL, int secmatch)
-{
-  dasm_State *D = Dst_REF;
-  if (D->status == DASM_S_OK) {
-    int i;
-    for (i = 1; i <= 9; i++) {
-      if (D->lglabels[i] > 0) { D->status = DASM_S_UNDEF_L|i; break; }
-      D->lglabels[i] = 0;
-    }
-  }
-  if (D->status == DASM_S_OK && secmatch >= 0 &&
-      D->section != &D->sections[secmatch])
-    D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections);
-  return D->status;
-}
-#endif
-
diff --git a/lib/luajit/dynasm/dasm_x86.lua b/lib/luajit/dynasm/dasm_x86.lua
index 7ca061d22f..60f5211a33 100644
--- a/lib/luajit/dynasm/dasm_x86.lua
+++ b/lib/luajit/dynasm/dasm_x86.lua
@@ -11,9 +11,9 @@ local x64 = x64
 local _info = {
   arch =	x64 and "x64" or "x86",
   description =	"DynASM x86/x64 module",
-  version =	"1.3.0",
-  vernum =	 10300,
-  release =	"2011-05-05",
+  version =	"1.4.0",
+  vernum =	 10400,
+  release =	"2015-10-18",
   author =	"Mike Pall",
   license =	"MIT",
 }
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
 local _s = string
 local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
 local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
-local concat, sort = table.concat, table.sort
+local concat, sort, remove = table.concat, table.sort, table.remove
 local bit = bit or require("bit")
-local band, shl, shr = bit.band, bit.lshift, bit.rshift
+local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
 
 -- Inherited tables and callbacks.
 local g_opt, g_arch
@@ -41,7 +41,7 @@ local action_names = {
   -- int arg, 1 buffer pos:
   "DISP",  "IMM_S", "IMM_B", "IMM_W", "IMM_D",  "IMM_WB", "IMM_DB",
   -- action arg (1 byte), int arg, 1 buffer pos (reg/num):
-  "VREG", "SPACE", -- !x64: VREG support NYI.
+  "VREG", "SPACE",
   -- ptrdiff_t arg, 1 buffer pos (address): !x64
   "SETLABEL", "REL_A",
   -- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
@@ -83,6 +83,21 @@ local actargs = { 0 }
 -- Current number of section buffer positions for dasm_put().
 local secpos = 1
 
+-- VREG kind encodings, pre-shifted by 5 bits. wvreg() compares them with `sk'.
+local map_vreg = {
+  ["modrm.rm.m"] = 0x00,
+  ["modrm.rm.r"] = 0x20,
+  ["opcode"] =     0x20, -- Same code as modrm.rm.r.
+  ["sib.base"] =   0x20, -- Same code as modrm.rm.r.
+  ["sib.index"] =  0x40,
+  ["modrm.reg"] =  0x80,
+  ["vex.v"] =      0xa0,
+  ["imm.hi"] =     0xc0,
+}
+
+-- Current number of VREG actions contributing to REX/VEX shrinkage.
+local vreg_shrink_count = 0 -- Incremented and reset by wvreg().
+
 ------------------------------------------------------------------------------
 
 -- Compute action numbers for action names.
@@ -134,6 +149,21 @@ local function waction(action, a, num)
   if a or num then secpos = secpos + (num or 1) end
 end
 
+-- Optionally add a VREG action (no-op without vreg); kind selects the map_vreg code.
+local function wvreg(kind, vreg, psz, sk, defer)
+  if not vreg then return end
+  waction("VREG", vreg)
+  local b = assert(map_vreg[kind], "bad vreg kind `"..kind.."'") -- Name the bad kind.
+  if b < (sk or 0) then -- Kind code below the sk threshold: count it.
+    vreg_shrink_count = vreg_shrink_count + 1
+  end
+  if not defer then -- Not deferred: fold the pending count into this byte and reset.
+    b = b + vreg_shrink_count * 8
+    vreg_shrink_count = 0
+  end
+  wputxb(b + (psz or 0))
+end
+
 -- Add call to embedded DynASM C code.
 local function wcall(func, args)
   wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true)
@@ -299,7 +329,7 @@ local function mkrmap(sz, cl, names)
     local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
     if needrex then map_reg_needrex[iname] = true end
     local name
-    if sz == "o" then name = format("xmm%d", i)
+    if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
     elseif sz == "f" then name = format("st%d", i)
     else name = format("r%d%s", i, sz == addrsize and "" or sz) end
     map_archdef[name] = iname
@@ -326,6 +356,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
 mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
 map_reg_valid_index[map_archdef.esp] = false
 if x64 then map_reg_valid_index[map_archdef.rsp] = false end
+if x64 then map_reg_needrex[map_archdef.Rb] = true end
 map_archdef["Ra"] = "@"..addrsize
 
 -- FP registers (internally tword sized, but use "f" as operand size).
@@ -334,21 +365,24 @@ mkrmap("f", "Rf")
 -- SSE registers (oword sized, but qword and dword accessible).
 mkrmap("o", "xmm")
 
+-- AVX registers (yword sized, but oword, qword and dword accessible).
+mkrmap("y", "ymm")
+
 -- Operand size prefixes to codes.
 local map_opsize = {
-  byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t",
-  aword = addrsize,
+  byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
+  tword = "t", aword = addrsize,
 }
 
 -- Operand size code to number.
 local map_opsizenum = {
-  b = 1, w = 2, d = 4, q = 8, o = 16, t = 10,
+  b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
 }
 
 -- Operand size code to name.
 local map_opsizename = {
-  b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword",
-  f = "fpword",
+  b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
+  t = "tword", f = "fpword",
 }
 
 -- Valid index register scale factors.
@@ -460,9 +494,45 @@ local function wputszarg(sz, n)
 end
 
 -- Put multi-byte opcode with operand-size dependent modifications.
-local function wputop(sz, op, rex)
+local function wputop(sz, op, rex, vex, vregr, vregxb)
+  local psz, sk = 0, nil
+  if vex then
+    local tail
+    if vex.m == 1 and band(rex, 11) == 0 then
+      if x64 and vregxb then
+	sk = map_vreg["modrm.reg"]
+      else
+	wputb(0xc5)
+      tail = shl(bxor(band(rex, 4), 4), 5)
+      psz = 3
+      end
+    end
+    if not tail then
+      wputb(0xc4)
+      wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
+      tail = shl(band(rex, 8), 4)
+      psz = 4
+    end
+    local reg, vreg = 0, nil
+    if vex.v then
+      reg = vex.v.reg
+      if not reg then werror("bad vex operand") end
+      if reg < 0 then reg = 0; vreg = vex.v.vreg end
+    end
+    if sz == "y" or vex.l then tail = tail + 4 end
+    wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
+    wvreg("vex.v", vreg)
+    rex = 0
+    if op >= 256 then werror("bad vex opcode") end
+  else
+    if rex ~= 0 then
+      if not x64 then werror("bad operand size") end
+    elseif (vregr or vregxb) and x64 then
+      rex = 0x10
+      sk = map_vreg["vex.v"]
+    end
+  end
   local r
-  if rex ~= 0 and not x64 then werror("bad operand size") end
   if sz == "w" then wputb(102) end
   -- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
   if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
@@ -471,20 +541,20 @@ local function wputop(sz, op, rex)
     if rex ~= 0 then
       local opc3 = band(op, 0xffff00)
       if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
-	wputb(64 + band(rex, 15)); rex = 0
+	wputb(64 + band(rex, 15)); rex = 0; psz = 2
       end
     end
-    wputb(shr(op, 16)); op = band(op, 0xffff)
+    wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
   end
   if op >= 256 then
     local b = shr(op, 8)
-    if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end
-    wputb(b)
-    op = band(op, 255)
+    if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
+    wputb(b); op = band(op, 255); psz = psz + 1
   end
-  if rex ~= 0 then wputb(64 + band(rex, 15)) end
+  if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
   if sz == "b" then op = op - 1 end
   wputb(op)
+  return psz, sk
 end
 
 -- Put ModRM or SIB formatted byte.
@@ -494,7 +564,7 @@ local function wputmodrm(m, s, rm, vs, vrm)
 end
 
 -- Put ModRM/SIB plus optional displacement.
-local function wputmrmsib(t, imark, s, vsreg)
+local function wputmrmsib(t, imark, s, vsreg, psz, sk)
   local vreg, vxreg
   local reg, xreg = t.reg, t.xreg
   if reg and reg < 0 then reg = 0; vreg = t.vreg end
@@ -504,8 +574,8 @@ local function wputmrmsib(t, imark, s, vsreg)
   -- Register mode.
   if sub(t.mode, 1, 1) == "r" then
     wputmodrm(3, s, reg)
-    if vsreg then waction("VREG", vsreg); wputxb(2) end
-    if vreg then waction("VREG", vreg); wputxb(0) end
+    wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+    wvreg("modrm.rm.r", vreg, psz+1, sk)
     return
   end
 
@@ -519,21 +589,22 @@ local function wputmrmsib(t, imark, s, vsreg)
       -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
       wputmodrm(0, s, 4)
       if imark == "I" then waction("MARK") end
-      if vsreg then waction("VREG", vsreg); wputxb(2) end
+      wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
       wputmodrm(t.xsc, xreg, 5)
-      if vxreg then waction("VREG", vxreg); wputxb(3) end
+      wvreg("sib.index", vxreg, psz+2, sk)
     else
       -- Pure 32 bit displacement.
       if x64 and tdisp ~= "table" then
 	wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
+	wvreg("modrm.reg", vsreg, psz+1, sk)
 	if imark == "I" then waction("MARK") end
 	wputmodrm(0, 4, 5)
       else
 	riprel = x64
 	wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
+	wvreg("modrm.reg", vsreg, psz+1, sk)
 	if imark == "I" then waction("MARK") end
       end
-      if vsreg then waction("VREG", vsreg); wputxb(2) end
     end
     if riprel then -- Emit rip-relative displacement.
       if match("UWSiI", imark) then
@@ -561,16 +632,16 @@ local function wputmrmsib(t, imark, s, vsreg)
   if xreg or band(reg, 7) == 4 then
     wputmodrm(m or 2, s, 4) -- ModRM.
     if m == nil or imark == "I" then waction("MARK") end
-    if vsreg then waction("VREG", vsreg); wputxb(2) end
+    wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
     wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
-    if vxreg then waction("VREG", vxreg); wputxb(3) end
-    if vreg then waction("VREG", vreg); wputxb(1) end
+    wvreg("sib.index", vxreg, psz+2, sk, vreg)
+    wvreg("sib.base", vreg, psz+2, sk)
   else
     wputmodrm(m or 2, s, reg) -- ModRM.
     if (imark == "I" and (m == 1 or m == 2)) or
        (m == nil and (vsreg or vreg)) then waction("MARK") end
-    if vsreg then waction("VREG", vsreg); wputxb(2) end
-    if vreg then waction("VREG", vreg); wputxb(1) end
+    wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+    wvreg("modrm.rm.m", vreg, psz+1, sk)
   end
 
   -- Put displacement.
@@ -881,9 +952,15 @@ end
 --   "m"/"M"   generates ModRM/SIB from the 1st/2nd operand.
 --             The spare 3 bits are either filled with the last hex digit or
 --             the result from a previous "r"/"R". The opcode is restored.
+--   "u"       Use VEX encoding, vvvv unused.
+--   "v"/"V"   Use VEX encoding, vvvv from 1st/2nd operand (the operand is
+--             removed from the list used by future characters).
+--   "L"       Force VEX.L.
 --
 -- All of the following characters force a flush of the opcode:
 --   "o"/"O"   stores a pure 32 bit disp (offset) from the 1st/2nd operand.
+--   "s"       stores a 4 bit immediate from the last register operand,
+--             followed by 4 zero bits.
 --   "S"       stores a signed 8 bit immediate from the last operand.
 --   "U"       stores an unsigned 8 bit immediate from the last operand.
 --   "W"       stores an unsigned 16 bit immediate from the last operand.
@@ -1081,10 +1158,11 @@ local map_op = {
   btr_2 =	"mrqdw:0FB3Rm|miqdw:0FBA6mU",
   bts_2 =	"mrqdw:0FABRm|miqdw:0FBA5mU",
 
-  shld_3 =	"mriqdw:0FA4RmU|mrCqdw:0FA5Rm",
-  shrd_3 =	"mriqdw:0FACRmU|mrCqdw:0FADRm",
+  shld_3 =	"mriqdw:0FA4RmU|mrC/qq:0FA5Rm|mrC/dd:|mrC/ww:",
+  shrd_3 =	"mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:",
 
   rdtsc_0 =	"0F31", -- P1+
+  rdpmc_0 =	"0F33", -- P6+
   cpuid_0 =	"0FA2", -- P1+
 
   -- floating point ops
@@ -1190,7 +1268,7 @@ local map_op = {
   cvtsi2sd_2 =	"rm/od:F20F2ArM|rm/oq:F20F2ArXM",
   cvtsi2ss_2 =	"rm/od:F30F2ArM|rm/oq:F30F2ArXM",
   cvtss2sd_2 =	"rro:F30F5ArM|rx/od:",
-  cvtss2si_2 =	"rr/do:F20F2CrM|rr/qo:|rxd:|rx/qd:",
+  cvtss2si_2 =	"rr/do:F30F2DrM|rr/qo:|rxd:|rx/qd:",
   cvttpd2dq_2 =	"rmo:660FE6rM",
   cvttps2dq_2 =	"rmo:F30F5BrM",
   cvttsd2si_2 =	"rr/do:F20F2CrM|rr/qo:|rx/dq:|rxq:",
@@ -1225,46 +1303,14 @@ local map_op = {
   movups_2 =	"rmo:0F10rM|mro:0F11Rm",
   orpd_2 =	"rmo:660F56rM",
   orps_2 =	"rmo:0F56rM",
-  packssdw_2 =	"rmo:660F6BrM",
-  packsswb_2 =	"rmo:660F63rM",
-  packuswb_2 =	"rmo:660F67rM",
-  paddb_2 =	"rmo:660FFCrM",
-  paddd_2 =	"rmo:660FFErM",
-  paddq_2 =	"rmo:660FD4rM",
-  paddsb_2 =	"rmo:660FECrM",
-  paddsw_2 =	"rmo:660FEDrM",
-  paddusb_2 =	"rmo:660FDCrM",
-  paddusw_2 =	"rmo:660FDDrM",
-  paddw_2 =	"rmo:660FFDrM",
-  pand_2 =	"rmo:660FDBrM",
-  pandn_2 =	"rmo:660FDFrM",
   pause_0 =	"F390",
-  pavgb_2 =	"rmo:660FE0rM",
-  pavgw_2 =	"rmo:660FE3rM",
-  pcmpeqb_2 =	"rmo:660F74rM",
-  pcmpeqd_2 =	"rmo:660F76rM",
-  pcmpeqw_2 =	"rmo:660F75rM",
-  pcmpgtb_2 =	"rmo:660F64rM",
-  pcmpgtd_2 =	"rmo:660F66rM",
-  pcmpgtw_2 =	"rmo:660F65rM",
-  pextrw_3 =	"rri/do:660FC5rMU|xri/wo:660F3A15nrMU", -- Mem op: SSE4.1 only.
+  pextrw_3 =	"rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
   pinsrw_3 =	"rri/od:660FC4rMU|rxi/ow:",
-  pmaddwd_2 =	"rmo:660FF5rM",
-  pmaxsw_2 =	"rmo:660FEErM",
-  pmaxub_2 =	"rmo:660FDErM",
-  pminsw_2 =	"rmo:660FEArM",
-  pminub_2 =	"rmo:660FDArM",
   pmovmskb_2 =	"rr/do:660FD7rM",
-  pmulhuw_2 =	"rmo:660FE4rM",
-  pmulhw_2 =	"rmo:660FE5rM",
-  pmullw_2 =	"rmo:660FD5rM",
-  pmuludq_2 =	"rmo:660FF4rM",
-  por_2 =	"rmo:660FEBrM",
   prefetchnta_1 = "xb:n0F180m",
   prefetcht0_1 = "xb:n0F181m",
   prefetcht1_1 = "xb:n0F182m",
   prefetcht2_1 = "xb:n0F183m",
-  psadbw_2 =	"rmo:660FF6rM",
   pshufd_3 =	"rmio:660F70rMU",
   pshufhw_3 =	"rmio:F30F70rMU",
   pshuflw_3 =	"rmio:F20F70rMU",
@@ -1278,23 +1324,6 @@ local map_op = {
   psrldq_2 =	"rio:660F733mU",
   psrlq_2 =	"rmo:660FD3rM|rio:660F732mU",
   psrlw_2 =	"rmo:660FD1rM|rio:660F712mU",
-  psubb_2 =	"rmo:660FF8rM",
-  psubd_2 =	"rmo:660FFArM",
-  psubq_2 =	"rmo:660FFBrM",
-  psubsb_2 =	"rmo:660FE8rM",
-  psubsw_2 =	"rmo:660FE9rM",
-  psubusb_2 =	"rmo:660FD8rM",
-  psubusw_2 =	"rmo:660FD9rM",
-  psubw_2 =	"rmo:660FF9rM",
-  punpckhbw_2 =	"rmo:660F68rM",
-  punpckhdq_2 =	"rmo:660F6ArM",
-  punpckhqdq_2 = "rmo:660F6DrM",
-  punpckhwd_2 =	"rmo:660F69rM",
-  punpcklbw_2 =	"rmo:660F60rM",
-  punpckldq_2 =	"rmo:660F62rM",
-  punpcklqdq_2 = "rmo:660F6CrM",
-  punpcklwd_2 =	"rmo:660F61rM",
-  pxor_2 =	"rmo:660FEFrM",
   rcpps_2 =	"rmo:0F53rM",
   rcpss_2 =	"rro:F30F53rM|rx/od:",
   rsqrtps_2 =	"rmo:0F52rM",
@@ -1352,7 +1381,7 @@ local map_op = {
   dpps_3 =	"rmio:660F3A40rMU",
   extractps_3 =	"mri/do:660F3A17RmU|rri/qo:660F3A17RXmU",
   insertps_3 =	"rrio:660F3A41rMU|rxi/od:",
-  movntdqa_2 =	"rmo:660F382ArM",
+  movntdqa_2 =	"rxo:660F382ArM",
   mpsadbw_3 =	"rmio:660F3A42rMU",
   packusdw_2 =	"rmo:660F382BrM",
   pblendvb_3 =	"rmRo:660F3810rM",
@@ -1412,6 +1441,238 @@ local map_op = {
   movntsd_2 =	"xr/qo:nF20F2BRm",
   movntss_2 =	"xr/do:F30F2BRm",
   -- popcnt is also in SSE4.2
+
+  -- AES-NI
+  aesdec_2 =	"rmo:660F38DErM",
+  aesdeclast_2 = "rmo:660F38DFrM",
+  aesenc_2 =	"rmo:660F38DCrM",
+  aesenclast_2 = "rmo:660F38DDrM",
+  aesimc_2 =	"rmo:660F38DBrM",
+  aeskeygenassist_3 = "rmio:660F3ADFrMU",
+  pclmulqdq_3 =	"rmio:660F3A44rMU",
+
+  -- AVX FP ops
+  vaddsubpd_3 =	"rrmoy:660FVD0rM",
+  vaddsubps_3 =	"rrmoy:F20FVD0rM",
+  vandpd_3 =	"rrmoy:660FV54rM",
+  vandps_3 =	"rrmoy:0FV54rM",
+  vandnpd_3 =	"rrmoy:660FV55rM",
+  vandnps_3 =	"rrmoy:0FV55rM",
+  vblendpd_4 =	"rrmioy:660F3AV0DrMU",
+  vblendps_4 =	"rrmioy:660F3AV0CrMU",
+  vblendvpd_4 =	"rrmroy:660F3AV4BrMs",
+  vblendvps_4 =	"rrmroy:660F3AV4ArMs",
+  vbroadcastf128_2 = "rx/yo:660F38u1ArM",
+  vcmppd_4 =	"rrmioy:660FVC2rMU",
+  vcmpps_4 =	"rrmioy:0FVC2rMU",
+  vcmpsd_4 =	"rrrio:F20FVC2rMU|rrxi/ooq:",
+  vcmpss_4 =	"rrrio:F30FVC2rMU|rrxi/ood:",
+  vcomisd_2 =	"rro:660Fu2FrM|rx/oq:",
+  vcomiss_2 =	"rro:0Fu2FrM|rx/od:",
+  vcvtdq2pd_2 =	"rro:F30FuE6rM|rx/oq:|rm/yo:",
+  vcvtdq2ps_2 =	"rmoy:0Fu5BrM",
+  vcvtpd2dq_2 =	"rmoy:F20FuE6rM",
+  vcvtpd2ps_2 =	"rmoy:660Fu5ArM",
+  vcvtps2dq_2 =	"rmoy:660Fu5BrM",
+  vcvtps2pd_2 =	"rro:0Fu5ArM|rx/oq:|rm/yo:",
+  vcvtsd2si_2 =	"rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
+  vcvtsd2ss_3 =	"rrro:F20FV5ArM|rrx/ooq:",
+  vcvtsi2sd_3 =	"rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
+  vcvtsi2ss_3 =	"rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
+  vcvtss2sd_3 =	"rrro:F30FV5ArM|rrx/ood:",
+  vcvtss2si_2 =	"rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
+  vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
+  vcvttps2dq_2 = "rmoy:F30Fu5BrM",
+  vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
+  vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
+  vdppd_4 =	"rrmio:660F3AV41rMU",
+  vdpps_4 =	"rrmioy:660F3AV40rMU",
+  vextractf128_3 = "mri/oy:660F3AuL19RmU",
+  vextractps_3 = "mri/do:660F3Au17RmU",
+  vhaddpd_3 =	"rrmoy:660FV7CrM",
+  vhaddps_3 =	"rrmoy:F20FV7CrM",
+  vhsubpd_3 =	"rrmoy:660FV7DrM",
+  vhsubps_3 =	"rrmoy:F20FV7DrM",
+  vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
+  vinsertps_4 =	"rrrio:660F3AV21rMU|rrxi/ood:",
+  vldmxcsr_1 =	"xd:0FuAE2m",
+  vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
+  vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
+  vmovapd_2 =	"rmoy:660Fu28rM|mroy:660Fu29Rm",
+  vmovaps_2 =	"rmoy:0Fu28rM|mroy:0Fu29Rm",
+  vmovd_2 =	"rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
+  vmovq_2 =	"rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
+  vmovddup_2 =	"rmy:F20Fu12rM|rro:|rx/oq:",
+  vmovhlps_3 =	"rrro:0FV12rM",
+  vmovhpd_2 =	"xr/qo:660Fu17Rm",
+  vmovhpd_3 =	"rrx/ooq:660FV16rM",
+  vmovhps_2 =	"xr/qo:0Fu17Rm",
+  vmovhps_3 =	"rrx/ooq:0FV16rM",
+  vmovlhps_3 =	"rrro:0FV16rM",
+  vmovlpd_2 =	"xr/qo:660Fu13Rm",
+  vmovlpd_3 =	"rrx/ooq:660FV12rM",
+  vmovlps_2 =	"xr/qo:0Fu13Rm",
+  vmovlps_3 =	"rrx/ooq:0FV12rM",
+  vmovmskpd_2 =	"rr/do:660Fu50rM|rr/dy:660FuL50rM",
+  vmovmskps_2 =	"rr/do:0Fu50rM|rr/dy:0FuL50rM",
+  vmovntpd_2 =	"xroy:660Fu2BRm",
+  vmovntps_2 =	"xroy:0Fu2BRm",
+  vmovsd_2 =	"rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
+  vmovsd_3 =	"rrro:F20FV10rM",
+  vmovshdup_2 =	"rmoy:F30Fu16rM",
+  vmovsldup_2 =	"rmoy:F30Fu12rM",
+  vmovss_2 =	"rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
+  vmovss_3 =	"rrro:F30FV10rM",
+  vmovupd_2 =	"rmoy:660Fu10rM|mroy:660Fu11Rm",
+  vmovups_2 =	"rmoy:0Fu10rM|mroy:0Fu11Rm",
+  vorpd_3 =	"rrmoy:660FV56rM",
+  vorps_3 =	"rrmoy:0FV56rM",
+  vpermilpd_3 =	"rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
+  vpermilps_3 =	"rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
+  vperm2f128_4 = "rrmiy:660F3AV06rMU",
+  vptestpd_2 =	"rmoy:660F38u0FrM",
+  vptestps_2 =	"rmoy:660F38u0ErM",
+  vrcpps_2 =	"rmoy:0Fu53rM",
+  vrcpss_3 =	"rrro:F30FV53rM|rrx/ood:",
+  vrsqrtps_2 =	"rmoy:0Fu52rM",
+  vrsqrtss_3 =	"rrro:F30FV52rM|rrx/ood:",
+  vroundpd_3 =	"rmioy:660F3Au09rMU", -- Only 2 reg/mem operands: vvvv unused.
+  vroundps_3 =	"rmioy:660F3Au08rMU", -- Only 2 reg/mem operands: vvvv unused.
+  vroundsd_4 =	"rrrio:660F3AV0BrMU|rrxi/ooq:",
+  vroundss_4 =	"rrrio:660F3AV0ArMU|rrxi/ood:",
+  vshufpd_4 =	"rrmioy:660FVC6rMU",
+  vshufps_4 =	"rrmioy:0FVC6rMU",
+  vsqrtps_2 =	"rmoy:0Fu51rM",
+  vsqrtss_2 =	"rro:F30Fu51rM|rx/od:",
+  vsqrtpd_2 =	"rmoy:660Fu51rM",
+  vsqrtsd_2 =	"rro:F20Fu51rM|rx/oq:",
+  vstmxcsr_1 =	"xd:0FuAE3m",
+  vucomisd_2 =	"rro:660Fu2ErM|rx/oq:",
+  vucomiss_2 =	"rro:0Fu2ErM|rx/od:",
+  vunpckhpd_3 =	"rrmoy:660FV15rM",
+  vunpckhps_3 =	"rrmoy:0FV15rM",
+  vunpcklpd_3 =	"rrmoy:660FV14rM",
+  vunpcklps_3 =	"rrmoy:0FV14rM",
+  vxorpd_3 =	"rrmoy:660FV57rM",
+  vxorps_3 =	"rrmoy:0FV57rM",
+  vzeroall_0 =	"0FuL77",
+  vzeroupper_0 = "0Fu77",
+
+  -- AVX2 FP ops
+  vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
+  vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
+  -- *vgather* (!vsib)
+  vpermpd_3 =	"rmiy:660F3AuX01rMU",
+  vpermps_3 =	"rrmy:660F38V16rM",
+
+  -- AVX, AVX2 integer ops
+  -- In general, xmm requires AVX, ymm requires AVX2.
+  vaesdec_3 =  "rrmo:660F38VDErM",
+  vaesdeclast_3 = "rrmo:660F38VDFrM",
+  vaesenc_3 =  "rrmo:660F38VDCrM",
+  vaesenclast_3 = "rrmo:660F38VDDrM",
+  vaesimc_2 =  "rmo:660F38uDBrM",
+  vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
+  vlddqu_2 =	"rxoy:F20FuF0rM",
+  vmaskmovdqu_2 = "rro:660FuF7rM",
+  vmovdqa_2 =	"rmoy:660Fu6FrM|mroy:660Fu7FRm",
+  vmovdqu_2 =	"rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
+  vmovntdq_2 =	"xroy:660FuE7Rm",
+  vmovntdqa_2 =	"rxoy:660F38u2ArM",
+  vmpsadbw_4 =	"rrmioy:660F3AV42rMU",
+  vpabsb_2 =	"rmoy:660F38u1CrM",
+  vpabsd_2 =	"rmoy:660F38u1ErM",
+  vpabsw_2 =	"rmoy:660F38u1DrM",
+  vpackusdw_3 =	"rrmoy:660F38V2BrM",
+  vpalignr_4 =	"rrmioy:660F3AV0FrMU",
+  vpblendvb_4 =	"rrmroy:660F3AV4CrMs",
+  vpblendw_4 =	"rrmioy:660F3AV0ErMU",
+  vpclmulqdq_4 = "rrmio:660F3AV44rMU",
+  vpcmpeqq_3 =	"rrmoy:660F38V29rM",
+  vpcmpestri_3 = "rmio:660F3Au61rMU",
+  vpcmpestrm_3 = "rmio:660F3Au60rMU",
+  vpcmpgtq_3 =	"rrmoy:660F38V37rM",
+  vpcmpistri_3 = "rmio:660F3Au63rMU",
+  vpcmpistrm_3 = "rmio:660F3Au62rMU",
+  vpextrb_3 =	"rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
+  vpextrw_3 =	"rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
+  vpextrd_3 =	"mri/do:660F3Au16RmU",
+  vpextrq_3 =	"mri/qo:660F3Au16RmU",
+  vphaddw_3 =	"rrmoy:660F38V01rM",
+  vphaddd_3 =	"rrmoy:660F38V02rM",
+  vphaddsw_3 =	"rrmoy:660F38V03rM",
+  vphminposuw_2 = "rmo:660F38u41rM",
+  vphsubw_3 =	"rrmoy:660F38V05rM",
+  vphsubd_3 =	"rrmoy:660F38V06rM",
+  vphsubsw_3 =	"rrmoy:660F38V07rM",
+  vpinsrb_4 =	"rrri/ood:660F3AV20rMU|rrxi/oob:",
+  vpinsrw_4 =	"rrri/ood:660FVC4rMU|rrxi/oow:",
+  vpinsrd_4 =	"rrmi/ood:660F3AV22rMU",
+  vpinsrq_4 =	"rrmi/ooq:660F3AVX22rMU",
+  vpmaddubsw_3 = "rrmoy:660F38V04rM",
+  vpmaxsb_3 =	"rrmoy:660F38V3CrM",
+  vpmaxsd_3 =	"rrmoy:660F38V3DrM",
+  vpmaxuw_3 =	"rrmoy:660F38V3ErM",
+  vpmaxud_3 =	"rrmoy:660F38V3FrM",
+  vpminsb_3 =	"rrmoy:660F38V38rM",
+  vpminsd_3 =	"rrmoy:660F38V39rM",
+  vpminuw_3 =	"rrmoy:660F38V3ArM",
+  vpminud_3 =	"rrmoy:660F38V3BrM",
+  vpmovmskb_2 =	"rr/do:660FuD7rM|rr/dy:660FuLD7rM",
+  vpmovsxbw_2 =	"rroy:660F38u20rM|rx/oq:|rx/yo:",
+  vpmovsxbd_2 =	"rroy:660F38u21rM|rx/od:|rx/yq:",
+  vpmovsxbq_2 =	"rroy:660F38u22rM|rx/ow:|rx/yd:",
+  vpmovsxwd_2 =	"rroy:660F38u23rM|rx/oq:|rx/yo:",
+  vpmovsxwq_2 =	"rroy:660F38u24rM|rx/od:|rx/yq:",
+  vpmovsxdq_2 =	"rroy:660F38u25rM|rx/oq:|rx/yo:",
+  vpmovzxbw_2 =	"rroy:660F38u30rM|rx/oq:|rx/yo:",
+  vpmovzxbd_2 =	"rroy:660F38u31rM|rx/od:|rx/yq:",
+  vpmovzxbq_2 =	"rroy:660F38u32rM|rx/ow:|rx/yd:",
+  vpmovzxwd_2 =	"rroy:660F38u33rM|rx/oq:|rx/yo:",
+  vpmovzxwq_2 =	"rroy:660F38u34rM|rx/od:|rx/yq:",
+  vpmovzxdq_2 =	"rroy:660F38u35rM|rx/oq:|rx/yo:",
+  vpmuldq_3 =	"rrmoy:660F38V28rM",
+  vpmulhrsw_3 =	"rrmoy:660F38V0BrM",
+  vpmulld_3 =	"rrmoy:660F38V40rM",
+  vpshufb_3 =	"rrmoy:660F38V00rM",
+  vpshufd_3 =	"rmioy:660Fu70rMU",
+  vpshufhw_3 =	"rmioy:F30Fu70rMU",
+  vpshuflw_3 =	"rmioy:F20Fu70rMU",
+  vpsignb_3 =	"rrmoy:660F38V08rM",
+  vpsignw_3 =	"rrmoy:660F38V09rM",
+  vpsignd_3 =	"rrmoy:660F38V0ArM",
+  vpslldq_3 =	"rrioy:660Fv737mU",
+  vpsllw_3 =	"rrmoy:660FVF1rM|rrioy:660Fv716mU",
+  vpslld_3 =	"rrmoy:660FVF2rM|rrioy:660Fv726mU",
+  vpsllq_3 =	"rrmoy:660FVF3rM|rrioy:660Fv736mU",
+  vpsraw_3 =	"rrmoy:660FVE1rM|rrioy:660Fv714mU",
+  vpsrad_3 =	"rrmoy:660FVE2rM|rrioy:660Fv724mU",
+  vpsrldq_3 =	"rrioy:660Fv733mU",
+  vpsrlw_3 =	"rrmoy:660FVD1rM|rrioy:660Fv712mU",
+  vpsrld_3 =	"rrmoy:660FVD2rM|rrioy:660Fv722mU",
+  vpsrlq_3 =	"rrmoy:660FVD3rM|rrioy:660Fv732mU",
+  vptest_2 =	"rmoy:660F38u17rM",
+
+  -- AVX2 integer ops
+  vbroadcasti128_2 = "rx/yo:660F38u5ArM",
+  vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
+  vextracti128_3 = "mri/oy:660F3AuL39RmU",
+  vpblendd_4 =	"rrmioy:660F3AV02rMU",
+  vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
+  vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
+  vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
+  vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
+  vpermd_3 =	"rrmy:660F38V36rM",
+  vpermq_3 =	"rmiy:660F3AuX00rMU",
+  -- *vpgather* (!vsib)
+  vperm2i128_4 = "rrmiy:660F3AV46rMU",
+  vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
+  vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
+  vpsllvd_3 =	"rrmoy:660F38V47rM",
+  vpsllvq_3 =	"rrmoy:660F38VX47rM",
+  vpsravd_3 =	"rrmoy:660F38V46rM",
+  vpsrlvd_3 =	"rrmoy:660F38V45rM",
+  vpsrlvq_3 =	"rrmoy:660F38VX45rM",
 }
 
 ------------------------------------------------------------------------------
@@ -1462,28 +1723,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
   map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
 end
 
--- SSE FP arithmetic ops.
+-- SSE / AVX FP arithmetic ops.
 for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
 		     sub = 12, min = 13, div = 14, max = 15 } do
   map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
   map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
   map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
   map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
+  if n ~= 1 then
+    map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
+    map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
+    map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
+    map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
+  end
+end
+
+-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
+for name,n in pairs{
+  paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
+  paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
+  packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
+  paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
+  pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
+  pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
+  pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
+  pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
+  pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
+  pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
+  psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
+  psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
+  punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
+  punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
+  punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
+} do
+  map_op[name.."_2"] = format("rmo:660F%02XrM", n)
+  map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
 end
 
 ------------------------------------------------------------------------------
 
+local map_vexarg = { u = false, v = 1, V = 2 }
+
 -- Process pattern string.
 local function dopattern(pat, args, sz, op, needrex)
-  local digit, addin
+  local digit, addin, vex
   local opcode = 0
   local szov = sz
   local narg = 1
   local rex = 0
 
   -- Limit number of section buffer positions used by a single dasm_put().
-  -- A single opcode needs a maximum of 5 positions.
-  if secpos+5 > maxsecpos then wflush() end
+  -- A single opcode needs a maximum of 6 positions.
+  if secpos+6 > maxsecpos then wflush() end
 
   -- Process each character.
   for c in gmatch(pat.."|", ".") do
@@ -1497,6 +1788,8 @@ local function dopattern(pat, args, sz, op, needrex)
       szov = nil
     elseif c == "X" then	-- Force REX.W.
       rex = 8
+    elseif c == "L" then	-- Force VEX.L.
+      vex.l = true
     elseif c == "r" then	-- Merge 1st operand regno. into opcode.
       addin = args[1]; opcode = opcode + (addin.reg % 8)
       if narg < 2 then narg = 2 end
@@ -1520,21 +1813,42 @@ local function dopattern(pat, args, sz, op, needrex)
       if t.xreg and t.xreg > 7 then rex = rex + 2 end
       if s > 7 then rex = rex + 4 end
       if needrex then rex = rex + 16 end
-      wputop(szov, opcode, rex); opcode = nil
+      local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
+      opcode = nil
       local imark = sub(pat, -1) -- Force a mark (ugly).
       -- Put ModRM/SIB with regno/last digit as spare.
-      wputmrmsib(t, imark, s, addin and addin.vreg)
+      wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
       addin = nil
+    elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
+      local b = band(opcode, 255); opcode = shr(opcode, 8)
+      local m = 1
+      if b == 0x38 then m = 2
+      elseif b == 0x3a then m = 3 end
+      if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
+      if b ~= 0x0f then
+	werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
+	  "' in pattern `"..pat.."' for `"..op.."'")
+      end
+      local v = map_vexarg[c]
+      if v then v = remove(args, v) end
+      b = band(opcode, 255)
+      local p = 0
+      if b == 0x66 then p = 1
+      elseif b == 0xf3 then p = 2
+      elseif b == 0xf2 then p = 3 end
+      if p ~= 0 then opcode = shr(opcode, 8) end
+      if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
+      vex = { m = m, p = p, v = v }
     else
       if opcode then -- Flush opcode.
 	if szov == "q" and rex == 0 then rex = rex + 8 end
 	if needrex then rex = rex + 16 end
 	if addin and addin.reg == -1 then
-	  wputop(szov, opcode - 7, rex)
-	  waction("VREG", addin.vreg); wputxb(0)
+	  local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
+	  wvreg("opcode", addin.vreg, psz, sk)
 	else
 	  if addin and addin.reg > 7 then rex = rex + 1 end
-	  wputop(szov, opcode, rex)
+	  wputop(szov, opcode, rex, vex)
 	end
 	opcode = nil
       end
@@ -1571,6 +1885,14 @@ local function dopattern(pat, args, sz, op, needrex)
 	  else
 	    wputlabel("REL_", imm, 2)
 	  end
+	elseif c == "s" then
+	  local reg = a.reg
+	  if reg < 0 then
+	    wputb(0)
+	    wvreg("imm.hi", a.vreg)
+	  else
+	    wputb(shl(reg, 4))
+	  end
 	else
 	  werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
 	end
@@ -1647,11 +1969,14 @@ map_op[".template__"] = function(params, template, nparams)
     if pat == "" then pat = lastpat else lastpat = pat end
     if matchtm(tm, args) then
       local prefix = sub(szm, 1, 1)
-      if prefix == "/" then -- Match both operand sizes.
-	if args[1].opsize == sub(szm, 2, 2) and
-	   args[2].opsize == sub(szm, 3, 3) then
-	  dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
-	  return
+      if prefix == "/" then -- Exactly match leading operand sizes.
+	for i = #szm,1,-1 do
+	  if i == 1 then
+	    dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
+	    return
+	  elseif args[i-1].opsize ~= sub(szm, i, i) then
+	    break
+	  end
 	end
       else -- Match common operand size.
 	local szp = sz
@@ -1716,8 +2041,8 @@ if x64 then
 	rex = a.reg > 7 and 9 or 8
       end
     end
-    wputop(sz, opcode, rex)
-    if vreg then waction("VREG", vreg); wputxb(0) end
+    local psz, sk = wputop(sz, opcode, rex, nil, vreg)
+    wvreg("opcode", vreg, psz, sk)
     waction("IMM_D", format("(unsigned int)(%s)", op64))
     waction("IMM_D", format("(unsigned int)((%s)>>32)", op64))
   end
diff --git a/lib/luajit/dynasm/dynasm.lua b/lib/luajit/dynasm/dynasm.lua
index fffda7513c..145fb0cc6d 100644
--- a/lib/luajit/dynasm/dynasm.lua
+++ b/lib/luajit/dynasm/dynasm.lua
@@ -10,9 +10,9 @@
 local _info = {
   name =	"DynASM",
   description =	"A dynamic assembler for code generation engines",
-  version =	"1.3.0",
-  vernum =	 10300,
-  release =	"2011-05-05",
+  version =	"1.4.0",
+  vernum =	 10400,
+  release =	"2015-10-18",
   author =	"Mike Pall",
   url =		"http://luajit.org/dynasm.html",
   license =	"MIT",
diff --git a/lib/luajit/src/Makefile b/lib/luajit/src/Makefile
index 532da6e94d..9845f6a0e2 100644
--- a/lib/luajit/src/Makefile
+++ b/lib/luajit/src/Makefile
@@ -24,11 +24,13 @@ NODOTABIVER= 51
 # removing the '#' in front of them. Make sure you force a full recompile
 # with "make clean", followed by "make" if you change any options.
 #
+DEFAULT_CC = gcc
+#
 # LuaJIT builds as a native 32 or 64 bit binary by default.
-CC= gcc
+CC= $(DEFAULT_CC)
 #
 # Use this if you want to force a 32 bit build on a 64 bit multilib OS.
-#CC= gcc -m32
+#CC= $(DEFAULT_CC) -m32
 #
 # Since the assembler part does NOT maintain a frame pointer, it's pointless
 # to slow down the C part by not omitting it. Debugging, tracebacks and
@@ -147,6 +149,29 @@ XCFLAGS=
 # You probably don't need to change anything below this line!
 ##############################################################################
 
+##############################################################################
+# Host system detection.
+##############################################################################
+
+ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM))
+  HOST_SYS= Windows
+  HOST_RM= del
+else
+  HOST_SYS:= $(shell uname -s)
+  ifneq (,$(findstring MINGW,$(HOST_SYS)))
+    HOST_SYS= Windows
+    HOST_MSYS= mingw
+  endif
+  ifneq (,$(findstring CYGWIN,$(HOST_SYS)))
+    HOST_SYS= Windows
+    HOST_MSYS= cygwin
+  endif
+  # Use Clang for OSX host.
+  ifeq (Darwin,$(HOST_SYS))
+    DEFAULT_CC= clang
+  endif
+endif
+
 ##############################################################################
 # Flags and options for host and target.
 ##############################################################################
@@ -268,24 +293,9 @@ ifneq (,$(LMULTILIB))
 endif
 
 ##############################################################################
-# System detection.
+# Target system detection.
 ##############################################################################
 
-ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM))
-  HOST_SYS= Windows
-  HOST_RM= del
-else
-  HOST_SYS:= $(shell uname -s)
-  ifneq (,$(findstring MINGW,$(HOST_SYS)))
-    HOST_SYS= Windows
-    HOST_MSYS= mingw
-  endif
-  ifneq (,$(findstring CYGWIN,$(HOST_SYS)))
-    HOST_SYS= Windows
-    HOST_MSYS= cygwin
-  endif
-endif
-
 TARGET_SYS?= $(HOST_SYS)
 ifeq (Windows,$(TARGET_SYS))
   TARGET_STRIP+= --strip-unneeded
diff --git a/lib/luajit/src/host/buildvm_asm.c b/lib/luajit/src/host/buildvm_asm.c
index 9b7ae53a26..9b1194259a 100644
--- a/lib/luajit/src/host/buildvm_asm.c
+++ b/lib/luajit/src/host/buildvm_asm.c
@@ -261,11 +261,20 @@ void emit_asm(BuildCtx *ctx)
 
 #if LJ_TARGET_ARM && defined(__GNUC__) && !LJ_NO_UNWIND
   /* This should really be moved into buildvm_arm.dasc. */
+#if LJ_ARCH_HASFPU
+  fprintf(ctx->fp,
+	  ".fnstart\n"
+	  ".save {r5, r6, r7, r8, r9, r10, r11, lr}\n"
+	  ".vsave {d8-d15}\n"
+	  ".save {r4}\n"
+	  ".pad #28\n");
+#else
   fprintf(ctx->fp,
 	  ".fnstart\n"
 	  ".save {r4, r5, r6, r7, r8, r9, r10, r11, lr}\n"
 	  ".pad #28\n");
 #endif
+#endif
 #if LJ_TARGET_MIPS
   fprintf(ctx->fp, ".set nomips16\n.abicalls\n.set noreorder\n.set nomacro\n");
 #endif
diff --git a/lib/luajit/src/jit/dis_x86.lua b/lib/luajit/src/jit/dis_x86.lua
index 6bc38066fe..a7c05ed6d5 100644
--- a/lib/luajit/src/jit/dis_x86.lua
+++ b/lib/luajit/src/jit/dis_x86.lua
@@ -15,13 +15,12 @@
 -- Intel and AMD manuals. The supported instruction set is quite extensive
 -- and reflects what a current generation Intel or AMD CPU implements in
 -- 32 bit and 64 bit mode. Yes, this includes MMX, SSE, SSE2, SSE3, SSSE3,
--- SSE4.1, SSE4.2, SSE4a and even privileged and hypervisor (VMX/SVM)
--- instructions.
+-- SSE4.1, SSE4.2, SSE4a, AVX, AVX2 and even privileged and hypervisor
+-- (VMX/SVM) instructions.
 --
 -- Notes:
 -- * The (useless) a16 prefix, 3DNow and pre-586 opcodes are unsupported.
 -- * No attempt at optimization has been made -- it's fast enough for my needs.
--- * The public API may change when more architectures are added.
 ------------------------------------------------------------------------------
 
 local type = type
@@ -78,7 +77,7 @@ local map_opc1_32 = {
 "movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi",
 "movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI",
 --Cx
-"shift!Bmu","shift!Vmu","retBw","ret","$lesVrm","$ldsVrm","movBmi","movVmi",
+"shift!Bmu","shift!Vmu","retBw","ret","vex*3$lesVrm","vex*2$ldsVrm","movBmi","movVmi",
 "enterBwu","leave","retfBw","retf","int3","intBu","into","iretVS",
 --Dx
 "shift!Bm1","shift!Vm1","shift!Bmc","shift!Vmc","aamBu","aadBu","salc","xlatb",
@@ -103,7 +102,7 @@ local map_opc1_64 = setmetatable({
   [0x44]="rex*r",  [0x45]="rex*rb",  [0x46]="rex*rx",  [0x47]="rex*rxb",
   [0x48]="rex*w",  [0x49]="rex*wb",  [0x4a]="rex*wx",  [0x4b]="rex*wxb",
   [0x4c]="rex*wr", [0x4d]="rex*wrb", [0x4e]="rex*wrx", [0x4f]="rex*wrxb",
-  [0x82]=false, [0x9a]=false, [0xc4]=false, [0xc5]=false, [0xce]=false,
+  [0x82]=false, [0x9a]=false, [0xc4]="vex*3", [0xc5]="vex*2", [0xce]=false,
   [0xd4]=false, [0xd5]=false, [0xd6]=false, [0xea]=false,
 }, { __index = map_opc1_32 })
 
@@ -114,12 +113,12 @@ local map_opc2 = {
 [0]="sldt!Dmp","sgdt!Ump","larVrm","lslVrm",nil,"syscall","clts","sysret",
 "invd","wbinvd",nil,"ud1",nil,"$prefetch!Bm","femms","3dnowMrmu",
 --1x
-"movupsXrm|movssXrm|movupdXrm|movsdXrm",
-"movupsXmr|movssXmr|movupdXmr|movsdXmr",
+"movupsXrm|movssXrvm|movupdXrm|movsdXrvm",
+"movupsXmr|movssXmvr|movupdXmr|movsdXmvr",
 "movhlpsXrm$movlpsXrm|movsldupXrm|movlpdXrm|movddupXrm",
 "movlpsXmr||movlpdXmr",
-"unpcklpsXrm||unpcklpdXrm",
-"unpckhpsXrm||unpckhpdXrm",
+"unpcklpsXrvm||unpcklpdXrvm",
+"unpckhpsXrvm||unpckhpdXrvm",
 "movlhpsXrm$movhpsXrm|movshdupXrm|movhpdXrm",
 "movhpsXmr||movhpdXmr",
 "$prefetcht!Bm","hintnopVm","hintnopVm","hintnopVm",
@@ -128,7 +127,7 @@ local map_opc2 = {
 "movUmx$","movUmy$","movUxm$","movUym$","movUmz$",nil,"movUzm$",nil,
 "movapsXrm||movapdXrm",
 "movapsXmr||movapdXmr",
-"cvtpi2psXrMm|cvtsi2ssXrVmt|cvtpi2pdXrMm|cvtsi2sdXrVmt",
+"cvtpi2psXrMm|cvtsi2ssXrvVmt|cvtpi2pdXrMm|cvtsi2sdXrvVmt",
 "movntpsXmr|movntssXmr|movntpdXmr|movntsdXmr",
 "cvttps2piMrXm|cvttss2siVrXm|cvttpd2piMrXm|cvttsd2siVrXm",
 "cvtps2piMrXm|cvtss2siVrXm|cvtpd2piMrXm|cvtsd2siVrXm",
@@ -144,27 +143,27 @@ local map_opc2 = {
 "cmovlVrm","cmovgeVrm","cmovleVrm","cmovgVrm",
 --5x
 "movmskpsVrXm$||movmskpdVrXm$","sqrtpsXrm|sqrtssXrm|sqrtpdXrm|sqrtsdXrm",
-"rsqrtpsXrm|rsqrtssXrm","rcppsXrm|rcpssXrm",
-"andpsXrm||andpdXrm","andnpsXrm||andnpdXrm",
-"orpsXrm||orpdXrm","xorpsXrm||xorpdXrm",
-"addpsXrm|addssXrm|addpdXrm|addsdXrm","mulpsXrm|mulssXrm|mulpdXrm|mulsdXrm",
-"cvtps2pdXrm|cvtss2sdXrm|cvtpd2psXrm|cvtsd2ssXrm",
+"rsqrtpsXrm|rsqrtssXrvm","rcppsXrm|rcpssXrvm",
+"andpsXrvm||andpdXrvm","andnpsXrvm||andnpdXrvm",
+"orpsXrvm||orpdXrvm","xorpsXrvm||xorpdXrvm",
+"addpsXrvm|addssXrvm|addpdXrvm|addsdXrvm","mulpsXrvm|mulssXrvm|mulpdXrvm|mulsdXrvm",
+"cvtps2pdXrm|cvtss2sdXrvm|cvtpd2psXrm|cvtsd2ssXrvm",
 "cvtdq2psXrm|cvttps2dqXrm|cvtps2dqXrm",
-"subpsXrm|subssXrm|subpdXrm|subsdXrm","minpsXrm|minssXrm|minpdXrm|minsdXrm",
-"divpsXrm|divssXrm|divpdXrm|divsdXrm","maxpsXrm|maxssXrm|maxpdXrm|maxsdXrm",
+"subpsXrvm|subssXrvm|subpdXrvm|subsdXrvm","minpsXrvm|minssXrvm|minpdXrvm|minsdXrvm",
+"divpsXrvm|divssXrvm|divpdXrvm|divsdXrvm","maxpsXrvm|maxssXrvm|maxpdXrvm|maxsdXrvm",
 --6x
-"punpcklbwPrm","punpcklwdPrm","punpckldqPrm","packsswbPrm",
-"pcmpgtbPrm","pcmpgtwPrm","pcmpgtdPrm","packuswbPrm",
-"punpckhbwPrm","punpckhwdPrm","punpckhdqPrm","packssdwPrm",
-"||punpcklqdqXrm","||punpckhqdqXrm",
+"punpcklbwPrvm","punpcklwdPrvm","punpckldqPrvm","packsswbPrvm",
+"pcmpgtbPrvm","pcmpgtwPrvm","pcmpgtdPrvm","packuswbPrvm",
+"punpckhbwPrvm","punpckhwdPrvm","punpckhdqPrvm","packssdwPrvm",
+"||punpcklqdqXrvm","||punpckhqdqXrvm",
 "movPrVSm","movqMrm|movdquXrm|movdqaXrm",
 --7x
-"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pmu",
-"pshiftd!Pmu","pshiftq!Mmu||pshiftdq!Xmu",
-"pcmpeqbPrm","pcmpeqwPrm","pcmpeqdPrm","emms|",
+"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pvmu",
+"pshiftd!Pvmu","pshiftq!Mvmu||pshiftdq!Xvmu",
+"pcmpeqbPrvm","pcmpeqwPrvm","pcmpeqdPrvm","emms*|",
 "vmreadUmr||extrqXmuu$|insertqXrmuu$","vmwriteUrm||extrqXrm$|insertqXrm$",
 nil,nil,
-"||haddpdXrm|haddpsXrm","||hsubpdXrm|hsubpsXrm",
+"||haddpdXrvm|haddpsXrvm","||hsubpdXrvm|hsubpsXrvm",
 "movVSmMr|movqXrm|movVSmXr","movqMmr|movdquXmr|movdqaXmr",
 --8x
 "joVj","jnoVj","jbVj","jnbVj","jzVj","jnzVj","jbeVj","jaVj",
@@ -182,27 +181,27 @@ nil,nil,
 "bsfVrm","bsrVrm|lzcntVrm|bsrWrm","movsxVrBmt","movsxVrWmt",
 --Cx
 "xaddBmr","xaddVmr",
-"cmppsXrmu|cmpssXrmu|cmppdXrmu|cmpsdXrmu","$movntiVmr|",
-"pinsrwPrWmu","pextrwDrPmu",
-"shufpsXrmu||shufpdXrmu","$cmpxchg!Qmp",
+"cmppsXrvmu|cmpssXrvmu|cmppdXrvmu|cmpsdXrvmu","$movntiVmr|",
+"pinsrwPrvWmu","pextrwDrPmu",
+"shufpsXrvmu||shufpdXrvmu","$cmpxchg!Qmp",
 "bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR",
 --Dx
-"||addsubpdXrm|addsubpsXrm","psrlwPrm","psrldPrm","psrlqPrm",
-"paddqPrm","pmullwPrm",
+"||addsubpdXrvm|addsubpsXrvm","psrlwPrvm","psrldPrvm","psrlqPrvm",
+"paddqPrvm","pmullwPrvm",
 "|movq2dqXrMm|movqXmr|movdq2qMrXm$","pmovmskbVrMm||pmovmskbVrXm",
-"psubusbPrm","psubuswPrm","pminubPrm","pandPrm",
-"paddusbPrm","padduswPrm","pmaxubPrm","pandnPrm",
+"psubusbPrvm","psubuswPrvm","pminubPrvm","pandPrvm",
+"paddusbPrvm","padduswPrvm","pmaxubPrvm","pandnPrvm",
 --Ex
-"pavgbPrm","psrawPrm","psradPrm","pavgwPrm",
-"pmulhuwPrm","pmulhwPrm",
+"pavgbPrvm","psrawPrvm","psradPrvm","pavgwPrvm",
+"pmulhuwPrvm","pmulhwPrvm",
 "|cvtdq2pdXrm|cvttpd2dqXrm|cvtpd2dqXrm","$movntqMmr||$movntdqXmr",
-"psubsbPrm","psubswPrm","pminswPrm","porPrm",
-"paddsbPrm","paddswPrm","pmaxswPrm","pxorPrm",
+"psubsbPrvm","psubswPrvm","pminswPrvm","porPrvm",
+"paddsbPrvm","paddswPrvm","pmaxswPrvm","pxorPrvm",
 --Fx
-"|||lddquXrm","psllwPrm","pslldPrm","psllqPrm",
-"pmuludqPrm","pmaddwdPrm","psadbwPrm","maskmovqMrm||maskmovdquXrm$",
-"psubbPrm","psubwPrm","psubdPrm","psubqPrm",
-"paddbPrm","paddwPrm","padddPrm","ud",
+"|||lddquXrm","psllwPrvm","pslldPrvm","psllqPrvm",
+"pmuludqPrvm","pmaddwdPrvm","psadbwPrvm","maskmovqMrm||maskmovdquXrm$",
+"psubbPrvm","psubwPrvm","psubdPrvm","psubqPrvm",
+"paddbPrvm","paddwPrvm","padddPrvm","ud",
 }
 assert(map_opc2[255] == "ud")
 
@@ -210,49 +209,70 @@ assert(map_opc2[255] == "ud")
 local map_opc3 = {
 ["38"] = { -- [66] 0f 38 xx
 --0x
-[0]="pshufbPrm","phaddwPrm","phadddPrm","phaddswPrm",
-"pmaddubswPrm","phsubwPrm","phsubdPrm","phsubswPrm",
-"psignbPrm","psignwPrm","psigndPrm","pmulhrswPrm",
-nil,nil,nil,nil,
+[0]="pshufbPrvm","phaddwPrvm","phadddPrvm","phaddswPrvm",
+"pmaddubswPrvm","phsubwPrvm","phsubdPrvm","phsubswPrvm",
+"psignbPrvm","psignwPrvm","psigndPrvm","pmulhrswPrvm",
+"||permilpsXrvm","||permilpdXrvm",nil,nil,
 --1x
 "||pblendvbXrma",nil,nil,nil,
-"||blendvpsXrma","||blendvpdXrma",nil,"||ptestXrm",
-nil,nil,nil,nil,
+"||blendvpsXrma","||blendvpdXrma","||permpsXrvm","||ptestXrm",
+"||broadcastssXrm","||broadcastsdXrm","||broadcastf128XrlXm",nil,
 "pabsbPrm","pabswPrm","pabsdPrm",nil,
 --2x
 "||pmovsxbwXrm","||pmovsxbdXrm","||pmovsxbqXrm","||pmovsxwdXrm",
 "||pmovsxwqXrm","||pmovsxdqXrm",nil,nil,
-"||pmuldqXrm","||pcmpeqqXrm","||$movntdqaXrm","||packusdwXrm",
-nil,nil,nil,nil,
+"||pmuldqXrvm","||pcmpeqqXrvm","||$movntdqaXrm","||packusdwXrvm",
+"||maskmovpsXrvm","||maskmovpdXrvm","||maskmovpsXmvr","||maskmovpdXmvr",
 --3x
 "||pmovzxbwXrm","||pmovzxbdXrm","||pmovzxbqXrm","||pmovzxwdXrm",
-"||pmovzxwqXrm","||pmovzxdqXrm",nil,"||pcmpgtqXrm",
-"||pminsbXrm","||pminsdXrm","||pminuwXrm","||pminudXrm",
-"||pmaxsbXrm","||pmaxsdXrm","||pmaxuwXrm","||pmaxudXrm",
+"||pmovzxwqXrm","||pmovzxdqXrm","||permdXrvm","||pcmpgtqXrvm",
+"||pminsbXrvm","||pminsdXrvm","||pminuwXrvm","||pminudXrvm",
+"||pmaxsbXrvm","||pmaxsdXrvm","||pmaxuwXrvm","||pmaxudXrvm",
 --4x
-"||pmulddXrm","||phminposuwXrm",
+"||pmulddXrvm","||phminposuwXrm",nil,nil,
+nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm",
+--5x
+[0x58] = "||pbroadcastdXrlXm",[0x59] = "||pbroadcastqXrlXm",
+[0x5a] = "||broadcasti128XrlXm",
+--7x
+[0x78] = "||pbroadcastbXrlXm",[0x79] = "||pbroadcastwXrlXm",
+--8x
+[0x8c] = "||pmaskmovXrvVSm",
+[0x8e] = "||pmaskmovVSmXvr",
+--Dx
+[0xdc] = "||aesencXrvm", [0xdd] = "||aesenclastXrvm",
+[0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm",
 --Fx
 [0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt",
 },
 
 ["3a"] = { -- [66] 0f 3a xx
 --0x
-[0x00]=nil,nil,nil,nil,nil,nil,nil,nil,
-"||roundpsXrmu","||roundpdXrmu","||roundssXrmu","||roundsdXrmu",
-"||blendpsXrmu","||blendpdXrmu","||pblendwXrmu","palignrPrmu",
+[0x00]="||permqXrmu","||permpdXrmu","||pblenddXrvmu",nil,
+"||permilpsXrmu","||permilpdXrmu","||perm2f128Xrvmu",nil,
+"||roundpsXrmu","||roundpdXrmu","||roundssXrvmu","||roundsdXrvmu",
+"||blendpsXrvmu","||blendpdXrvmu","||pblendwXrvmu","palignrPrvmu",
 --1x
 nil,nil,nil,nil,
 "||pextrbVmXru","||pextrwVmXru","||pextrVmSXru","||extractpsVmXru",
-nil,nil,nil,nil,nil,nil,nil,nil,
+"||insertf128XrvlXmu","||extractf128XlXmYru",nil,nil,
+nil,nil,nil,nil,
 --2x
-"||pinsrbXrVmu","||insertpsXrmu","||pinsrXrVmuS",nil,
+"||pinsrbXrvVmu","||insertpsXrvmu","||pinsrXrvVmuS",nil,
+--3x
+[0x38] = "||inserti128Xrvmu",[0x39] = "||extracti128XlXmYru",
 --4x
-[0x40] = "||dppsXrmu",
-[0x41] = "||dppdXrmu",
-[0x42] = "||mpsadbwXrmu",
+[0x40] = "||dppsXrvmu",
+[0x41] = "||dppdXrvmu",
+[0x42] = "||mpsadbwXrvmu",
+[0x44] = "||pclmulqdqXrvmu",
+[0x46] = "||perm2i128Xrvmu",
+[0x4a] = "||blendvpsXrvmb",[0x4b] = "||blendvpdXrvmb",
+[0x4c] = "||pblendvbXrvmb",
 --6x
 [0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu",
 [0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu",
+[0xdf] = "||aeskeygenassistXrmu",
 },
 }
 
@@ -356,17 +376,19 @@ local map_regs = {
 	"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }, -- No x64 ext!
   X = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
 	"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" },
+  Y = { "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
+	"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" },
 }
 local map_segregs = { "es", "cs", "ss", "ds", "fs", "gs", "segr6", "segr7" }
 
 -- Maps for size names.
 local map_sz2n = {
-  B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16,
+  B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16, Y = 32,
 }
 local map_sz2prefix = {
   B = "byte", W = "word", D = "dword",
   Q = "qword",
-  M = "qword", X = "xword",
+  M = "qword", X = "xword", Y = "yword",
   F = "dword", G = "qword", -- No need for sizes/register names for these two.
 }
 
@@ -389,10 +411,13 @@ local function putop(ctx, text, operands)
   if ctx.rep then text = ctx.rep.." "..text; ctx.rep = false end
   if ctx.rex then
     local t = (ctx.rexw and "w" or "")..(ctx.rexr and "r" or "")..
-	      (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")
-    if t ~= "" then text = "rex."..t.." "..text end
+	      (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")..
+	      (ctx.vexl and "l" or "")
+    if ctx.vexv and ctx.vexv ~= 0 then t = t.."v"..ctx.vexv end
+    if t ~= "" then text = ctx.rex.."."..t.." "..text
+    elseif ctx.rex == "vex" then text = "v"..text end
     ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
-    ctx.rex = false
+    ctx.rex = false; ctx.vexl = false; ctx.vexv = false
   end
   if ctx.seg then
     local text2, n = gsub(text, "%[", "["..ctx.seg..":")
@@ -407,6 +432,7 @@ local function putop(ctx, text, operands)
   end
   ctx.out(format("%08x  %s%s\n", ctx.addr+ctx.start, hex, text))
   ctx.mrm = false
+  ctx.vexv = false
   ctx.start = pos
   ctx.imm = nil
 end
@@ -415,7 +441,7 @@ end
 local function clearprefixes(ctx)
   ctx.o16 = false; ctx.seg = false; ctx.lock = false; ctx.rep = false
   ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
-  ctx.rex = false; ctx.a32 = false
+  ctx.rex = false; ctx.a32 = false; ctx.vexl = false
 end
 
 -- Fallback for incomplete opcodes at the end.
@@ -452,9 +478,9 @@ end
 -- Process pattern string and generate the operands.
 local function putpat(ctx, name, pat)
   local operands, regs, sz, mode, sp, rm, sc, rx, sdisp
-  local code, pos, stop = ctx.code, ctx.pos, ctx.stop
+  local code, pos, stop, vexl = ctx.code, ctx.pos, ctx.stop, ctx.vexl
 
-  -- Chars used: 1DFGIMPQRSTUVWXacdfgijmoprstuwxyz
+  -- Chars used: 1DFGIMPQRSTUVWXYabcdfgijlmoprstuvwxyz
   for p in gmatch(pat, ".") do
     local x = nil
     if p == "V" or p == "U" then
@@ -469,11 +495,13 @@ local function putpat(ctx, name, pat)
     elseif p == "B" then
       sz = "B"
       regs = ctx.rex and map_regs.B64 or map_regs.B
-    elseif match(p, "[WDQMXFG]") then
+    elseif match(p, "[WDQMXYFG]") then
       sz = p
+      if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end
       regs = map_regs[sz]
     elseif p == "P" then
       sz = ctx.o16 and "X" or "M"; ctx.o16 = false
+      if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end
       regs = map_regs[sz]
     elseif p == "S" then
       name = name..lower(sz)
@@ -486,6 +514,10 @@ local function putpat(ctx, name, pat)
       local imm = getimm(ctx, pos, 1); if not imm then return end
       x = format("0x%02x", imm)
       pos = pos+1
+    elseif p == "b" then
+      local imm = getimm(ctx, pos, 1); if not imm then return end
+      x = regs[imm/16+1]
+      pos = pos+1
     elseif p == "w" then
       local imm = getimm(ctx, pos, 2); if not imm then return end
       x = format("0x%x", imm)
@@ -618,8 +650,13 @@ local function putpat(ctx, name, pat)
 	else
 	  x = "CR"..sp
 	end
+      elseif p == "v" then
+	if ctx.vexv then
+	  x = regs[ctx.vexv+1]; ctx.vexv = false
+	end
       elseif p == "y" then x = "DR"..sp
       elseif p == "z" then x = "TR"..sp
+      elseif p == "l" then vexl = false
       elseif p == "t" then
       else
 	error("bad pattern `"..pat.."'")
@@ -694,7 +731,7 @@ map_act = {
   B = putpat, W = putpat, D = putpat, Q = putpat,
   V = putpat, U = putpat, T = putpat,
   M = putpat, X = putpat, P = putpat,
-  F = putpat, G = putpat,
+  F = putpat, G = putpat, Y = putpat,
 
   -- Collect prefixes.
   [":"] = function(ctx, name, pat)
@@ -755,15 +792,68 @@ map_act = {
 
   -- REX prefix.
   rex = function(ctx, name, pat)
-    if ctx.rex then return unknown(ctx) end -- Only 1 REX prefix allowed.
+    if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed.
     for p in gmatch(pat, ".") do ctx["rex"..p] = true end
-    ctx.rex = true
+    ctx.rex = "rex"
+  end,
+
+  -- VEX prefix: decode the payload byte(s) into ctx flags, then dispatch the opcode.
+  vex = function(ctx, name, pat)
+    if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed.
+    ctx.rex = "vex" -- putop uses this string when printing the prefix.
+    local pos = ctx.pos
+    if ctx.mrm then -- NOTE(review): presumably a ModRM byte was pre-fetched; confirm.
+      ctx.mrm = nil
+      pos = pos-1 -- Step back one byte so it is re-read as VEX payload below.
+    end
+    local b = byte(ctx.code, pos, pos) -- First payload byte.
+    if not b then return incomplete(ctx) end
+    pos = pos+1
+    if b < 128 then ctx.rexr = true end -- Bit 7 clear sets R (stored inverted).
+    local m = 1 -- Map selector; stays 1 (map_opc2) unless pat == "3".
+    if pat == "3" then
+      m = b%32; b = (b-m)/32 -- Low 5 bits select the map; keep the top 3 bits.
+      local nb = b%2; b = (b-nb)/2 -- Bit 5 of the payload byte.
+      if nb == 0 then ctx.rexb = true end -- Clear bit sets B (stored inverted).
+      local nx = b%2; b = (b-nx)/2 -- Bit 6 of the payload byte.
+      if nx == 0 then ctx.rexx = true end -- Clear bit sets X (stored inverted).
+      b = byte(ctx.code, pos, pos) -- Second payload byte.
+      if not b then return incomplete(ctx) end
+      pos = pos+1
+      if b >= 128 then ctx.rexw = true end -- Bit 7 set sets W.
+    end
+    ctx.pos = pos
+    local map
+    if m == 1 then map = map_opc2
+    elseif m == 2 then map = map_opc3["38"]
+    elseif m == 3 then map = map_opc3["3a"]
+    else return unknown(ctx) end -- Any other map selector is not decoded.
+    local p = b%4; b = (b-p)/4 -- Low 2 bits pick an implied o16/rep/repne prefix.
+    if p == 1 then ctx.o16 = "o16"
+    elseif p == 2 then ctx.rep = "rep"
+    elseif p == 3 then ctx.rep = "repne" end
+    local l = b%2; b = (b-l)/2 -- Next bit is L; putpat widens X operands to Y when set.
+    if l ~= 0 then ctx.vexl = true end
+    ctx.vexv = (-1-b)%16 -- Low 4 bits of the rest, inverted: 0-based index for "v".
+    return dispatchmap(ctx, map)
+  end,
 
   -- Special case for nop with REX prefix.
   nop = function(ctx, name, pat)
     return dispatch(ctx, ctx.rex and pat or "nop")
   end,
+
+  -- Special case for 0F 77: emms without VEX, zeroall/zeroupper with VEX.
+  emms = function(ctx, name, pat)
+    if ctx.rex ~= "vex" then
+      return putop(ctx, "emms")
+    elseif ctx.vexl then
+      ctx.vexl = false -- Clear L so putop does not print it as a prefix flag.
+      return putop(ctx, "zeroall") -- putop adds the "v" if no prefix flags are pending.
+    else
+      return putop(ctx, "zeroupper") -- putop adds the "v" if no prefix flags are pending.
+    end
+  end,
 }
 
 ------------------------------------------------------------------------------
diff --git a/lib/luajit/src/jit/dump.lua b/lib/luajit/src/jit/dump.lua
index c52d0f217e..b1cdcfe294 100644
--- a/lib/luajit/src/jit/dump.lua
+++ b/lib/luajit/src/jit/dump.lua
@@ -75,9 +75,6 @@ local bcline, disass
 -- Active flag, output file handle and dump mode.
 local active, out, dumpmode
 
--- Information about traces that is remembered for future reference.
-local info = {}
-
 ------------------------------------------------------------------------------
 
 local symtabmt = { __index = false }
@@ -553,7 +550,6 @@ local function dump_trace(what, tr, func, pc, otr, oex)
     if dumpmode.m then dump_mcode(tr) end
   end
   if what == "start" then
-    info[tr] = { func = func, pc = pc, otr = otr, oex = oex }
     if dumpmode.H then out:write('<pre class="ljdump">\n') end
     out:write("---- TRACE ", tr, " ", what)
     if otr then out:write(" ", otr, "/", oex) end
@@ -575,6 +571,7 @@ local function dump_trace(what, tr, func, pc, otr, oex)
     end
     if dumpmode.H then out:write("</pre>\n\n") else out:write("\n") end
   else
+    if what == "flush" then symtab, nexitsym = {}, 0 end
     out:write("---- TRACE ", what, "\n\n")
   end
   out:flush()
@@ -705,7 +702,6 @@ end
 return {
   on = dumpon,
   off = dumpoff,
-  start = dumpon, -- For -j command line option.
-  info = info
+  start = dumpon -- For -j command line option.
 }
 
diff --git a/lib/luajit/src/jit/p.lua b/lib/luajit/src/jit/p.lua
index d894bb7d2c..97d4ccdf87 100644
--- a/lib/luajit/src/jit/p.lua
+++ b/lib/luajit/src/jit/p.lua
@@ -36,7 +36,6 @@
 --   G  Produce raw output suitable for graphical tools (e.g. flame graphs).
 --   m<number> Minimum sample percentage to be shown. Default: 3.
 --   i<number> Sampling interval in milliseconds. Default: 10.
---   S[<string>] Events source if performace events are enabled
 --
 ----------------------------------------------------------------------------
 
@@ -45,8 +44,6 @@ local jit = require("jit")
 assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
 local profile = require("jit.profile")
 local vmdef = require("jit.vmdef")
-local jutil = require("jit.util")
-local dump = require("jit.dump")
 local math = math
 local pairs, ipairs, tonumber, floor = pairs, ipairs, tonumber, math.floor
 local sort, format = table.sort, string.format
@@ -77,38 +74,7 @@ local function prof_cb(th, samples, vmmode)
   -- Collect keys for sample.
   if prof_states then
     if prof_states == "v" then
-      if map_vmmode[vmmode] then
-        key_state = map_vmmode[vmmode]
-      else
-         -- Sampling a trace: make an understandable one-line description.
-         local tr = tonumber(vmmode)
-         local info = jutil.traceinfo(tr)
-         local extra = dump.info[tr]
-         -- Show the parent of this trace (if this is a side trace)
-         local parent = ""
-         if extra and extra.otr and extra.oex then
-            parent = "("..extra.otr.."/"..extra.oex..")"
-         end
-         -- Show what the end of the trace links to (e.g. loop or other trace)
-         local lnk = ""
-         local link, ltype = info.link, info.linktype
-         if     link == tr or link == 0 then lnk = "->"..ltype
-         elseif ltype == "root"         then lnk = "->"..link
-         else                                lnk = "->"..link.." "..ltype end
-         -- Show the current zone (if zone profiling is enabled)
-         local z = ""
-         if zone and zone:get() then
-            z = (" %-16s"):format(zone:get())
-         end
-         -- Show the source location where the trace starts
-         local loc = ""
-         if extra and extra.func then
-            local fi = jutil.funcinfo(extra.func, extra.pc)
-            if fi.loc then loc = fi.loc end
-         end
-         local s = ("TRACE %3d %-8s %-10s%s %s"):format(vmmode, parent, lnk, z, loc)
-         key_state = map_vmmode[vmmode] or s
-      end
+      key_state = map_vmmode[vmmode] or vmmode
     else
       key_state = zone:get() or "(none)"
     end
@@ -277,18 +243,15 @@ end
 -- Start profiling.
 local function prof_start(mode)
   local interval = ""
-  mode = mode:gsub("i%d+", function(s) interval = s; return "" end)
+  mode = mode:gsub("i%d*", function(s) interval = s; return "" end)
   prof_min = 3
   mode = mode:gsub("m(%d+)", function(s) prof_min = tonumber(s); return "" end)
   prof_depth = 1
   mode = mode:gsub("%-?%d+", function(s) prof_depth = tonumber(s); return "" end)
-  local flavour = "S[vanilla]"
-  mode = mode:gsub("S%[.+%]", function(s) flavour = s; return "" end)
-
   local m = {}
   for c in mode:gmatch(".") do m[c] = c end
-  prof_states = m.v or m.z
-  if m.z == "z" then zone = require("jit.zone") end
+  prof_states = m.z or m.v
+  if prof_states == "z" then zone = require("jit.zone") end
   local scope = m.l or m.f or m.F or (prof_states and "" or "f")
   local flags = (m.p or "")
   prof_raw = m.r
@@ -322,7 +285,7 @@ local function prof_start(mode)
   prof_count1 = {}
   prof_count2 = {}
   prof_samples = 0
-  profile.start(scope:lower()..interval..flavour, prof_cb)
+  profile.start(scope:lower()..interval, prof_cb)
   prof_ud = newproxy(true)
   getmetatable(prof_ud).__gc = prof_finish
 end
diff --git a/lib/luajit/src/lib_base.c b/lib/luajit/src/lib_base.c
index 887fea7a58..ca268b1d07 100644
--- a/lib/luajit/src/lib_base.c
+++ b/lib/luajit/src/lib_base.c
@@ -435,13 +435,13 @@ LJLIB_CF(gcinfo)
 LJLIB_CF(collectgarbage)
 {
   int opt = lj_lib_checkopt(L, 1, LUA_GCCOLLECT,  /* ORDER LUA_GC* */
-    "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul");
+    "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul\1\377\11isrunning");
   int32_t data = lj_lib_optint(L, 2, 0);
   if (opt == LUA_GCCOUNT) {
     setnumV(L->top, (lua_Number)G(L)->gc.total/1024.0);
   } else {
     int res = lua_gc(L, opt, data);
-    if (opt == LUA_GCSTEP)
+    if (opt == LUA_GCSTEP || opt == LUA_GCISRUNNING)
       setboolV(L->top, res);
     else
       setintV(L->top, res);
diff --git a/lib/luajit/src/lib_ffi.c b/lib/luajit/src/lib_ffi.c
index b2b2d37ff7..7be624b42d 100644
--- a/lib/luajit/src/lib_ffi.c
+++ b/lib/luajit/src/lib_ffi.c
@@ -505,10 +505,7 @@ LJLIB_CF(ffi_new)	LJLIB_REC(.)
   }
   if (sz == CTSIZE_INVALID)
     lj_err_arg(L, 1, LJ_ERR_FFI_INVSIZE);
-  if (!(info & CTF_VLA) && ctype_align(info) <= CT_MEMALIGN)
-    cd = lj_cdata_new(cts, id, sz);
-  else
-    cd = lj_cdata_newv(L, id, sz, ctype_align(info));
+  cd = lj_cdata_newx(cts, id, sz, info);
   setcdataV(L, o-1, cd);  /* Anchor the uninitialized cdata. */
   lj_cconv_ct_init(cts, ct, sz, cdataptr(cd),
 		   o, (MSize)(L->top - o));  /* Initialize cdata. */
diff --git a/lib/luajit/src/lib_jit.c b/lib/luajit/src/lib_jit.c
index 2227d198c5..178ef249df 100644
--- a/lib/luajit/src/lib_jit.c
+++ b/lib/luajit/src/lib_jit.c
@@ -299,9 +299,6 @@ LJLIB_CF(jit_util_traceinfo)
     setintfield(L, t, "nk", REF_BIAS - (int32_t)T->nk);
     setintfield(L, t, "link", T->link);
     setintfield(L, t, "nexit", T->nsnap);
-    setintfield(L, t, "szmcode", T->szmcode);
-    setintfield(L, t, "mcode", (int32_t)(intptr_t)T->mcode);
-    setintfield(L, t, "mcloop", T->mcloop);
     setstrV(L, L->top++, lj_str_newz(L, jit_trlinkname[T->linktype]));
     lua_setfield(L, -2, "linktype");
     /* There are many more fields. Add them only when needed. */
@@ -558,10 +555,7 @@ static void jit_profile_callback(lua_State *L2, lua_State *L, int samples,
     setfuncV(L2, L2->top++, funcV(tv));
     setthreadV(L2, L2->top++, L);
     setintV(L2->top++, samples);
-    if (vmstate >= 256)
-      setintV(L2->top++, vmstate-256);
-    else
-      setstrV(L2, L2->top++, lj_str_new(L2, &vmst, 1));
+    setstrV(L2, L2->top++, lj_str_new(L2, &vmst, 1));
     status = lua_pcall(L2, 3, 0, 0);  /* callback(thread, samples, vmstate) */
     if (status) {
       if (G(L2)->panic) G(L2)->panic(L2);
diff --git a/lib/luajit/src/lib_os.c b/lib/luajit/src/lib_os.c
index 7b5873a518..37d7d5be61 100644
--- a/lib/luajit/src/lib_os.c
+++ b/lib/luajit/src/lib_os.c
@@ -39,7 +39,7 @@
 
 LJLIB_CF(os_execute)
 {
-#if LJ_TARGET_CONSOLE
+#if LJ_NO_SYSTEM
 #if LJ_52
   errno = ENOSYS;
   return luaL_fileresult(L, 0, NULL);
diff --git a/lib/luajit/src/lj.supp b/lib/luajit/src/lj.supp
index 411f261700..acb9e789d0 100644
--- a/lib/luajit/src/lj.supp
+++ b/lib/luajit/src/lj.supp
@@ -24,3 +24,18 @@
    Memcheck:Cond
    fun:lj_str_new
 }
+{
+   Optimized string compare
+   Memcheck:Addr4
+   fun:lj_str_fastcmp
+}
+{
+   Optimized string compare
+   Memcheck:Addr1
+   fun:lj_str_fastcmp
+}
+{
+   Optimized string compare
+   Memcheck:Cond
+   fun:lj_str_fastcmp
+}
diff --git a/lib/luajit/src/lj_alloc.c b/lib/luajit/src/lj_alloc.c
index 0aad826d36..ddd50cae4f 100644
--- a/lib/luajit/src/lj_alloc.c
+++ b/lib/luajit/src/lj_alloc.c
@@ -196,7 +196,7 @@ static LJ_AINLINE void *CALL_MMAP(size_t size)
   return ptr;
 }
 
-#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__)
+#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__) || defined(__CYGWIN__)
 
 /* OSX and FreeBSD mmap() use a naive first-fit linear search.
 ** That's perfect for us. Except that -pagezero_size must be set for OSX,
diff --git a/lib/luajit/src/lj_api.c b/lib/luajit/src/lj_api.c
index 1f09284f99..042b0d9c8d 100644
--- a/lib/luajit/src/lj_api.c
+++ b/lib/luajit/src/lj_api.c
@@ -1188,6 +1188,9 @@ LUA_API int lua_gc(lua_State *L, int what, int data)
     res = (int)(g->gc.stepmul);
     g->gc.stepmul = (MSize)data;
     break;
+  case LUA_GCISRUNNING:
+    res = (g->gc.threshold != LJ_MAX_MEM);
+    break;
   default:
     res = -1;  /* Invalid option. */
   }
diff --git a/lib/luajit/src/lj_arch.h b/lib/luajit/src/lj_arch.h
index f1e7d7f45c..a114bdda53 100644
--- a/lib/luajit/src/lj_arch.h
+++ b/lib/luajit/src/lj_arch.h
@@ -155,7 +155,11 @@
 #define LJ_ARCH_NAME		"x64"
 #define LJ_ARCH_BITS		64
 #define LJ_ARCH_ENDIAN		LUAJIT_LE
-#define LJ_ABI_WIN		LJ_TARGET_WINDOWS
+#if LJ_TARGET_WINDOWS || __CYGWIN__
+#define LJ_ABI_WIN		1
+#else
+#define LJ_ABI_WIN		0
+#endif
 #define LJ_TARGET_X64		1
 #define LJ_TARGET_X86ORX64	1
 #define LJ_TARGET_EHRETREG	0
@@ -300,6 +304,13 @@
 #define LJ_TARGET_UNIFYROT	2	/* Want only IR_BROR. */
 #define LJ_ARCH_NUMMODE		LJ_NUMMODE_SINGLE
 
+#if !defined(LJ_ARCH_HASFPU) && defined(__mips_soft_float)
+#define LJ_ARCH_HASFPU		0
+#endif
+#if !defined(LJ_ABI_SOFTFP) && defined(__mips_soft_float)
+#define LJ_ABI_SOFTFP		1
+#endif
+
 #if _MIPS_ARCH_MIPS32R2
 #define LJ_ARCH_VERSION		20
 #else
@@ -382,9 +393,6 @@
 #error "No support for PPC/e500 anymore (use LuaJIT 2.0)"
 #endif
 #elif LJ_TARGET_MIPS
-#if defined(__mips_soft_float)
-#error "No support for MIPS CPUs without FPU"
-#endif
 #if defined(_LP64)
 #error "No support for MIPS64"
 #endif
@@ -494,6 +502,9 @@
 #if defined(__symbian__) || LJ_TARGET_WINDOWS
 #define LUAJIT_NO_EXP2
 #endif
+#if LJ_TARGET_CONSOLE || (LJ_TARGET_IOS && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0)
+#define LJ_NO_SYSTEM		1
+#endif
 
 #if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4
 #define LJ_NO_UNWIND		1
diff --git a/lib/luajit/src/lj_ccall.c b/lib/luajit/src/lj_ccall.c
index 5ab5b60daa..2dda540510 100644
--- a/lib/luajit/src/lj_ccall.c
+++ b/lib/luajit/src/lj_ccall.c
@@ -418,6 +418,18 @@
   /* Complex values are returned in 1 or 2 FPRs. */ \
   cc->retref = 0;
 
+#if LJ_ABI_SOFTFP
+#define CCALL_HANDLE_COMPLEXRET2 \
+  if (ctr->size == 2*sizeof(float)) {  /* Copy complex float from GPRs. */ \
+    ((intptr_t *)dp)[0] = cc->gpr[0]; \
+    ((intptr_t *)dp)[1] = cc->gpr[1]; \
+  } else {  /* Copy complex double from GPRs. */ \
+    ((intptr_t *)dp)[0] = cc->gpr[0]; \
+    ((intptr_t *)dp)[1] = cc->gpr[1]; \
+    ((intptr_t *)dp)[2] = cc->gpr[2]; \
+    ((intptr_t *)dp)[3] = cc->gpr[3]; \
+  }
+#else
 #define CCALL_HANDLE_COMPLEXRET2 \
   if (ctr->size == 2*sizeof(float)) {  /* Copy complex float from FPRs. */ \
     ((float *)dp)[0] = cc->fpr[0].f; \
@@ -426,6 +438,7 @@
     ((double *)dp)[0] = cc->fpr[0].d; \
     ((double *)dp)[1] = cc->fpr[1].d; \
   }
+#endif
 
 #define CCALL_HANDLE_STRUCTARG \
   /* Pass all structs by value in registers and/or on the stack. */
@@ -433,6 +446,22 @@
 #define CCALL_HANDLE_COMPLEXARG \
   /* Pass complex by value in 2 or 4 GPRs. */
 
+#define CCALL_HANDLE_GPR \
+  if ((d->info & CTF_ALIGN) > CTALIGN_PTR) \
+    ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
+  if (ngpr < maxgpr) { \
+    dp = &cc->gpr[ngpr]; \
+    if (ngpr + n > maxgpr) { \
+     nsp += ngpr + n - maxgpr;  /* Assumes contiguous gpr/stack fields. */ \
+     if (nsp > CCALL_MAXSTACK) goto err_nyi;  /* Too many arguments. */ \
+     ngpr = maxgpr; \
+    } else { \
+     ngpr += n; \
+    } \
+    goto done; \
+  }
+
+#if !LJ_ABI_SOFTFP	/* MIPS32 hard-float */
 #define CCALL_HANDLE_REGARG \
   if (isfp && nfpr < CCALL_NARG_FPR && !(ct->info & CTF_VARARG)) { \
     /* Try to pass argument in FPRs. */ \
@@ -441,24 +470,18 @@
     goto done; \
   } else {  /* Try to pass argument in GPRs. */ \
     nfpr = CCALL_NARG_FPR; \
-    if ((d->info & CTF_ALIGN) > CTALIGN_PTR) \
-      ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
-    if (ngpr < maxgpr) { \
-      dp = &cc->gpr[ngpr]; \
-      if (ngpr + n > maxgpr) { \
-	nsp += ngpr + n - maxgpr;  /* Assumes contiguous gpr/stack fields. */ \
-	if (nsp > CCALL_MAXSTACK) goto err_nyi;  /* Too many arguments. */ \
-	ngpr = maxgpr; \
-      } else { \
-	ngpr += n; \
-      } \
-      goto done; \
-    } \
+    CCALL_HANDLE_GPR \
   }
+#else			/* MIPS32 soft-float */
+#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR
+#endif
 
+#if !LJ_ABI_SOFTFP
+/* On MIPS64 soft-float, position of float return values is endian-dependent. */
 #define CCALL_HANDLE_RET \
   if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
     sp = (uint8_t *)&cc->fpr[0].f;
+#endif
 
 #else
 #error "Missing calling convention definitions for this architecture"
diff --git a/lib/luajit/src/lj_ccall.h b/lib/luajit/src/lj_ccall.h
index 91983feebd..8b0e796bfc 100644
--- a/lib/luajit/src/lj_ccall.h
+++ b/lib/luajit/src/lj_ccall.h
@@ -98,9 +98,9 @@ typedef double FPRArg;
 #elif LJ_TARGET_MIPS
 
 #define CCALL_NARG_GPR		4
-#define CCALL_NARG_FPR		2
+#define CCALL_NARG_FPR		(LJ_ABI_SOFTFP ? 0 : 2)
 #define CCALL_NRET_GPR		2
-#define CCALL_NRET_FPR		2
+#define CCALL_NRET_FPR		(LJ_ABI_SOFTFP ? 0 : 2)
 #define CCALL_SPS_EXTRA		7
 #define CCALL_SPS_FREE		1
 
diff --git a/lib/luajit/src/lj_ccallback.c b/lib/luajit/src/lj_ccallback.c
index 065c329fa7..539c9e3da4 100644
--- a/lib/luajit/src/lj_ccallback.c
+++ b/lib/luajit/src/lj_ccallback.c
@@ -427,6 +427,15 @@ void lj_ccallback_mcode_free(CTState *cts)
 
 #elif LJ_TARGET_MIPS
 
+#define CALLBACK_HANDLE_GPR \
+  if (n > 1) ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
+  if (ngpr + n <= maxgpr) { \
+    sp = &cts->cb.gpr[ngpr]; \
+    ngpr += n; \
+    goto done; \
+  }
+
+#if !LJ_ABI_SOFTFP	/* MIPS32 hard-float */
 #define CALLBACK_HANDLE_REGARG \
   if (isfp && nfpr < CCALL_NARG_FPR) {  /* Try to pass argument in FPRs. */ \
     sp = (void *)((uint8_t *)&cts->cb.fpr[nfpr] + ((LJ_BE && n==1) ? 4 : 0)); \
@@ -434,13 +443,13 @@ void lj_ccallback_mcode_free(CTState *cts)
     goto done; \
   } else {  /* Try to pass argument in GPRs. */ \
     nfpr = CCALL_NARG_FPR; \
-    if (n > 1) ngpr = (ngpr + 1u) & ~1u;  /* Align to regpair. */ \
-    if (ngpr + n <= maxgpr) { \
-      sp = &cts->cb.gpr[ngpr]; \
-      ngpr += n; \
-      goto done; \
-    } \
+    CALLBACK_HANDLE_GPR \
   }
+#else			/* MIPS32 soft-float */
+#define CALLBACK_HANDLE_REGARG \
+  CALLBACK_HANDLE_GPR \
+  UNUSED(isfp);
+#endif
 
 #define CALLBACK_HANDLE_RET \
   if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
diff --git a/lib/luajit/src/lj_cdata.c b/lib/luajit/src/lj_cdata.c
index 5cd2c1140e..30d788e4c9 100644
--- a/lib/luajit/src/lj_cdata.c
+++ b/lib/luajit/src/lj_cdata.c
@@ -49,6 +49,15 @@ GCcdata *lj_cdata_newv(lua_State *L, CTypeID id, CTSize sz, CTSize align)
   return cd;
 }
 
+/* Allocate arbitrary C data object. Picks the allocator from the type info. */
+GCcdata *lj_cdata_newx(CTState *cts, CTypeID id, CTSize sz, CTInfo info)
+{
+  if (!(info & CTF_VLA) && ctype_align(info) <= CT_MEMALIGN)
+    return lj_cdata_new(cts, id, sz);  /* Not a VLA, alignment <= CT_MEMALIGN. */
+  else
+    return lj_cdata_newv(cts->L, id, sz, ctype_align(info));  /* VLA or larger alignment. */
+}
+
 /* Free a C data object. */
 void LJ_FASTCALL lj_cdata_free(global_State *g, GCcdata *cd)
 {
diff --git a/lib/luajit/src/lj_cdata.h b/lib/luajit/src/lj_cdata.h
index c8975be1c9..0891c33c80 100644
--- a/lib/luajit/src/lj_cdata.h
+++ b/lib/luajit/src/lj_cdata.h
@@ -60,6 +60,8 @@ static LJ_AINLINE GCcdata *lj_cdata_new_(lua_State *L, CTypeID id, CTSize sz)
 LJ_FUNC GCcdata *lj_cdata_newref(CTState *cts, const void *pp, CTypeID id);
 LJ_FUNC GCcdata *lj_cdata_newv(lua_State *L, CTypeID id, CTSize sz,
 			       CTSize align);
+LJ_FUNC GCcdata *lj_cdata_newx(CTState *cts, CTypeID id, CTSize sz,
+			       CTInfo info);
 
 LJ_FUNC void LJ_FASTCALL lj_cdata_free(global_State *g, GCcdata *cd);
 LJ_FUNC void lj_cdata_setfin(lua_State *L, GCcdata *cd, GCobj *obj,
diff --git a/lib/luajit/src/lj_ctype.c b/lib/luajit/src/lj_ctype.c
index 2e23c994bb..eda070ce1e 100644
--- a/lib/luajit/src/lj_ctype.c
+++ b/lib/luajit/src/lj_ctype.c
@@ -38,6 +38,8 @@
   _("uint64_t",			UINT64) \
   _("intptr_t",			INT_PSZ) \
   _("uintptr_t",		UINT_PSZ) \
+  /* From POSIX. */ \
+  _("ssize_t",			INT_PSZ) \
   /* End of typedef list. */
 
 /* Keywords (only the ones we actually care for). */
diff --git a/lib/luajit/src/lj_dispatch.h b/lib/luajit/src/lj_dispatch.h
index 1e247e3828..73d00ec00c 100644
--- a/lib/luajit/src/lj_dispatch.h
+++ b/lib/luajit/src/lj_dispatch.h
@@ -14,6 +14,21 @@
 
 #if LJ_TARGET_MIPS
 /* Need our own global offset table for the dreaded MIPS calling conventions. */
+#if LJ_SOFTFP
+extern double __adddf3(double a, double b);
+extern double __subdf3(double a, double b);
+extern double __muldf3(double a, double b);
+extern double __divdf3(double a, double b);
+extern void __ledf2(double a, double b);
+extern double __floatsidf(int32_t a);
+extern int32_t __fixdfsi(double a);
+
+#define SFGOTDEF(_) \
+  _(lj_num2bit) _(sqrt) _(__adddf3) _(__subdf3) _(__muldf3) _(__divdf3) _(__ledf2) \
+  _(__floatsidf) _(__fixdfsi)
+#else
+#define SFGOTDEF(_)
+#endif
 #if LJ_HASJIT
 #define JITGOTDEF(_)	_(lj_trace_exit) _(lj_trace_hot)
 #else
@@ -39,7 +54,8 @@
   _(lj_str_new) _(lj_tab_dup) _(lj_tab_get) _(lj_tab_getinth) _(lj_tab_len) \
   _(lj_tab_new) _(lj_tab_newkey) _(lj_tab_next) _(lj_tab_reasize) \
   _(lj_tab_setinth) _(lj_buf_putstr_reverse) _(lj_buf_putstr_lower) \
-  _(lj_buf_putstr_upper) _(lj_buf_tostr) JITGOTDEF(_) FFIGOTDEF(_)
+  _(lj_buf_putstr_upper) _(lj_buf_tostr) \
+  JITGOTDEF(_) FFIGOTDEF(_) SFGOTDEF(_)
 
 enum {
 #define GOTENUM(name) LJ_GOT_##name,
diff --git a/lib/luajit/src/lj_err.c b/lib/luajit/src/lj_err.c
index 2e20c2c0f8..d641735e9f 100644
--- a/lib/luajit/src/lj_err.c
+++ b/lib/luajit/src/lj_err.c
@@ -183,20 +183,13 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
 
 /* -- External frame unwinding -------------------------------------------- */
 
-#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_TARGET_WINDOWS
+#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_ABI_WIN
 
 /*
 ** We have to use our own definitions instead of the mandatory (!) unwind.h,
 ** since various OS, distros and compilers mess up the header installation.
 */
 
-typedef struct _Unwind_Exception
-{
-  uint64_t exclass;
-  void (*excleanup)(int, struct _Unwind_Exception *);
-  uintptr_t p1, p2;
-} __attribute__((__aligned__)) _Unwind_Exception;
-
 typedef struct _Unwind_Context _Unwind_Context;
 
 #define _URC_OK			0
@@ -206,8 +199,20 @@ typedef struct _Unwind_Context _Unwind_Context;
 #define _URC_CONTINUE_UNWIND	8
 #define _URC_FAILURE		9
 
+#define LJ_UEXCLASS		0x4c55414a49543200ULL	/* LUAJIT2\0 */
+#define LJ_UEXCLASS_MAKE(c)	(LJ_UEXCLASS | (uint64_t)(c))
+#define LJ_UEXCLASS_CHECK(cl)	(((cl) ^ LJ_UEXCLASS) <= 0xff)
+#define LJ_UEXCLASS_ERRCODE(cl)	((int)((cl) & 0xff))
+
 #if !LJ_TARGET_ARM
 
+typedef struct _Unwind_Exception
+{
+  uint64_t exclass;
+  void (*excleanup)(int, struct _Unwind_Exception *);
+  uintptr_t p1, p2;
+} __attribute__((__aligned__)) _Unwind_Exception;
+
 extern uintptr_t _Unwind_GetCFA(_Unwind_Context *);
 extern void _Unwind_SetGR(_Unwind_Context *, int, uintptr_t);
 extern void _Unwind_SetIP(_Unwind_Context *, uintptr_t);
@@ -219,11 +224,6 @@ extern int _Unwind_RaiseException(_Unwind_Exception *);
 #define _UA_HANDLER_FRAME	4
 #define _UA_FORCE_UNWIND	8
 
-#define LJ_UEXCLASS		0x4c55414a49543200ULL	/* LUAJIT2\0 */
-#define LJ_UEXCLASS_MAKE(c)	(LJ_UEXCLASS | (uint64_t)(c))
-#define LJ_UEXCLASS_CHECK(cl)	(((cl) ^ LJ_UEXCLASS) <= 0xff)
-#define LJ_UEXCLASS_ERRCODE(cl)	((int)((cl) & 0xff))
-
 /* DWARF2 personality handler referenced from interpreter .eh_frame. */
 LJ_FUNCA int lj_err_unwind_dwarf(int version, int actions,
   uint64_t uexclass, _Unwind_Exception *uex, _Unwind_Context *ctx)
@@ -302,10 +302,23 @@ static void err_raise_ext(int errcode)
 }
 #endif
 
-#else
+#else /* LJ_TARGET_ARM */
+
+#define _US_VIRTUAL_UNWIND_FRAME	0
+#define _US_UNWIND_FRAME_STARTING	1
+#define _US_ACTION_MASK			3
+#define _US_FORCE_UNWIND		8
+
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+typedef struct _Unwind_Context _Unwind_Context;
 
-extern void _Unwind_DeleteException(void *);
-extern int __gnu_unwind_frame (void *, _Unwind_Context *);
+struct _Unwind_Control_Block {
+  uint64_t exclass;
+  uint32_t misc[20];
+};
+
+extern int _Unwind_RaiseException(_Unwind_Control_Block *);
+extern int __gnu_unwind_frame(_Unwind_Control_Block *, _Unwind_Context *);
 extern int _Unwind_VRS_Set(_Unwind_Context *, int, uint32_t, int, void *);
 extern int _Unwind_VRS_Get(_Unwind_Context *, int, uint32_t, int, void *);
 
@@ -321,35 +334,58 @@ static inline void _Unwind_SetGR(_Unwind_Context *ctx, int r, uint32_t v)
   _Unwind_VRS_Set(ctx, 0, r, 0, &v);
 }
 
-#define _US_VIRTUAL_UNWIND_FRAME	0
-#define _US_UNWIND_FRAME_STARTING	1
-#define _US_ACTION_MASK			3
-#define _US_FORCE_UNWIND		8
+extern void lj_vm_unwind_ext(void);
 
 /* ARM unwinder personality handler referenced from interpreter .ARM.extab. */
-LJ_FUNCA int lj_err_unwind_arm(int state, void *ucb, _Unwind_Context *ctx)
+LJ_FUNCA int lj_err_unwind_arm(int state, _Unwind_Control_Block *ucb,
+			       _Unwind_Context *ctx)
 {
   void *cf = (void *)_Unwind_GetGR(ctx, 13);
   lua_State *L = cframe_L(cf);
-  if ((state & _US_ACTION_MASK) == _US_VIRTUAL_UNWIND_FRAME) {
-    setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
+  int errcode;
+
+  switch ((state & _US_ACTION_MASK)) {
+  case _US_VIRTUAL_UNWIND_FRAME:
+    if ((state & _US_FORCE_UNWIND)) break;  /* Forced: just step the frame. */
     return _URC_HANDLER_FOUND;
-  }
-  if ((state&(_US_ACTION_MASK|_US_FORCE_UNWIND)) == _US_UNWIND_FRAME_STARTING) {
-    _Unwind_DeleteException(ucb);
-    _Unwind_SetGR(ctx, 15, (uint32_t)(void *)lj_err_throw);
-    _Unwind_SetGR(ctx, 0, (uint32_t)L);
-    _Unwind_SetGR(ctx, 1, (uint32_t)LUA_ERRRUN);
+  case _US_UNWIND_FRAME_STARTING:
+    if (LJ_UEXCLASS_CHECK(ucb->exclass)) {  /* Class matches LJ_UEXCLASS. */
+      errcode = LJ_UEXCLASS_ERRCODE(ucb->exclass);  /* Low byte of class. */
+    } else {  /* Foreign exception class: report a generic C++ error. */
+      errcode = LUA_ERRRUN;
+      setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
+    }
+    cf = err_unwind(L, cf, errcode);
+    if ((state & _US_FORCE_UNWIND) || cf == NULL) break;  /* Step frame. */
+    _Unwind_SetGR(ctx, 15, (uint32_t)lj_vm_unwind_ext);  /* Resume there. */
+    _Unwind_SetGR(ctx, 0, (uint32_t)ucb);  /* r0: exception to delete. */
+    _Unwind_SetGR(ctx, 1, (uint32_t)errcode);  /* r1: error code. */
+    _Unwind_SetGR(ctx, 2, cframe_unwind_ff(cf) ?  /* r2: landing pad. */
+			    (uint32_t)lj_vm_unwind_ff_eh :
+			    (uint32_t)lj_vm_unwind_c_eh);
     return _URC_INSTALL_CONTEXT;
+  default:
+    return _URC_FAILURE;
   }
   if (__gnu_unwind_frame(ucb, ctx) != _URC_OK)
     return _URC_FAILURE;
   return _URC_CONTINUE_UNWIND;
 }
 
+#if LJ_UNWIND_EXT
+static __thread _Unwind_Control_Block static_uex;  /* One per thread. */
+
+static void err_raise_ext(int errcode)
+{
+  memset(&static_uex, 0, sizeof(static_uex));
+  static_uex.exclass = LJ_UEXCLASS_MAKE(errcode);  /* LJ_UEXCLASS | errcode. */
+  _Unwind_RaiseException(&static_uex);
+}
 #endif
 
-#elif LJ_TARGET_X64 && LJ_TARGET_WINDOWS
+#endif /* LJ_TARGET_ARM */
+
+#elif LJ_TARGET_X64 && LJ_ABI_WIN
 
 /*
 ** Someone in Redmond owes me several days of my life. A lot of this is
@@ -414,7 +450,9 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec,
     if (cf2) {  /* We catch it, so start unwinding the upper frames. */
       if (rec->ExceptionCode == LJ_MSVC_EXCODE ||
 	  rec->ExceptionCode == LJ_GCC_EXCODE) {
+#if LJ_TARGET_WINDOWS
 	__DestructExceptionObject(rec, 1);
+#endif
 	setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
       } else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) {
 	/* Don't catch access violations etc. */
diff --git a/lib/luajit/src/lj_ffrecord.c b/lib/luajit/src/lj_ffrecord.c
index 6cc05a24f7..281f017856 100644
--- a/lib/luajit/src/lj_ffrecord.c
+++ b/lib/luajit/src/lj_ffrecord.c
@@ -435,11 +435,12 @@ static void LJ_FASTCALL recff_ipairs_aux(jit_State *J, RecordFFData *rd)
 
 static void LJ_FASTCALL recff_xpairs(jit_State *J, RecordFFData *rd)
 {
-  if (!(LJ_52 && recff_metacall(J, rd, MM_ipairs))) {
-    TRef tab = J->base[0];
-    if (tref_istab(tab)) {
+  TRef tr = J->base[0];
+  if (!((LJ_52 || (LJ_HASFFI && tref_iscdata(tr))) &&
+	recff_metacall(J, rd, MM_pairs + rd->data))) {
+    if (tref_istab(tr)) {
       J->base[0] = lj_ir_kfunc(J, funcV(&J->fn->c.upvalue[0]));
-      J->base[1] = tab;
+      J->base[1] = tr;
       J->base[2] = rd->data ? lj_ir_kint(J, 0) : TREF_NIL;
       rd->nres = 3;
     }  /* else: Interpreter will throw. */
diff --git a/lib/luajit/src/lj_frame.h b/lib/luajit/src/lj_frame.h
index a86c36be7e..aa3ab20bbf 100644
--- a/lib/luajit/src/lj_frame.h
+++ b/lib/luajit/src/lj_frame.h
@@ -218,6 +218,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK };  /* Special continuations. */
 #define CFRAME_SHIFT_MULTRES	3
 #endif
 #elif LJ_TARGET_MIPS
+#if LJ_ARCH_HASFPU
 #define CFRAME_OFS_ERRF		124
 #define CFRAME_OFS_NRES		120
 #define CFRAME_OFS_PREV		116
@@ -227,6 +228,16 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK };  /* Special continuations. */
 #define CFRAME_SIZE		112
 #define CFRAME_SHIFT_MULTRES	3
 #else
+#define CFRAME_OFS_ERRF		100
+#define CFRAME_OFS_NRES		96
+#define CFRAME_OFS_PREV		92
+#define CFRAME_OFS_L		88
+#define CFRAME_OFS_PC		44
+#define CFRAME_OFS_MULTRES	16
+#define CFRAME_SIZE		88
+#define CFRAME_SHIFT_MULTRES	3
+#endif
+#else
 #error "Missing CFRAME_* definitions for this architecture"
 #endif
 
diff --git a/lib/luajit/src/lj_ircall.h b/lib/luajit/src/lj_ircall.h
index 84e41ecfcc..1f44b03d67 100644
--- a/lib/luajit/src/lj_ircall.h
+++ b/lib/luajit/src/lj_ircall.h
@@ -270,6 +270,22 @@ LJ_DATA const CCallInfo lj_ir_callinfo[IRCALL__MAX+1];
 #define fp64_f2l __aeabi_f2lz
 #define fp64_f2ul __aeabi_f2ulz
 #endif
+#elif LJ_TARGET_MIPS
+#define softfp_add __adddf3
+#define softfp_sub __subdf3
+#define softfp_mul __muldf3
+#define softfp_div __divdf3
+#define softfp_cmp __ledf2
+#define softfp_i2d __floatsidf
+#define softfp_d2i __fixdfsi
+#define softfp_ui2d __floatunsidf
+#define softfp_f2d __extendsfdf2
+#define softfp_d2ui __fixunsdfsi
+#define softfp_d2f __truncdfsf2
+#define softfp_i2f __floatsisf
+#define softfp_ui2f __floatunsisf
+#define softfp_f2i __fixsfsi
+#define softfp_f2ui __fixunssfsi
 #else
 #error "Missing soft-float definitions for target architecture"
 #endif
diff --git a/lib/luajit/src/lj_opt_split.c b/lib/luajit/src/lj_opt_split.c
index 81ded6c0a0..4652c73786 100644
--- a/lib/luajit/src/lj_opt_split.c
+++ b/lib/luajit/src/lj_opt_split.c
@@ -596,7 +596,8 @@ static void split_ir(jit_State *J)
 	}
 #endif
 	else if (st == IRT_I64 || st == IRT_U64) {  /* 64/64 bit cast. */
-	  /* Drop cast, since assembler doesn't care. */
+	  /* Drop cast, since assembler doesn't care. But fwd both parts. */
+	  hi = hiref;
 	  goto fwdlo;
 	} else if ((ir->op2 & IRCONV_SEXT)) {  /* Sign-extend to 64 bit. */
 	  IRRef k31 = lj_ir_kint(J, 31);
diff --git a/lib/luajit/src/lj_profile.c b/lib/luajit/src/lj_profile.c
index f4d6fe18de..c7e53963b5 100644
--- a/lib/luajit/src/lj_profile.c
+++ b/lib/luajit/src/lj_profile.c
@@ -5,7 +5,6 @@
 
 #define lj_profile_c
 #define LUA_CORE
-#define _GNU_SOURCE 1
 
 #include "lj_obj.h"
 
@@ -30,17 +29,6 @@
 #define profile_lock(ps)	UNUSED(ps)
 #define profile_unlock(ps)	UNUSED(ps)
 
-#if 1
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <linux/perf_event.h>
-#include <sys/prctl.h>
-#endif
-
-
 #elif LJ_PROFILE_PTHREAD
 
 #include <pthread.h>
@@ -74,8 +62,6 @@ typedef struct ProfileState {
   SBuf sb;			/* String buffer for stack dumps. */
   int interval;			/* Sample interval in milliseconds. */
   int samples;			/* Number of samples for next callback. */
-  char *flavour;		/* What generates profiling events. */
-  int perf_event_fd;		/* Performace event file descriptor */
   int vmstate;			/* VM state when profile timer triggered. */
 #if LJ_PROFILE_SIGPROF
   struct sigaction oldsa;	/* Previous SIGPROF state. */
@@ -169,7 +155,7 @@ static void profile_trigger(ProfileState *ps)
   mask = g->hookmask;
   if (!(mask & (HOOK_PROFILE|HOOK_VMEVENT))) {  /* Set profile hook. */
     int st = g->vmstate;
-    ps->vmstate = st >= 0 ? 256+st :
+    ps->vmstate = st >= 0 ? 'N' :
 		  st == ~LJ_VMST_INTERP ? 'I' :
 		  st == ~LJ_VMST_C ? 'C' :
 		  st == ~LJ_VMST_GC ? 'G' : 'J';
@@ -190,178 +176,29 @@ static void profile_signal(int sig)
   profile_trigger(&profile_state);
 }
 
-
-static int perf_event_open(struct perf_event_attr *attr,
-			   pid_t pid, int cpu, int group_fd,
-			   unsigned long flags)
-{
-  return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
-}
-
-
-static void register_prof_events(ProfileState *ps)
-{
-  struct flavour_t {
-    char *name; uint32_t type; uint64_t config;
-  };
-
-  static struct flavour_t flavours[] =
-      {
-	{ "sw-cpu-clock",
-	  PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK },
-
-	{ "sw-context-switches",
-	  PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES },
-
-	{ "sw-page-faults",
-	  PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS },
-
-	{ "sw-minor-page-faults",
-	  PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN },
-
-	{ "sw-major-page-faults",
-	  PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ },
-
-	{ "branch-instructions",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
-
-	{ "cpu-cycles",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES },
-
-	{ "instructions",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS },
-
-	{ "cache-references",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES },
-
-	{ "cache-misses",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES },
-
-	{ "branch-instructions",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
-
-	{ "branch-misses",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES },
-
-	{ "bus-cycles",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES },
-
-	{ "stalled-cycles-frontend",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
-
-	{ "stalled-cycles-backend",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
-
-	{ "cpu-cycles",
-	  PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES },
-
-	{ 0, 0, 0 }
-  };
-
-
-  struct perf_event_attr attr = { };
-
-  memset(&attr, 0, sizeof(struct perf_event_attr));
-
-  const struct flavour_t *f;
-  for (f = flavours; f->name != 0; f++)
-    {
-      if (strcmp (ps->flavour, f->name) == 0)
-	{
-	  attr.type = f->type;
-	  attr.config = f->config;
-	  break;
-	}
-    }
-
-  if (strcmp (ps->flavour, "?") == 0)
-    {
-      const struct flavour_t *f;
-      fprintf (stderr, "I know: ");
-      for (f = flavours; f->name != 0; f++)
-	fprintf (stderr, "%s ", f->name);
-      fprintf(stderr, "\n");
-    }
-  else if (! f->name)
-    {
-      fprintf (stderr, "unknown profiling flavour `%s', S[?] to list\n", ps->flavour);
-    }
-
-  attr.size = sizeof(struct perf_event_attr);
-  attr.sample_type = PERF_SAMPLE_IP;
-  /* attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; */
-  attr.disabled=1;
-  attr.pinned=1;
-  attr.exclude_kernel=1;
-  attr.exclude_hv=1;
-
-  attr.sample_period = ps->interval;
-  /* attr.watermark=0; */
-  /* attr.wakeup_events=1; */
-  
-  int fd = perf_event_open(&attr, 0, -1, -1, 0);
-  if (fd == -1)
-    {
-      printf ("! perf_event_open %m\n");
-    }
-
-  ps->perf_event_fd = fd;
-
-  fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
-  fcntl(fd, F_SETSIG, SIGPROF);
-  fcntl(fd, F_SETOWN, getpid());
-
-  ioctl(fd, PERF_EVENT_IOC_RESET, 0);
-
-  int err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
-  if (err != 0)
-    printf ("! perf_events enable\n");
-}
-
-
-
 /* Start profiling timer. */
 static void profile_timer_start(ProfileState *ps)
 {
-  struct sigaction sa = {
-    .sa_flags = SA_RESTART,
-    .sa_handler = profile_signal
-  };
-
+  int interval = ps->interval;  /* Sample interval in milliseconds. */
+  struct itimerval tm;
+  struct sigaction sa;
+  sa.sa_flags = SA_RESTART;
+  sa.sa_handler = profile_signal;
   sigemptyset(&sa.sa_mask);
   sigaction(SIGPROF, &sa, &ps->oldsa);
-
-  if (strcmp(ps->flavour, "vanilla") == 0)
-    {
-      int interval = ps->interval;
-      struct itimerval tm;
-      tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000;
-      tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000;
-      setitimer(ITIMER_PROF, &tm, NULL);
-    }
-  else
-    {
-      register_prof_events(ps);
-    }
+  tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000;
+  tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000;
+  setitimer(ITIMER_PROF, &tm, NULL);  /* Arm only after handler is installed. */
 }
 
-
-
 /* Stop profiling timer. */
 static void profile_timer_stop(ProfileState *ps)
 {
-  if (ps->perf_event_fd)
-    {
-      ioctl(ps->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0);
-    }
-  else
-    {
-      struct itimerval tm;
-      tm.it_value.tv_sec = tm.it_interval.tv_sec = 0;
-      tm.it_value.tv_usec = tm.it_interval.tv_usec = 0;
-      setitimer(ITIMER_PROF, &tm, NULL);
-      sigaction(SIGPROF, &ps->oldsa, NULL);
-    }
+  struct itimerval tm;
+  tm.it_value.tv_sec = tm.it_interval.tv_sec = 0;
+  tm.it_value.tv_usec = tm.it_interval.tv_usec = 0;
+  setitimer(ITIMER_PROF, &tm, NULL);
+  sigaction(SIGPROF, &ps->oldsa, NULL);
 }
 
 #elif LJ_PROFILE_PTHREAD
@@ -463,8 +300,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
 {
   ProfileState *ps = &profile_state;
   int interval = LJ_PROFILE_INTERVAL_DEFAULT;
-  char *flavour;
-
   while (*mode) {
     int m = *mode++;
     switch (m) {
@@ -480,13 +315,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
       lj_trace_flushall(L);
       break;
 #endif
-    case 'S':
-      {
-	int k;
-	if (sscanf (mode, "[%m[^]]]%n", &flavour, &k) > 0)
-	  mode += k;
-      }
-
     default:  /* Ignore unknown mode chars. */
       break;
     }
@@ -500,7 +328,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
   ps->cb = cb;
   ps->data = data;
   ps->samples = 0;
-  ps->flavour = flavour;
   lj_buf_init(L, &ps->sb);
   profile_timer_start(ps);
 }
diff --git a/lib/luajit/src/lj_snap.c b/lib/luajit/src/lj_snap.c
index fa9abb7475..62515ed0f6 100644
--- a/lib/luajit/src/lj_snap.c
+++ b/lib/luajit/src/lj_snap.c
@@ -26,9 +26,6 @@
 #include "lj_cdata.h"
 #endif
 
-/* Some local macros to save typing. Undef'd at the end. */
-#define IR(ref)		(&J->cur.ir[(ref)])
-
 /* Pass IR on to next optimization in chain (FOLD). */
 #define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
 
@@ -73,7 +70,7 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots)
     IRRef ref = tref_ref(tr);
     if (ref) {
       SnapEntry sn = SNAP_TR(s, tr);
-      IRIns *ir = IR(ref);
+      IRIns *ir = &J->cur.ir[ref];
       if (!(sn & (SNAP_CONT|SNAP_FRAME)) &&
 	  ir->o == IR_SLOAD && ir->op1 == s && ref > retf) {
 	/* No need to snapshot unmodified non-inherited slots. */
@@ -407,24 +404,24 @@ static TRef snap_pref(jit_State *J, GCtrace *T, SnapEntry *map, MSize nmax,
 }
 
 /* Check whether a sunk store corresponds to an allocation. Slow path. */
-static int snap_sunk_store2(jit_State *J, IRIns *ira, IRIns *irs)
+static int snap_sunk_store2(GCtrace *T, IRIns *ira, IRIns *irs)
 {
   if (irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
       irs->o == IR_FSTORE || irs->o == IR_XSTORE) {
-    IRIns *irk = IR(irs->op1);
+    IRIns *irk = &T->ir[irs->op1];
     if (irk->o == IR_AREF || irk->o == IR_HREFK)
-      irk = IR(irk->op1);
-    return (IR(irk->op1) == ira);
+      irk = &T->ir[irk->op1];
+    return (&T->ir[irk->op1] == ira);
   }
   return 0;
 }
 
 /* Check whether a sunk store corresponds to an allocation. Fast path. */
-static LJ_AINLINE int snap_sunk_store(jit_State *J, IRIns *ira, IRIns *irs)
+static LJ_AINLINE int snap_sunk_store(GCtrace *T, IRIns *ira, IRIns *irs)
 {
   if (irs->s != 255)
     return (ira + irs->s == irs);  /* Fast check. */
-  return snap_sunk_store2(J, ira, irs);
+  return snap_sunk_store2(T, ira, irs);
 }
 
 /* Replay snapshot state to setup side trace. */
@@ -487,7 +484,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
 	} else {
 	  IRIns *irs;
 	  for (irs = ir+1; irs < irlast; irs++)
-	    if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+	    if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
 	      if (snap_pref(J, T, map, nent, seen, irs->op2) == 0)
 		snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1);
 	      else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) &&
@@ -521,13 +518,13 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
 	    op2 = emitir_raw(IRT(IR_HIOP, IRT_I64), op2,
 			     snap_pref(J, T, map, nent, seen, (ir+1)->op2));
 	  }
-	  J->slot[snap_slot(sn)] = emitir(ir->ot, op1, op2);
+	  J->slot[snap_slot(sn)] = emitir(ir->ot & ~(IRT_MARK|IRT_ISPHI), op1, op2);
 	} else {
 	  IRIns *irs;
 	  TRef tr = emitir(ir->ot, op1, op2);
 	  J->slot[snap_slot(sn)] = tr;
 	  for (irs = ir+1; irs < irlast; irs++)
-	    if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+	    if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
 	      IRIns *irr = &T->ir[irs->op1];
 	      TRef val, key = irr->op2, tmp = tr;
 	      if (irr->o != IR_FREF) {
@@ -714,8 +711,9 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
   if (ir->o == IR_CNEW || ir->o == IR_CNEWI) {
     CTState *cts = ctype_cts(J->L);
     CTypeID id = (CTypeID)T->ir[ir->op1].i;
-    CTSize sz = lj_ctype_size(cts, id);
-    GCcdata *cd = lj_cdata_new(cts, id, sz);
+    CTSize sz;
+    CTInfo info = lj_ctype_info(cts, id, &sz);
+    GCcdata *cd = lj_cdata_newx(cts, id, sz, info);
     setcdataV(J->L, o, cd);
     if (ir->o == IR_CNEWI) {
       uint8_t *p = (uint8_t *)cdataptr(cd);
@@ -729,7 +727,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
     } else {
       IRIns *irs, *irlast = &T->ir[T->snap[snapno].ref];
       for (irs = ir+1; irs < irlast; irs++)
-	if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+	if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
 	  IRIns *iro = &T->ir[T->ir[irs->op1].op2];
 	  uint8_t *p = (uint8_t *)cd;
 	  CTSize szs;
@@ -762,7 +760,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
     settabV(J->L, o, t);
     irlast = &T->ir[T->snap[snapno].ref];
     for (irs = ir+1; irs < irlast; irs++)
-      if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+      if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
 	IRIns *irk = &T->ir[irs->op1];
 	TValue tmp, *val;
 	lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
@@ -863,7 +861,6 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
   return pc;
 }
 
-#undef IR
 #undef emitir_raw
 #undef emitir
 
diff --git a/lib/luajit/src/lj_vm.h b/lib/luajit/src/lj_vm.h
index b31e22f70f..cb76d7a700 100644
--- a/lib/luajit/src/lj_vm.h
+++ b/lib/luajit/src/lj_vm.h
@@ -50,7 +50,7 @@ LJ_ASMF void lj_vm_exit_handler(void);
 LJ_ASMF void lj_vm_exit_interp(void);
 
 /* Internal math helper functions. */
-#if LJ_TARGET_PPC || LJ_TARGET_ARM64
+#if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
 #define lj_vm_floor	floor
 #define lj_vm_ceil	ceil
 #else
diff --git a/lib/luajit/src/lua.h b/lib/luajit/src/lua.h
index c83fd3bbe7..352d29f3cd 100644
--- a/lib/luajit/src/lua.h
+++ b/lib/luajit/src/lua.h
@@ -226,6 +226,7 @@ LUA_API int  (lua_status) (lua_State *L);
 #define LUA_GCSTEP		5
 #define LUA_GCSETPAUSE		6
 #define LUA_GCSETSTEPMUL	7
+#define LUA_GCISRUNNING		9
 
 LUA_API int (lua_gc) (lua_State *L, int what, int data);
 
diff --git a/lib/luajit/src/vm_arm.dasc b/lib/luajit/src/vm_arm.dasc
index af722f9eac..acc0853bb7 100644
--- a/lib/luajit/src/vm_arm.dasc
+++ b/lib/luajit/src/vm_arm.dasc
@@ -372,6 +372,17 @@ static void build_subroutines(BuildCtx *ctx)
   |    str CARG1, [BASE, #-4]		// Prepend false to error message.
   |   st_vmstate CARG2
   |  b ->vm_returnc
+  |
+  |->vm_unwind_ext:			// Complete external unwind.
+#if !LJ_NO_UNWIND
+  |  push {r0, r1, r2, lr}
+  |  bl extern _Unwind_Complete
+  |  ldr r0, [sp]
+  |  bl extern _Unwind_DeleteException
+  |  pop {r0, r1, r2, lr}
+  |  mov r0, r1
+  |  bx r2
+#endif
   |
   |//-----------------------------------------------------------------------
   |//-- Grow stack for calls -----------------------------------------------
diff --git a/lib/luajit/src/vm_mips.dasc b/lib/luajit/src/vm_mips.dasc
index 134ed569e8..0dba129316 100644
--- a/lib/luajit/src/vm_mips.dasc
+++ b/lib/luajit/src/vm_mips.dasc
@@ -1,6 +1,9 @@
 |// Low-level VM code for MIPS CPUs.
 |// Bytecode interpreter, fast functions and helper functions.
 |// Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
+|//
+|// MIPS soft-float support contributed by Djordje Kovacevic and
+|// Stefan Pejic from RT-RK.com, sponsored by Cisco Systems, Inc.
 |
 |.arch mips
 |.section code_op, code_sub
@@ -18,6 +21,12 @@
 |// Fixed register assignments for the interpreter.
 |// Don't use: r0 = 0, r26/r27 = reserved, r28 = gp, r29 = sp, r31 = ra
 |
+|.macro .FPU, a, b
+|.if FPU
+|  a, b
+|.endif
+|.endmacro
+|
 |// The following must be C callee-save (but BASE is often refetched).
 |.define BASE,		r16	// Base of current Lua stack frame.
 |.define KBASE,		r17	// Constants of current Lua function.
@@ -31,7 +40,9 @@
 |
 |// Constants for type-comparisons, stores and conversions. C callee-save.
 |.define TISNIL,	r30
+|.if FPU
 |.define TOBIT,		f30	// 2^52 + 2^51.
+|.endif
 |
 |// The following temporaries are not saved across C calls, except for RA.
 |.define RA,		r23	// Callee-save.
@@ -46,6 +57,13 @@
 |.define TMP2,		r14
 |.define TMP3,		r15
 |
+|.if not FPU
+|.define SFT1,		r2
+|.define SFT2,		r3
+|.define SFT3,		r4
+|.define SFT4,		r5
+|.endif
+|
 |// Calling conventions.
 |.define CFUNCADDR,	r25
 |.define CARG1,		r4
@@ -56,13 +74,16 @@
 |.define CRET1,		r2
 |.define CRET2,		r3
 |
+|.if FPU
 |.define FARG1,		f12
 |.define FARG2,		f14
 |
 |.define FRET1,		f0
 |.define FRET2,		f2
+|.endif
 |
 |// Stack layout while in interpreter. Must match with lj_frame.h.
+|.if FPU		// MIPS32 hard-float.
 |.define CFRAME_SPACE,	112	// Delta for sp.
 |
 |.define SAVE_ERRF,	124(sp)	// 32 bit C frame info.
@@ -83,43 +104,76 @@
 |.define ARG5_OFS,	16
 |.define SAVE_MULTRES,	ARG5
 |
+|//-----------------------------------------------------------------------
+|.else				// MIPS32 soft-float.
+|
+|.define CFRAME_SPACE,	88	// Delta for sp.
+|
+|.define SAVE_ERRF,	100(sp)	// 32 bit C frame info.
+|.define SAVE_NRES,	96(sp)
+|.define SAVE_CFRAME,	92(sp)
+|.define SAVE_L,	88(sp)
+|//----- 8 byte aligned, ^^^^ 16 byte register save area, owned by interpreter.
+|.define SAVE_GPR_,	48	// .. 48+10*4: 32 bit GPR saves.
+|.define SAVE_PC,	44(sp)
+|.define TEMP_SAVE_6,	40(sp)
+|.define TEMP_SAVE_5,	36(sp)
+|.define TEMP_SAVE_4,	32(sp)
+|.define TEMP_SAVE_3,	28(sp)
+|.define TEMP_SAVE_2,	24(sp)
+|.define TEMP_SAVE_1,	20(sp)
+|//----- ^^^^ 24 byte scratch area (TEMP_SAVE_1..6), starts at 20(sp): 4 byte aligned.
+|.define ARG5,		16(sp)
+|.define CSAVE_4,	12(sp)
+|.define CSAVE_3,	8(sp)
+|.define CSAVE_2,	4(sp)
+|.define CSAVE_1,	0(sp)
+|//----- 8 byte aligned, ^^^^ 16 byte register save area, owned by callee.
+|
+|.define ARG5_OFS,	16
+|.define SAVE_MULTRES,	ARG5
+|
+|.endif
+|
+|//-----------------------------------------------------------------------
+|
 |.macro saveregs
 |  addiu sp, sp, -CFRAME_SPACE
 |  sw ra, SAVE_GPR_+9*4(sp)
 |  sw r30, SAVE_GPR_+8*4(sp)
-|   sdc1 f30, SAVE_FPR_+5*8(sp)
+|   .FPU sdc1 f30, SAVE_FPR_+5*8(sp)
 |  sw r23, SAVE_GPR_+7*4(sp)
 |  sw r22, SAVE_GPR_+6*4(sp)
-|   sdc1 f28, SAVE_FPR_+4*8(sp)
+|   .FPU sdc1 f28, SAVE_FPR_+4*8(sp)
 |  sw r21, SAVE_GPR_+5*4(sp)
 |  sw r20, SAVE_GPR_+4*4(sp)
-|   sdc1 f26, SAVE_FPR_+3*8(sp)
+|   .FPU sdc1 f26, SAVE_FPR_+3*8(sp)
 |  sw r19, SAVE_GPR_+3*4(sp)
 |  sw r18, SAVE_GPR_+2*4(sp)
-|   sdc1 f24, SAVE_FPR_+2*8(sp)
+|   .FPU sdc1 f24, SAVE_FPR_+2*8(sp)
 |  sw r17, SAVE_GPR_+1*4(sp)
 |  sw r16, SAVE_GPR_+0*4(sp)
-|   sdc1 f22, SAVE_FPR_+1*8(sp)
-|   sdc1 f20, SAVE_FPR_+0*8(sp)
+|   .FPU sdc1 f22, SAVE_FPR_+1*8(sp)
+|   .FPU sdc1 f20, SAVE_FPR_+0*8(sp)
 |.endmacro
 |
 |.macro restoreregs_ret
 |  lw ra, SAVE_GPR_+9*4(sp)
 |  lw r30, SAVE_GPR_+8*4(sp)
-|   ldc1 f30, SAVE_FPR_+5*8(sp)
+|   .FPU ldc1 f30, SAVE_FPR_+5*8(sp)
 |  lw r23, SAVE_GPR_+7*4(sp)
 |  lw r22, SAVE_GPR_+6*4(sp)
-|   ldc1 f28, SAVE_FPR_+4*8(sp)
+|   .FPU ldc1 f28, SAVE_FPR_+4*8(sp)
 |  lw r21, SAVE_GPR_+5*4(sp)
 |  lw r20, SAVE_GPR_+4*4(sp)
-|   ldc1 f26, SAVE_FPR_+3*8(sp)
+|   .FPU ldc1 f26, SAVE_FPR_+3*8(sp)
 |  lw r19, SAVE_GPR_+3*4(sp)
 |  lw r18, SAVE_GPR_+2*4(sp)
-|   ldc1 f24, SAVE_FPR_+2*8(sp)
+|   .FPU ldc1 f24, SAVE_FPR_+2*8(sp)
 |  lw r17, SAVE_GPR_+1*4(sp)
 |  lw r16, SAVE_GPR_+0*4(sp)
-|   ldc1 f22, SAVE_FPR_+1*8(sp)
-|   ldc1 f20, SAVE_FPR_+0*8(sp)
+|   .FPU ldc1 f22, SAVE_FPR_+1*8(sp)
+|   .FPU ldc1 f20, SAVE_FPR_+0*8(sp)
 |  jr ra
 |  addiu sp, sp, CFRAME_SPACE
 |.endmacro
@@ -270,6 +324,61 @@
 |.macro call_extern; jalr CFUNCADDR; .endmacro
 |.macro jmp_extern; jr CFUNCADDR; .endmacro
 |
+|// Converts int from given reg to double, result in CRET1 and CRET2 regs.
+|.if not FPU
+|.macro cvti2d, arg
+|   load_got __floatsidf
+|   call_extern
+|.  move CARG1, arg
+|.endmacro
+|.endif
+|
+|// Loads a double-word FP value into fpr (FPU) or gpr1/gpr2 (soft-float). gpr1 is written first: it must not be the base of src.
+|.macro load_double, fpr, gpr1, gpr2, src
+|.if FPU
+|  ldc1 fpr, src
+|.else
+|  lw gpr1, src
+|  lw gpr2, 4+src
+|.endif
+|.endmacro
+|
+|// Stores a double-word floating-point value.
+|.macro store_double, fpr, gpr1, gpr2, dst
+|.if FPU
+|  sdc1 fpr, dst
+|.else
+|  sw gpr1, dst
+|  sw gpr2, 4+dst
+|.endif
+|.endmacro
+|
+|// Loads the first double-word floating-point argument.
+|.macro load_farg1, src
+|  load_double FARG1, CARG1, CARG2, src
+|.endmacro
+|
+|// Loads the second double-word floating-point argument.
+|.macro load_farg2, src
+|  load_double FARG2, CARG3, CARG4, src
+|.endmacro
+|
+|.macro load_double1, src
+|  load_double f0, SFT1, SFT2, src
+|.endmacro
+|
+|.macro store_double1, dst
+|  store_double f0, SFT1, SFT2, dst
+|.endmacro
+|
+|.macro load_double2, src
+|  load_double f2, SFT3, SFT4, src
+|.endmacro
+|
+|.macro store_double2, dst
+|  store_double f2, SFT3, SFT4, dst
+|.endmacro
+|
 |.macro hotcheck, delta, target
 |  srl TMP1, PC, 1
 |  andi TMP1, TMP1, 126
@@ -354,9 +463,9 @@ static void build_subroutines(BuildCtx *ctx)
   |.   sll TMP2, TMP2, 3
   |1:
   |  addiu TMP1, TMP1, -8
-  |   ldc1 f0, 0(RA)
+  |   load_double1 0(RA)
   |    addiu RA, RA, 8
-  |   sdc1 f0, 0(BASE)
+  |   store_double1 0(BASE)
   |  bnez TMP1, <1
   |.  addiu BASE, BASE, 8
   |
@@ -425,15 +534,15 @@ static void build_subroutines(BuildCtx *ctx)
   |  and sp, CARG1, AT
   |->vm_unwind_ff_eh:			// Landing pad for external unwinder.
   |  lw L, SAVE_L
-  |     lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
+  |     .FPU lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
   |     li TISNIL, LJ_TNIL
   |  lw BASE, L->base
   |   lw DISPATCH, L->glref		// Setup pointer to dispatch table.
-  |     mtc1 TMP3, TOBIT
+  |     .FPU mtc1 TMP3, TOBIT
   |  li TMP1, LJ_TFALSE
   |    li_vmstate INTERP
   |  lw PC, FRAME_PC(BASE)		// Fetch PC of previous frame.
-  |     cvt.d.s TOBIT, TOBIT
+  |     .FPU cvt.d.s TOBIT, TOBIT
   |  addiu RA, BASE, -8			// Results start at BASE-8.
   |   addiu DISPATCH, DISPATCH, GG_G2DISP
   |  sw TMP1, HI(RA)			// Prepend false to error message.
@@ -498,11 +607,11 @@ static void build_subroutines(BuildCtx *ctx)
   |   lw BASE, L->base
   |   lw TMP1, L->top
   |  lw PC, FRAME_PC(BASE)
-  |     lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
+  |     .FPU  lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
   |   subu RD, TMP1, BASE
-  |     mtc1 TMP3, TOBIT
+  |     .FPU  mtc1 TMP3, TOBIT
   |    sb r0, L->status
-  |     cvt.d.s TOBIT, TOBIT
+  |     .FPU  cvt.d.s TOBIT, TOBIT
   |    li_vmstate INTERP
   |   addiu RD, RD, 8
   |    st_vmstate
@@ -540,13 +649,13 @@ static void build_subroutines(BuildCtx *ctx)
   |3:  // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype).
   |  sw L, DISPATCH_GL(cur_L)(DISPATCH)
   |  lw TMP2, L->base			// TMP2 = old base (used in vmeta_call).
-  |     lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
+  |     .FPU lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
   |   lw TMP1, L->top
-  |     mtc1 TMP3, TOBIT
+  |     .FPU mtc1 TMP3, TOBIT
   |  addu PC, PC, BASE
   |   subu NARGS8:RC, TMP1, BASE
   |  subu PC, PC, TMP2			// PC = frame delta + frame type
-  |     cvt.d.s TOBIT, TOBIT
+  |     .FPU cvt.d.s TOBIT, TOBIT
   |    li_vmstate INTERP
   |     li TISNIL, LJ_TNIL
   |    st_vmstate
@@ -628,7 +737,7 @@ static void build_subroutines(BuildCtx *ctx)
   |->cont_cat:				// RA = resultptr, RB = meta base
   |  lw INS, -4(PC)
   |   addiu CARG2, RB, -16
-  |   ldc1 f0, 0(RA)
+  |   load_double1 0(RA)
   |  decode_RB8a MULTRES, INS
   |   decode_RA8a RA, INS
   |  decode_RB8b MULTRES
@@ -636,11 +745,21 @@ static void build_subroutines(BuildCtx *ctx)
   |  addu TMP1, BASE, MULTRES
   |   sw BASE, L->base
   |   subu CARG3, CARG2, TMP1
+  |.if FPU
   |  bne TMP1, CARG2, ->BC_CAT_Z
   |.  sdc1 f0, 0(CARG2)
   |  addu RA, BASE, RA
   |  b ->cont_nop
   |.  sdc1 f0, 0(RA)
+  |.else
+  |  sw SFT1, 0(CARG2)
+  |  bne TMP1, CARG2, ->BC_CAT_Z
+  |.  sw SFT2, 4(CARG2)
+  |  addu RA, BASE, RA
+  |  sw SFT1, 0(RA)
+  |  b ->cont_nop
+  |.  sw SFT2, 4(RA)
+  |.endif
   |
   |//-- Table indexing metamethods -----------------------------------------
   |
@@ -663,10 +782,19 @@ static void build_subroutines(BuildCtx *ctx)
   |.  sw TMP1, HI(CARG3)
   |
   |->vmeta_tgetb:			// TMP0 = index
+  |.if FPU
   |  mtc1 TMP0, f0
   |  cvt.d.w f0, f0
   |  addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
   |  sdc1 f0, 0(CARG3)
+  |.else
+  |  sw CARG2, TEMP_SAVE_1		// Preserve CARG2 across the cvti2d call; lj_meta_tget needs it.
+  |  cvti2d TMP0			// CRET1/CRET2 = (double)TMP0 via __floatsidf.
+  |  addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
+  |  sw CRET1, 0(CARG3)
+  |  sw CRET2, 4(CARG3)
+  |  lw CARG2, TEMP_SAVE_1
+  |.endif
   |
   |->vmeta_tgetv:
   |1:
@@ -678,9 +806,9 @@ static void build_subroutines(BuildCtx *ctx)
   |  // Returns TValue * (finished) or NULL (metamethod).
   |  beqz CRET1, >3
   |.  addiu TMP1, BASE, -FRAME_CONT
-  |  ldc1 f0, 0(CRET1)
+  |  load_double2 0(CRET1)
   |  ins_next1
-  |   sdc1 f0, 0(RA)
+  |   store_double2 0(RA)
   |  ins_next2
   |
   |3:  // Call __index metamethod.
@@ -699,8 +827,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  // Returns cTValue * or NULL.
   |  beqz CRET1, >1
   |.  nop
+  |.if FPU
   |  b ->BC_TGETR_Z
   |.  ldc1 f0, 0(CRET1)
+  |.else
+  |  lw SFT2, 4(CRET1)		// Load SFT2 first: SFT1 aliases CRET1 (both r2).
+  |  b ->BC_TGETR_Z
+  |.  lw SFT1, 0(CRET1)		// Overwrites the base register, so it must come last.
+  |.endif
   |
   |//-----------------------------------------------------------------------
   |
@@ -723,10 +857,19 @@ static void build_subroutines(BuildCtx *ctx)
   |.  sw TMP1, HI(CARG3)
   |
   |->vmeta_tsetb:			// TMP0 = index
+  |.if  FPU
   |  mtc1 TMP0, f0
   |  cvt.d.w f0, f0
   |  addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
   |  sdc1 f0, 0(CARG3)
+  |.else
+  |  sw CARG2, TEMP_SAVE_1		// Preserve CARG2 across the cvti2d call.
+  |  cvti2d TMP0			// CRET1/CRET2 = (double)TMP0 via __floatsidf.
+  |  addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
+  |  sw CRET1, 0(CARG3)
+  |  sw CRET2, 4(CARG3)
+  |  lw CARG2, TEMP_SAVE_1
+  |.endif
   |
   |->vmeta_tsetv:
   |1:
@@ -736,11 +879,17 @@ static void build_subroutines(BuildCtx *ctx)
   |  call_intern lj_meta_tset		// (lua_State *L, TValue *o, TValue *k)
   |.  move CARG1, L
   |  // Returns TValue * (finished) or NULL (metamethod).
+  |.if FPU
   |  beqz CRET1, >3
-  |.  ldc1 f0, 0(RA)
+  |.  ldc1 f2, 0(RA)
+  |.else
+  |  lw SFT3, 0(RA)
+  |  beqz CRET1, >3
+  |.  lw SFT4, 4(RA)
+  |.endif
   |  // NOBARRIER: lj_meta_tset ensures the table is not black.
   |  ins_next1
-  |   sdc1 f0, 0(CRET1)
+  |   store_double2 0(CRET1)
   |  ins_next2
   |
   |3:  // Call __newindex metamethod.
@@ -750,7 +899,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  sw PC, -16+HI(BASE)		// [cont|PC]
   |   subu PC, BASE, TMP1
   |  lw LFUNC:RB, FRAME_FUNC(BASE)	// Guaranteed to be a function here.
-  |  sdc1 f0, 16(BASE)			// Copy value to third argument.
+  |  store_double2 16(BASE)			// Copy value to third argument.
   |  b ->vm_call_dispatch_f
   |.  li NARGS8:RC, 24			// 3 args for func(t, k, v)
   |
@@ -793,11 +942,17 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |->cont_ra:				// RA = resultptr
   |  lbu TMP1, -4+OFS_RA(PC)
-  |   ldc1 f0, 0(RA)
+  |   load_double1 0(RA)
   |  sll TMP1, TMP1, 3
   |  addu TMP1, BASE, TMP1
+  |.if FPU
   |  b ->cont_nop
   |.  sdc1 f0, 0(TMP1)
+  |.else
+  |   sw SFT1, 0(TMP1)
+  |  b ->cont_nop
+  |.  sw SFT2, 4(TMP1)
+  |.endif
   |
   |->cont_condt:			// RA = resultptr
   |  lw TMP0, HI(RA)
@@ -852,7 +1007,22 @@ static void build_subroutines(BuildCtx *ctx)
   |//-- Arithmetic metamethods ---------------------------------------------
   |
   |->vmeta_unm:
-  |  move CARG4, CARG3
+  |  b ->vmeta_arith
+  |.  move CARG4, CARG3
+  |
+  |->vmeta_arith_vn:
+  |  addu CARG3, BASE, RB
+  |  b ->vmeta_arith
+  |.  addu CARG4, KBASE, RC
+  |
+  |->vmeta_arith_nv:
+  |  addu CARG4, BASE, RB
+  |  b ->vmeta_arith
+  |.  addu CARG3, KBASE, RC
+  |
+  |->vmeta_arith_vv:
+  |  addu CARG3, BASE, RB
+  |   addu CARG4, BASE, RC
   |
   |->vmeta_arith:
   |  load_got lj_meta_arith
@@ -985,9 +1155,9 @@ static void build_subroutines(BuildCtx *ctx)
   |.macro .ffunc_n, name	// Caveat: has delay slot!
   |->ff_ .. name:
   |  lw CARG3, HI(BASE)
+  |   load_farg1 0(BASE)
   |  beqz NARGS8:RC, ->fff_fallback
-  |.  ldc1 FARG1, 0(BASE)
-  |  sltiu AT, CARG3, LJ_TISNUM
+  |.  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
   |.endmacro
   |
@@ -997,10 +1167,10 @@ static void build_subroutines(BuildCtx *ctx)
   |   lw CARG3, HI(BASE)
   |  bnez AT, ->fff_fallback
   |.  lw CARG4, 8+HI(BASE)
-  |  ldc1 FARG1, 0(BASE)
-  |  ldc1 FARG2, 8(BASE)
   |  sltiu TMP0, CARG3, LJ_TISNUM
   |  sltiu TMP1, CARG4, LJ_TISNUM
+  |  load_farg1 0(BASE)
+  |  load_farg2 8(BASE)
   |  and TMP0, TMP0, TMP1
   |  beqz TMP0, ->fff_fallback
   |.endmacro
@@ -1027,8 +1197,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  beq BASE, TMP2, ->fff_res		// Done if exactly 1 argument.
   |.  sw CARG1, LO(RA)
   |1:
-  |  ldc1 f0, 0(TMP1)
-  |  sdc1 f0, -8(TMP1)
+  |  load_double1 0(TMP1)
+  |  store_double1 -8(TMP1)
   |  bne TMP1, TMP2, <1
   |.  addiu TMP1, TMP1, 8
   |  b ->fff_res
@@ -1043,8 +1213,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  not TMP1, TMP1
   |  sll TMP1, TMP1, 3
   |  addu TMP1, CFUNC:RB, TMP1
+  |.if HFABI
   |  b ->fff_resn
   |.  ldc1 FRET1, CFUNC:TMP1->upvalue
+  |.else
+  |  lw CRET1, CFUNC:TMP1->upvalue[0].u32.hi
+  |  b ->fff_resn
+  |.  lw CRET2, CFUNC:TMP1->upvalue[0].u32.lo
+  |.endif
   |
   |//-- Base library: getters and setters ---------------------------------
   |
@@ -1125,8 +1301,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  call_intern lj_tab_get	// (lua_State *L, GCtab *t, cTValue *key)
   |.  move CARG1, L
   |  // Returns cTValue *.
+  |.if HFABI
   |  b ->fff_resn
   |.  ldc1 FRET1, 0(CRET1)
+  |.else
+  |  lw CRET2, 4(CRET1)
+  |  b ->fff_resn
+  |.  lw CRET1, 0(CRET1)
+  |.endif
   |
   |//-- Base library: conversions ------------------------------------------
   |
@@ -1136,8 +1318,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  xori AT, NARGS8:RC, 8
   |  sltiu CARG1, CARG1, LJ_TISNUM
   |  movn CARG1, r0, AT
+  |.if HFABI
   |  beqz CARG1, ->fff_fallback		// Exactly one number argument.
   |.  ldc1 FRET1, 0(BASE)
+  |.else
+  |  lw CRET1, 0(BASE)
+  |  beqz CARG1, ->fff_fallback		// Exactly one number argument.
+  |.  lw CRET2, 4(BASE)
+  |.endif
   |  b ->fff_resn
   |.  nop
   |
@@ -1185,13 +1373,13 @@ static void build_subroutines(BuildCtx *ctx)
   |  // Returns 0 at end of traversal.
   |  beqz CRET1, ->fff_restv		// End of traversal: return nil.
   |.  li CARG3, LJ_TNIL
-  |  ldc1 f0, 8(BASE)			// Copy key and value to results.
+  |  load_double1 8(BASE)
   |    addiu RA, BASE, -8
-  |   ldc1 f2, 16(BASE)
-  |    li RD, (2+1)*8
-  |  sdc1 f0, 0(RA)
+  |   load_double2 16(BASE)
+  |  store_double1 0(RA)
+  |   store_double2 8(RA)
   |  b ->fff_res
-  |.  sdc1 f2, 8(RA)
+  |.  li RD, (2+1)*8
   |
   |.ffunc_1 pairs
   |  li AT, LJ_TTAB
@@ -1199,16 +1387,32 @@ static void build_subroutines(BuildCtx *ctx)
   |.  lw PC, FRAME_PC(BASE)
 #if LJ_52
   |  lw TAB:TMP2, TAB:CARG1->metatable
+  |.if FPU
   |   ldc1 f0, CFUNC:RB->upvalue[0]
+  |.else
+  |  lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+  |   lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+  |.endif
   |  bnez TAB:TMP2, ->fff_fallback
 #else
+  |.if FPU
   |  ldc1 f0, CFUNC:RB->upvalue[0]
+  |.else
+  |  lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+  |   lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+  |.endif
 #endif
   |.  addiu RA, BASE, -8
   |   sw TISNIL, 8+HI(BASE)
   |  li RD, (3+1)*8
+  |.if FPU
   |  b ->fff_res
   |.  sdc1 f0, 0(RA)
+  |.else
+  |  sw SFT1, HI(RA)			// SFT1 was loaded from upvalue[0].u32.hi.
+  |  b ->fff_res
+  |.  sw SFT2, LO(RA)			// SFT2 was loaded from upvalue[0].u32.lo.
+  |.endif
   |
   |.ffunc ipairs_aux
   |  sltiu AT, NARGS8:RC, 16
@@ -1216,35 +1420,55 @@ static void build_subroutines(BuildCtx *ctx)
   |    lw TAB:CARG1, LO(BASE)
   |   lw CARG4, 8+HI(BASE)
   |  bnez AT, ->fff_fallback
-  |.  ldc1 FARG2, 8(BASE)
-  |   addiu CARG3, CARG3, -LJ_TTAB
+  |.  addiu CARG3, CARG3, -LJ_TTAB
   |  sltiu AT, CARG4, LJ_TISNUM
   |   li TMP0, 1
   |  movn AT, r0, CARG3
-  |   mtc1 TMP0, FARG1
   |  beqz AT, ->fff_fallback
   |.  lw PC, FRAME_PC(BASE)
+  |.if FPU
+  |   ldc1 FARG2, 8(BASE)
+  |   mtc1 TMP0, FARG1
   |   trunc.w.d FRET1, FARG2
   |  cvt.d.w FARG1, FARG1
-  |   lw TMP0, TAB:CARG1->asize
-  |   lw TMP1, TAB:CARG1->array
   |  mfc1 TMP2, FRET1
-  |   addiu RA, BASE, -8
   |  add.d FARG2, FARG2, FARG1
+  |.else
+  |  sw CARG1, TEMP_SAVE_1
+  |  cvti2d TMP0
+  |  sw CRET1, TEMP_SAVE_2	// Store result CRET1/CRET2=1 (double).
+  |  sw CRET2, TEMP_SAVE_3
+  |  lw CARG2, 8+4(BASE)
+  |  load_got __fixdfsi
+  |  call_extern
+  |.  lw CARG1, 8(BASE)
+  |  sw CRET1, TEMP_SAVE_4
+  |  load_got __adddf3
+  |  lw CARG2, TEMP_SAVE_3
+  |  lw CARG3, 8(BASE)
+  |  lw CARG4, 8+4(BASE)
+  |  call_extern
+  |.  lw CARG1, TEMP_SAVE_2
+  |  lw TMP2, TEMP_SAVE_4
+  |  lw CARG1, TEMP_SAVE_1
+  |.endif
+  |   lw TMP0, TAB:CARG1->asize
+  |   lw TMP1, TAB:CARG1->array
   |  addiu TMP2, TMP2, 1
   |  sltu AT, TMP2, TMP0
+  |  addiu RA, BASE, -8
+  |  store_double FARG2, CRET1, CRET2, 0(RA)	// Store next index before branching: the path via 2: returns it, too.
+  |  beqz AT, >2			// Not in array part?
-  |   sll TMP3, TMP2, 3
+  |.  sll TMP3, TMP2, 3
   |   addu TMP3, TMP1, TMP3
-  |  beqz AT, >2			// Not in array part?
-  |.  sdc1 FARG2, 0(RA)
   |  lw TMP2, HI(TMP3)
-  |  ldc1 f0, 0(TMP3)
+  |  load_double1 0(TMP3)
   |1:
   |  beq TMP2, TISNIL, ->fff_res	// End of iteration, return 0 results.
   |.  li RD, (0+1)*8
-  |   li RD, (2+1)*8
+  |   store_double1 8(RA)
   |  b ->fff_res
-  |.  sdc1 f0, 8(RA)
+  |.  li RD, (2+1)*8
   |2:  // Check for empty hash part first. Otherwise call C function.
   |  lw TMP0, TAB:CARG1->hmask
   |  load_got lj_tab_getinth
@@ -1256,8 +1480,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  beqz CRET1, ->fff_res
   |.  li RD, (0+1)*8
   |  lw TMP2, HI(CRET1)
+  |.if FPU
   |  b <1
   |.  ldc1 f0, 0(CRET1)
+  |.else
+  |  lw SFT2, 4(CRET1)
+  |  b <1
+  |.  lw SFT1, 0(CRET1)
+  |.endif
   |
   |.ffunc_1 ipairs
   |  li AT, LJ_TTAB
@@ -1265,17 +1495,33 @@ static void build_subroutines(BuildCtx *ctx)
   |.  lw PC, FRAME_PC(BASE)
 #if LJ_52
   |  lw TAB:TMP2, TAB:CARG1->metatable
+  |.if FPU
   |   ldc1 f0, CFUNC:RB->upvalue[0]
+  |.else
+  |  lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+  |   lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+  |.endif
   |  bnez TAB:TMP2, ->fff_fallback
 #else
+  |.if FPU
   |  ldc1 f0, CFUNC:RB->upvalue[0]
+  |.else
+  |  lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+  |   lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+  |.endif
 #endif
   |.  addiu RA, BASE, -8
   |   sw r0, 8+HI(BASE)
   |   sw r0, 8+LO(BASE)
   |  li RD, (3+1)*8
+  |.if FPU
   |  b ->fff_res
   |.  sdc1 f0, 0(RA)
+  |.else
+  |  sw SFT1, HI(RA)			// SFT1 was loaded from upvalue[0].u32.hi.
+  |  b ->fff_res
+  |.  sw SFT2, LO(RA)			// SFT2 was loaded from upvalue[0].u32.lo.
+  |.endif
   |
   |//-- Base library: catch errors ----------------------------------------
   |
@@ -1295,8 +1541,12 @@ static void build_subroutines(BuildCtx *ctx)
   |    sltiu AT, NARGS8:RC, 16
   |  lw CARG4, 8+HI(BASE)
   |    bnez AT, ->fff_fallback
+  |.if FPU
   |.  ldc1 FARG2, 8(BASE)
-  |   ldc1 FARG1, 0(BASE)
+  |.else
+  |.  lw CARG3, 8+LO(BASE)
+  |.endif
+  |   load_double FARG1, CARG1, CARG2, 0(BASE)
   |    lbu TMP1, DISPATCH_GL(hookmask)(DISPATCH)
   |  li AT, LJ_TFUNC
   |   move TMP2, BASE
@@ -1304,9 +1554,14 @@ static void build_subroutines(BuildCtx *ctx)
   |   addiu BASE, BASE, 16
   |  // Remember active hook before pcall.
   |  srl TMP3, TMP3, HOOK_ACTIVE_SHIFT
+  |.if FPU
   |   sdc1 FARG2, 0(TMP2)		// Swap function and traceback.
+  |.else
+  |   sw CARG3, LO(TMP2)
+  |   sw CARG4, HI(TMP2)
+  |.endif
   |  andi TMP3, TMP3, 1
-  |   sdc1 FARG1, 8(TMP2)
+  |   store_double FARG1, CARG1, CARG2, 8(TMP2)
   |  addiu PC, TMP3, 16+FRAME_PCALL
   |  b ->vm_call_dispatch
   |.  addiu NARGS8:RC, NARGS8:RC, -16
@@ -1350,11 +1605,11 @@ static void build_subroutines(BuildCtx *ctx)
   |  move CARG3, CARG2
   |  sw BASE, L->top
   |2:  // Move args to coroutine.
-  |   ldc1 f0, 0(BASE)
+  |   load_double1 0(BASE)
   |  sltu AT, BASE, TMP1
   |  beqz AT, >3
   |.  addiu BASE, BASE, 8
-  |   sdc1 f0, 0(CARG3)
+  |   store_double1 0(CARG3)
   |  b <2
   |.  addiu CARG3, CARG3, 8
   |3:
@@ -1380,10 +1635,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  sw TMP2, L:RA->top			// Clear coroutine stack.
   |  move TMP1, BASE
   |5:  // Move results from coroutine.
-  |   ldc1 f0, 0(TMP2)
+  |   load_double1 0(TMP2)
   |  addiu TMP2, TMP2, 8
   |  sltu AT, TMP2, TMP3
-  |   sdc1 f0, 0(TMP1)
+  |   store_double1 0(TMP1)
   |  bnez AT, <5
   |.  addiu TMP1, TMP1, 8
   |6:
@@ -1408,12 +1663,12 @@ static void build_subroutines(BuildCtx *ctx)
   |.if resume
   |  addiu TMP3, TMP3, -8
   |   li TMP1, LJ_TFALSE
-  |  ldc1 f0, 0(TMP3)
+  |  load_double1 0(TMP3)
   |   sw TMP3, L:RA->top		// Remove error from coroutine stack.
   |    li RD, (2+1)*8
   |   sw TMP1, -8+HI(BASE)		// Prepend false to results.
   |    addiu RA, BASE, -8
-  |  sdc1 f0, 0(BASE)			// Copy error message.
+  |   store_double1 0(BASE)			// Copy error message.
   |  b <7
   |.  andi TMP0, PC, FRAME_TYPE
   |.else
@@ -1449,13 +1704,33 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |//-- Math library -------------------------------------------------------
   |
-  |.ffunc_n math_abs
+  |.ffunc_1 math_abs
+  |   load_farg1 0(BASE)
+  |   sltiu AT, CARG3, LJ_TISNUM
+  |  beqz AT, ->fff_fallback
+  |.  nop
+  |.if FPU
   |.  abs.d FRET1, FARG1
+  |.else
+  |.  lui TMP1, 0x8000
+  |   and AT, CARG1, TMP1
+  |   move CRET2, CARG2
+  |   beqz AT, ->fff_resn
+  |.  move CRET1, CARG1
+  |   xor CRET1, CARG1, TMP1
+  |.endif
+  |
   |->fff_resn:
   |  lw PC, FRAME_PC(BASE)
   |  addiu RA, BASE, -8
+  |.if HFABI
   |  b ->fff_res1
   |.  sdc1 FRET1, -8(BASE)
+  |.else
+  |  sw CRET1, -8(BASE)
+  |  b ->fff_res1
+  |.  sw CRET2, -8+4(BASE)
+  |.endif
   |
   |->fff_restv:
   |  // CARG3/CARG1 = TValue result.
@@ -1498,8 +1773,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
   |.  nop
+  |.if HFABI
   |  call_extern
   |.  ldc1 FARG1, 0(BASE)
+  |.else
+  |  lw CARG1, 0(BASE)
+  |  call_extern
+  |.  lw CARG2, 4(BASE)
+  |.endif
   |  b ->fff_resn
   |.  nop
   |.endmacro
@@ -1526,15 +1807,20 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_round ceil
   |
   |.ffunc math_log
-  |  lw CARG3, HI(BASE)
   |  li AT, 8
   |  bne NARGS8:RC, AT, ->fff_fallback	// Exactly 1 argument.
-  |.  load_got log
+  |.  lw CARG3, HI(BASE)
   |  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
-  |.  nop
+  |.  load_got log
+  |.if HFABI
   |  call_extern
   |.  ldc1 FARG1, 0(BASE)
+  |.else
+  |  lw CARG1, 0(BASE)
+  |  call_extern
+  |.  lw CARG2, 4(BASE)
+  |.endif
   |  b ->fff_resn
   |.  nop
   |
@@ -1553,17 +1839,40 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
+  |.if FPU
   |.ffunc_n math_sqrt
   |.  sqrt.d FRET1, FARG1
   |  b ->fff_resn
   |.  nop
+  |.else
+  |  math_extern sqrt
+  |.endif
   |
-  |.ffunc_nn math_ldexp
+  |.ffunc_2 math_ldexp
+  |  sltiu TMP0, CARG3, LJ_TISNUM
+  |  sltiu TMP1, CARG4, LJ_TISNUM
+  |   load_farg1 0(BASE)
+  |   load_farg2 8(BASE)
+  |  and TMP0, TMP0, TMP1
+  |  beqz TMP0, ->fff_fallback
+  |.if FPU
+  |  load_got ldexp
   |  trunc.w.d FARG2, FARG2
+  |  call_extern
+  |.  mfc1 CARG3, FARG2
+  |.else
+  |  sw CARG1, TEMP_SAVE_1
+  |  sw CARG2, TEMP_SAVE_2
+  |  load_got __fixdfsi
+  |   move CARG1, CARG3
+  |  call_extern
+  |.  move CARG2, CARG4
+  |  lw CARG1, TEMP_SAVE_1
   |  load_got ldexp
-  |  mfc1 CARG3, FARG2
+  |  lw CARG2, TEMP_SAVE_2
   |  call_extern
-  |.  nop
+  |.  move CARG3, CRET1
+  |.endif
   |  b ->fff_resn
   |.  nop
   |
@@ -1574,10 +1883,14 @@ static void build_subroutines(BuildCtx *ctx)
   |.  addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
   |   lw TMP1, DISPATCH_GL(tmptv)(DISPATCH)
   |  addiu RA, BASE, -8
+  |  store_double FRET1, CRET1, CRET2, 0(RA)
+  |.if FPU
   |   mtc1 TMP1, FARG2
-  |  sdc1 FRET1, 0(RA)
   |   cvt.d.w FARG2, FARG2
-  |   sdc1 FARG2, 8(RA)
+  |.else
+  |   cvti2d TMP1
+  |.endif
+  |  store_double FARG2, CRET1, CRET2, 8(RA)
   |  b ->fff_res
   |.  li RD, (2+1)*8
   |
@@ -1587,7 +1900,12 @@ static void build_subroutines(BuildCtx *ctx)
   |  call_extern
   |.  addiu CARG3, BASE, -8
   |  addiu RA, BASE, -8
+  |.if HFABI
   |  sdc1 FRET1, 0(BASE)
+  |.else
+  |  sw CRET1, 0(BASE)
+  |  sw CRET2, 4(BASE)
+  |.endif
   |  b ->fff_res
   |.  li RD, (2+1)*8
   |
@@ -1595,25 +1913,73 @@ static void build_subroutines(BuildCtx *ctx)
   |->ff_ .. name:
   |  lw CARG3, HI(BASE)
   |  beqz NARGS8:RC, ->fff_fallback
-  |.  ldc1 FRET1, 0(BASE)
-  |  sltiu AT, CARG3, LJ_TISNUM
+  |.  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
   |.  addu TMP2, BASE, NARGS8:RC
   |  addiu TMP1, BASE, 8
+  |.if HFABI
+  |  ldc1 FRET1, 0(BASE)
   |  beq TMP1, TMP2, ->fff_resn
+  |.else
+  |  lw CRET1, 0(BASE)
+  |  lw CRET2, 4(BASE)
+  |  beq TMP1, TMP2, ->fff_resn
+  |.endif
   |1:
   |.  lw CARG3, HI(TMP1)
+  |.if HFABI
   |  ldc1 FARG1, 0(TMP1)
-  |   addiu TMP1, TMP1, 8
+  |.else
+  |  lw CARG1, 0(TMP1)
+  |  lw CARG2, 4(TMP1)
+  |.endif
   |  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
+  |. addiu TMP1, TMP1, 8
+  |.if FPU
   |.if ismax
-  |.  c.olt.d FARG1, FRET1
+  |  c.olt.d FARG1, FRET1
   |.else
-  |.  c.olt.d FRET1, FARG1
+  |  c.olt.d FRET1, FARG1
   |.endif
   |  bne TMP1, TMP2, <1
   |.  movf.d FRET1, FARG1
+  |.else
+  |  load_got __ledf2
+  |  sw TMP1, TEMP_SAVE_1
+  |  sw TMP2, TEMP_SAVE_2
+  |  sw CARG1, TEMP_SAVE_3
+  |  sw CARG2, TEMP_SAVE_4
+  |  sw CRET1, TEMP_SAVE_5
+  |  sw CRET2, TEMP_SAVE_6
+  |  move CARG3, CRET1
+  |  call_extern
+  |.  move CARG4, CRET2
+  |  lw CARG4, TEMP_SAVE_6
+  |  lw CARG3, TEMP_SAVE_5
+  |  lw CARG2, TEMP_SAVE_4
+  |  lw CARG1, TEMP_SAVE_3
+  |  lw TMP2, TEMP_SAVE_2
+  |  lw TMP1, TEMP_SAVE_1
+  |.if ismax
+  |  beqz CRET1, >2		// farg1==fret1
+  |.  li TMP3, 1
+  |  beq CRET1, TMP3, >2	// farg1>fret1
+  |.  nop
+  |.else
+  |  blez CRET1, >2
+  |.  nop
+  |.endif
+  |  move CRET1, CARG3		// Keep the value.
+  |  b >3
+  |.  move CRET2, CARG4
+  |2:
+  |  move CRET1, CARG1		// Set new value.
+  |  move CRET2, CARG2
+  |3:
+  |  bne TMP1, TMP2, <1
+  |.  nop
+  |.endif
   |  b ->fff_resn
   |.  nop
   |.endmacro
@@ -1632,32 +1998,52 @@ static void build_subroutines(BuildCtx *ctx)
   |  bnez AT, ->fff_fallback		// Need exactly 1 string argument.
   |.  nop
   |  lw TMP0, STR:CARG1->len
-  |   lbu TMP1, STR:CARG1[1]		// Access is always ok (NUL at end).
   |    addiu RA, BASE, -8
   |  sltu RD, r0, TMP0
-  |   mtc1 TMP1, f0
+  |  lw PC, FRAME_PC(BASE)
   |  addiu RD, RD, 1
+  |   lbu TMP1, STR:CARG1[1]		// Access is always ok (NUL at end).
+  |.if FPU
+  |   mtc1 TMP1, f0
   |   cvt.d.w f0, f0
-  |  lw PC, FRAME_PC(BASE)
-  |  sll RD, RD, 3			// RD = ((str->len != 0)+1)*8
+  |   sdc1 f0, 0(RA)
+  |.else
+  |   sw RD, TEMP_SAVE_1
+  |   cvti2d TMP1
+  |  sw CRET1, 0(RA)
+  |  sw CRET2, 4(RA)
+  |   lw RD, TEMP_SAVE_1
+  |.endif
   |  b ->fff_res
-  |.  sdc1 f0, 0(RA)
+  |.  sll RD, RD, 3		// RD = ((str->len != 0)+1)*8
   |
   |.ffunc string_char			// Only handle the 1-arg case here.
   |  ffgccheck
   |  lw CARG3, HI(BASE)
-  |   ldc1 FARG1, 0(BASE)
   |  li AT, 8
   |  bne NARGS8:RC, AT, ->fff_fallback	// Exactly 1 argument.
   |.  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
   |.  li CARG3, 1
-  |   trunc.w.d FARG1, FARG1
-  |  addiu CARG2, sp, ARG5_OFS
   |  sltiu AT, TMP0, 256
-  |   mfc1 TMP0, FARG1
   |  beqz AT, ->fff_fallback
-  |.  sw TMP0, ARG5
+  |  load_farg1 0(BASE)
+  |.if FPU
+  |   trunc.w.d FARG1, FARG1
+  |   mfc1 TMP0, FARG1
+  |.else
+  |   load_got __fixdfsi
+  |   sw RB, TEMP_SAVE_1
+  |   sw RC, TEMP_SAVE_2
+  |   call_extern
+  |.   sw CARG3, TEMP_SAVE_3
+  |   lw CARG3, TEMP_SAVE_3
+  |   lw RC, TEMP_SAVE_2
+  |   lw RB, TEMP_SAVE_1
+  |   move TMP0, CRET1
+  |.endif
+  |  addiu CARG2, sp, ARG5_OFS
+  |  sw TMP0, ARG5
   |->fff_newstr:
   |  load_got lj_str_new
   |   sw BASE, L->base
@@ -1674,27 +2060,52 @@ static void build_subroutines(BuildCtx *ctx)
   |.ffunc string_sub
   |  ffgccheck
   |  addiu AT, NARGS8:RC, -16
+  |.if FPU
+  |   ldc1 f0, 16(BASE)
+  |   trunc.w.d f0, f0
+  |.else
+  |   lw CARG1, 16(BASE)
+  |   load_got __fixdfsi
+  |   sw AT, TEMP_SAVE_1
+  |   call_extern
+  |.   lw CARG2, 16+4(BASE)
+  |   lw AT, TEMP_SAVE_1
+  |.endif
   |   lw CARG3, 16+HI(BASE)
-  |    ldc1 f0, 16(BASE)
   |   lw TMP0, HI(BASE)
   |    lw STR:CARG1, LO(BASE)
   |  bltz AT, ->fff_fallback
-  |   lw CARG2, 8+HI(BASE)
-  |    ldc1 f2, 8(BASE)
+  |.  lw CARG2, 8+HI(BASE)
   |  beqz AT, >1
   |.  li CARG4, -1
-  |   trunc.w.d f0, f0
   |  sltiu AT, CARG3, LJ_TISNUM
   |  beqz AT, ->fff_fallback
+  |.if FPU
   |.  mfc1 CARG4, f0
+  |.else
+  |.  move CARG4, CRET1
+  |.endif
   |1:
   |  sltiu AT, CARG2, LJ_TISNUM
   |  beqz AT, ->fff_fallback
   |.  li AT, LJ_TSTR
-  |  trunc.w.d f2, f2
   |  bne TMP0, AT, ->fff_fallback
-  |.  lw CARG2, STR:CARG1->len
+  |.if FPU
+  |.  ldc1 f2, 8(BASE)
+  |  trunc.w.d f2, f2
   |  mfc1 CARG3, f2
+  |.else
+  |.   sw CARG1, TEMP_SAVE_1
+  |   sw CARG4, TEMP_SAVE_2
+  |   lw CARG2, 8+4(BASE)
+  |   load_got __fixdfsi
+  |   call_extern
+  |.   lw CARG1, 8(BASE)
+  |   lw CARG1, TEMP_SAVE_1
+  |   lw CARG4, TEMP_SAVE_2
+  |   move CARG3, CRET1
+  |.endif
+  |  lw CARG2, STR:CARG1->len
   |  // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end
   |  slt AT, CARG4, r0
   |  addiu TMP0, CARG2, 1
@@ -1749,10 +2160,58 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |//-- Bit library --------------------------------------------------------
   |
+  |.if not FPU
+  |// FP number to bit conversion for soft-float. In: CARG1 = sign/exponent word, CARG2 = low mantissa word. Out: CRET1. Writes TMP0, TMP3, CARG2, CARG4.
+  |->vm_tobit:
+  |  sll TMP0, CARG1, 1			// Drop the sign bit: exponent field now in bits 31..21.
+  |  lui TMP3, 0x0020			// 1 << 21 = one unit of the shifted exponent field.
+  |  addu TMP0, TMP0, TMP3
+  |  slt TMP3, TMP0, r0			// Bit 31 set <=> biased exponent in 0x3ff..0x7fe.
+  |  movz CARG2, r0, TMP3		// Otherwise the result is 0.
+  |  beqz TMP3, >2
+  |.  li CARG4, 0x3e0
+  |  not CARG4, CARG4			// CARG4 = ~0x3e0 = -993.
+  |  sra TMP0, TMP0, 21			// TMP0 = biased exponent - 2047 (sign-extended field).
+  |  subu TMP0, CARG4, TMP0		// TMP0 = 1054 - biased exponent = 31 - unbiased exponent.
+  |  slt TMP3, TMP0, r0
+  |  bnez TMP3, >1			// Unbiased exponent > 31: handled at 1: below.
+  |.  sll CARG4, CARG1, 11		// High mantissa bits moved up to bits 30..11.
+  |  lui TMP3, 0x8000
+  |  or CARG4, CARG4, TMP3		// Force the leading 1 into bit 31.
+  |  srl TMP3, CARG2, 21
+  |  or CARG4, CARG4, TMP3		// Append the top 11 bits of the low word.
+  |  slt TMP3, CARG1, r0		// TMP3 = 1 for negative input.
+  |  beqz TMP3, >2
+  |.  srlv CARG2, CARG4, TMP0		// Magnitude = mantissa >> (31 - exponent).
+  |  subu CARG2, r0, CARG2		// Negate for negative input.
+  |2:
+  |  jr ra
+  |.  move CRET1, CARG2
+  |1:
+  |  addiu TMP0, TMP0, 21		// TMP0 = 52 - unbiased exponent.
+  |  srlv CARG4, CARG2, TMP0		// Contribution of the low word.
+  |  li TMP3, 20
+  |  subu TMP0, TMP3, TMP0		// TMP0 = unbiased exponent - 32.
+  |  sll CARG2, CARG1, 12		// High mantissa bits, sign and exponent shifted out.
+  |  sllv TMP3, CARG2, TMP0
+  |  or CARG2, CARG4, TMP3		// Combine both parts into the low 32 bits.
+  |  slt TMP3, CARG1, r0		// TMP3 = 1 for negative input.
+  |  beqz TMP3, <2			// Non-negative: return CARG2 via 2: above.
+  |.  nop
+  |  jr ra
+  |.  subu CRET1, r0, CARG2		// Negative: return -CARG2.
+  |.endif
+  |
   |.macro .ffunc_bit, name
   |  .ffunc_n bit_..name
+  |.if FPU
   |.  add.d FARG1, FARG1, TOBIT
   |  mfc1 CRET1, FARG1
+  |.else
+  |.  nop
+  |  bal ->vm_tobit
+  |.  nop
+  |.endif
   |.endmacro
   |
   |.macro .ffunc_bit_op, name, ins
@@ -1760,14 +2219,27 @@ static void build_subroutines(BuildCtx *ctx)
   |  addiu TMP1, BASE, 8
   |  addu TMP2, BASE, NARGS8:RC
   |1:
+  |  move CRET2, CRET1
   |  lw CARG4, HI(TMP1)
+  |.if FPU
   |  beq TMP1, TMP2, ->fff_resi
   |.  ldc1 FARG1, 0(TMP1)
+  |.else
+  |  lw CARG1, 0(TMP1)
+  |  beq TMP1, TMP2, ->fff_resi
+  |.  lw CARG2, 4(TMP1)
+  |.endif
   |  sltiu AT, CARG4, LJ_TISNUM
   |  beqz AT, ->fff_fallback
-  |  add.d FARG1, FARG1, TOBIT
-  |  mfc1 CARG2, FARG1
-  |  ins CRET1, CRET1, CARG2
+  |.if FPU
+  |.  add.d FARG1, FARG1, TOBIT
+  |  mfc1 CRET1, FARG1
+  |.else
+  |.  nop
+  |  bal ->vm_tobit
+  |.  nop
+  |.endif
+  |  ins CRET1, CRET2, CRET1
   |  b <1
   |.  addiu TMP1, TMP1, 8
   |.endmacro
@@ -1794,10 +2266,22 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |.macro .ffunc_bit_sh, name, ins, shmod
   |  .ffunc_nn bit_..name
+  |.if FPU
   |.  add.d FARG1, FARG1, TOBIT
   |  add.d FARG2, FARG2, TOBIT
   |  mfc1 CARG1, FARG1
   |  mfc1 CARG2, FARG2
+  |.else
+  |.  sw CARG4, TEMP_SAVE_1
+  |  bal ->vm_tobit
+  |.  nop
+  |  move CRET2, CRET1
+  |  lw CARG2, TEMP_SAVE_1
+  |  bal ->vm_tobit
+  |.  move CARG1, CARG3
+  |  move CARG2, CRET1
+  |  move CARG1, CRET2
+  |.endif
   |.if shmod == 1
   |  li AT, 32
   |  subu TMP0, AT, CARG2
@@ -1822,9 +2306,19 @@ static void build_subroutines(BuildCtx *ctx)
   |
   |.ffunc_bit tobit
   |->fff_resi:
+  |  lw PC, FRAME_PC(BASE)
+  |  addiu RA, BASE, -8
+  |.if HFABI
   |  mtc1 CRET1, FRET1
-  |  b ->fff_resn
-  |.  cvt.d.w FRET1, FRET1
+  |  cvt.d.w FRET1, FRET1
+  |  b ->fff_res1
+  |.  sdc1 FRET1, -8(BASE)
+  |.else		// Result already in CRET1.
+  |  cvti2d CRET1
+  |  sw CRET1, -8(BASE)
+  |  b ->fff_res1
+  |.  sw CRET2, -8+4(BASE)
+  |.endif
   |
   |//-----------------------------------------------------------------------
   |
@@ -2082,14 +2576,23 @@ static void build_subroutines(BuildCtx *ctx)
   |//-----------------------------------------------------------------------
   |
   |.macro savex_, a, b
+  |.if FPU
   |  sdc1 f..a, 16+a*8(sp)
   |  sw r..a, 16+32*8+a*4(sp)
   |  sw r..b, 16+32*8+b*4(sp)
+  |.else
+  |  sw r..a, 16+a*4(sp)
+  |  sw r..b, 16+b*4(sp)
+  |.endif
   |.endmacro
   |
   |->vm_exit_handler:
   |.if JIT
+  |.if FPU
   |  addiu sp, sp, -(16+32*8+32*4)
+  |.else
+  |  addiu sp, sp, -(16+32*4)
+  |.endif
   |  savex_ 0, 1
   |  savex_ 2, 3
   |  savex_ 4, 5
@@ -2104,17 +2607,25 @@ static void build_subroutines(BuildCtx *ctx)
   |  savex_ 22, 23
   |  savex_ 24, 25
   |  savex_ 26, 27
+  |.if FPU
   |  sdc1 f28, 16+28*8(sp)
-  |  sw r28, 16+32*8+28*4(sp)
   |  sdc1 f30, 16+30*8(sp)
+  |  sw r28, 16+32*8+28*4(sp)
   |  sw r30, 16+32*8+30*4(sp)
   |  sw r0, 16+32*8+31*4(sp)		// Clear RID_TMP.
+  |  addiu TMP2, sp, 16+32*8+32*4	// Recompute original value of sp.
+  |  sw TMP2, 16+32*8+29*4(sp)		// Store sp in RID_SP
+  |.else
+  |  sw r28, 16+28*4(sp)
+  |  sw r30, 16+30*4(sp)
+  |  sw r0, 16+31*4(sp)			// Clear RID_TMP.
+  |  addiu TMP2, sp, 16+32*4		// Recompute original value of sp.
+  |  sw TMP2, 16+29*4(sp)		// Store sp in RID_SP
+  |.endif
   |  li_vmstate EXIT
-  |   addiu TMP2, sp, 16+32*8+32*4	// Recompute original value of sp.
   |  addiu DISPATCH, JGL, -GG_DISP2G-32768
   |  lw TMP1, 0(TMP2)			// Load exit number.
   |  st_vmstate
-  |   sw TMP2, 16+32*8+29*4(sp)		// Store sp in RID_SP.
   |  lw L, DISPATCH_GL(cur_L)(DISPATCH)
   |   lw BASE, DISPATCH_GL(jit_base)(DISPATCH)
   |  load_got lj_trace_exit
@@ -2144,15 +2655,15 @@ static void build_subroutines(BuildCtx *ctx)
   |1:
   |  bltz CRET1, >9			// Check for error from exit.
   |.  lw LFUNC:RB, FRAME_FUNC(BASE)
-  |    lui TMP3, 0x59c0			// TOBIT = 2^52 + 2^51 (float).
+  |    .FPU lui TMP3, 0x59c0			// TOBIT = 2^52 + 2^51 (float).
   |  sll MULTRES, CRET1, 3
   |    li TISNIL, LJ_TNIL
   |  sw MULTRES, SAVE_MULTRES
-  |    mtc1 TMP3, TOBIT
+  |    .FPU mtc1 TMP3, TOBIT
   |  lw TMP1, LFUNC:RB->pc
   |   sw r0, DISPATCH_GL(jit_base)(DISPATCH)
   |  lw KBASE, PC2PROTO(k)(TMP1)
-  |    cvt.d.s TOBIT, TOBIT
+  |    .FPU cvt.d.s TOBIT, TOBIT
   |  // Modified copy of ins_next which handles function header dispatch, too.
   |  lw INS, 0(PC)
   |   addiu PC, PC, 4
@@ -2160,7 +2671,7 @@ static void build_subroutines(BuildCtx *ctx)
   |    sw TISNIL, DISPATCH_GL(vmstate)(DISPATCH)
   |  decode_OP4a TMP1, INS
   |  decode_OP4b TMP1
-  |    sltiu TMP2, TMP1, BC_FUNCF*4	// Function header?
+  |    sltiu TMP2, TMP1, BC_FUNCF*4
   |  addu TMP0, DISPATCH, TMP1
   |   decode_RD8a RD, INS
   |  lw AT, 0(TMP0)
@@ -2202,7 +2713,7 @@ static void build_subroutines(BuildCtx *ctx)
   |//-----------------------------------------------------------------------
   |
   |// Modifies AT, TMP0, FRET1, FRET2, f4. Keeps all others incl. FARG1.
-  |.macro vm_round, func
+  |.macro vm_round_hf, func
   |  lui TMP0, 0x4330			// Hiword of 2^52 (double).
   |  mtc1 r0, f4
   |  mtc1 TMP0, f5
@@ -2244,6 +2755,25 @@ static void build_subroutines(BuildCtx *ctx)
   |.  mov.d FRET1, FARG1
   |.endmacro
   |
+  |.macro vm_round_sf, func		// Soft-float: call C function func, args/result stay in GPRs.
+  |  addiu sp, sp, -24			// 16 byte o32 argument home area + ra slot, 8 byte aligned.
+  |  load_got func
+  |  sw ra, 16(sp)			// Keep ra above the area the callee may spill a0-a3 into.
+  |  call_extern
+  |.   nop
+  |  lw ra, 16(sp)
+  |  jr ra
+  |.  addiu sp, sp, 24
+  |.endmacro
+  |
+  |.macro vm_round, func
+  |.if FPU
+  |  vm_round_hf, func
+  |.else
+  |  vm_round_sf, func
+  |.endif
+  |.endmacro
+  |
   |->vm_floor:
   |  vm_round floor
   |->vm_ceil:
@@ -2272,10 +2802,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  sw r1, CTSTATE->cb.slot
   |  sw CARG1, CTSTATE->cb.gpr[0]
   |  sw CARG2, CTSTATE->cb.gpr[1]
-  |   sdc1 FARG1, CTSTATE->cb.fpr[0]
+  |   .FPU sdc1 FARG1, CTSTATE->cb.fpr[0]
   |  sw CARG3, CTSTATE->cb.gpr[2]
   |  sw CARG4, CTSTATE->cb.gpr[3]
-  |   sdc1 FARG2, CTSTATE->cb.fpr[1]
+  |   .FPU sdc1 FARG2, CTSTATE->cb.fpr[1]
   |  addiu TMP0, sp, CFRAME_SPACE+16
   |  sw TMP0, CTSTATE->cb.stack
   |  sw r0, SAVE_PC			// Any value outside of bytecode is ok.
@@ -2286,14 +2816,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  lw BASE, L:CRET1->base
   |  lw RC, L:CRET1->top
   |   move L, CRET1
-  |     lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
+  |     .FPU lui TMP3, 0x59c0		// TOBIT = 2^52 + 2^51 (float).
   |  lw LFUNC:RB, FRAME_FUNC(BASE)
-  |     mtc1 TMP3, TOBIT
+  |     .FPU mtc1 TMP3, TOBIT
   |    li_vmstate INTERP
   |     li TISNIL, LJ_TNIL
   |  subu RC, RC, BASE
   |    st_vmstate
-  |     cvt.d.s TOBIT, TOBIT
+  |     .FPU cvt.d.s TOBIT, TOBIT
   |  ins_callt
   |.endif
   |
@@ -2307,11 +2837,11 @@ static void build_subroutines(BuildCtx *ctx)
   |  move CARG2, RA
   |  call_intern lj_ccallback_leave	// (CTState *cts, TValue *o)
   |.  move CARG1, CTSTATE
+  |   .FPU ldc1 FRET1, CTSTATE->cb.fpr[0]
   |  lw CRET1, CTSTATE->cb.gpr[0]
-  |   ldc1 FRET1, CTSTATE->cb.fpr[0]
-  |  lw CRET2, CTSTATE->cb.gpr[1]
+  |   .FPU ldc1 FRET2, CTSTATE->cb.fpr[1]
   |  b ->vm_leave_unw
-  |.  ldc1 FRET2, CTSTATE->cb.fpr[1]
+  |.  lw CRET2, CTSTATE->cb.gpr[1]
   |.endif
   |
   |->vm_ffi_call:			// Call C function via FFI.
@@ -2343,8 +2873,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  lw CARG2, CCSTATE->gpr[1]
   |  lw CARG3, CCSTATE->gpr[2]
   |  lw CARG4, CCSTATE->gpr[3]
-  |  ldc1 FARG1, CCSTATE->fpr[0]
-  |  ldc1 FARG2, CCSTATE->fpr[1]
+  |  .FPU ldc1 FARG1, CCSTATE->fpr[0]
+  |  .FPU ldc1 FARG2, CCSTATE->fpr[1]
   |  jalr CFUNCADDR
   |.  lw CARG1, CCSTATE->gpr[0]		// Do this last, since CCSTATE is CARG1.
   |  lw CCSTATE:TMP1, -12(r16)
@@ -2352,8 +2882,10 @@ static void build_subroutines(BuildCtx *ctx)
   |  lw ra, -4(r16)
   |  sw CRET1, CCSTATE:TMP1->gpr[0]
   |  sw CRET2, CCSTATE:TMP1->gpr[1]
-  |  sdc1 FRET1, CCSTATE:TMP1->fpr[0]
-  |  sdc1 FRET2, CCSTATE:TMP1->fpr[1]
+  |  .FPU sdc1 FRET1, CCSTATE:TMP1->fpr[0]
+  |  .FPU sdc1 FRET2, CCSTATE:TMP1->fpr[1]
+  |  sw CARG1, CCSTATE:TMP1->gpr[2]	// MIPS32 soft-float.
+  |  sw CARG2, CCSTATE:TMP1->gpr[3]	// Complex doubles are returned in v0, v1, a0, a1.
   |  move sp, r16
   |  jr ra
   |.  move r16, TMP2
@@ -2381,8 +2913,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   addu CARG3, BASE, RD
     |  lw TMP0, HI(CARG2)
     |   lw TMP1, HI(CARG3)
-    |   ldc1 f0, 0(CARG2)
-    |   ldc1 f2, 0(CARG3)
     |  sltiu TMP0, TMP0, LJ_TISNUM
     |   sltiu TMP1, TMP1, LJ_TISNUM
     |    lhu TMP2, OFS_RD(PC)
@@ -2390,8 +2920,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |    addiu PC, PC, 4
     |  beqz TMP0, ->vmeta_comp
     |.   lui TMP1, (-(BCBIAS_J*4 >> 16) & 65535)
+    |   load_double f0, CARG1, CARG2, 0(CARG2)
+    |.if FPU
+    |   ldc1 f2, 0(CARG3)
+    |.else
+    |   lw CARG4, 4(CARG3)
+    |   lw CARG3, 0(CARG3)
+    |.endif
     |    decode_RD4b TMP2
     |    addu TMP2, TMP2, TMP1
+    |.if FPU
     if (op == BC_ISLT || op == BC_ISGE) {
       |  c.olt.d f0, f2
     } else {
@@ -2402,8 +2940,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     } else {
       |  movt TMP2, r0
     }
-    |  addu PC, PC, TMP2
+    |.else
+    |  load_got __ledf2
+    |  sw RD, TEMP_SAVE_1
+    |  sw TMP1, TEMP_SAVE_2
+    |  call_extern              //CRET1 = f0<=f2
+    |.  sw TMP2, TEMP_SAVE_3
+    |  lw TMP2, TEMP_SAVE_3
+    |  lw TMP1, TEMP_SAVE_2
+    if (op == BC_ISLT) {
+      |  bltz CRET1, >1
+    } else if (op == BC_ISLE) {
+      |  blez CRET1, >1
+    }  else if (op == BC_ISGT) {
+      |  bgtz CRET1, >1
+    }  else {
+      |  bgez CRET1, >1
+    }
+    |.  lw RD, TEMP_SAVE_1
+    |  move TMP2, r0
     |1:
+    |.endif
+    |  addu PC, PC, TMP2
     |  ins_next
     break;
 
@@ -2413,24 +2971,43 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  addu RA, BASE, RA
     |   addiu PC, PC, 4
     |  lw TMP0, HI(RA)
-    |   ldc1 f0, 0(RA)
     |  addu RD, BASE, RD
     |    lhu TMP2, -4+OFS_RD(PC)
-    |   lw TMP1, HI(RD)
-    |   ldc1 f2, 0(RD)
     |    lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
+    |   lw TMP1, HI(RD)
+    |    decode_RD4b TMP2
     |  sltiu AT, TMP0, LJ_TISNUM
     |  sltiu CARG1, TMP1, LJ_TISNUM
-    |    decode_RD4b TMP2
+    |   load_double f2, CARG3, CARG4, 0(RD)
+    |    lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
     |  and AT, AT, CARG1
+    |   load_double f0, CARG1, CARG2, 0(RA)
     |  beqz AT, >5
     |.   addu TMP2, TMP2, TMP3
+    |.if FPU
     |  c.eq.d f0, f2
     if (vk) {
       |  movf TMP2, r0
     } else {
       |  movt TMP2, r0
     }
+    |.else
+    |  load_got __ledf2
+    |  sw RD, TEMP_SAVE_1
+    |  call_extern
+    |.  sw TMP2, TEMP_SAVE_2
+    |  lw RD, TEMP_SAVE_1
+    |  lw TMP2, TEMP_SAVE_2
+    if (vk) {
+      |  beqz CRET1, >4
+      |.  nop
+    } else {
+      |  bnez CRET1, >4
+      |.  nop
+    }
+    |  move TMP2, r0
+    |4:
+    |.endif
     |1:
     |  addu PC, PC, TMP2
     |  ins_next
@@ -2507,10 +3084,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  addu RA, BASE, RA
     |   addiu PC, PC, 4
     |  lw TMP0, HI(RA)
-    |   ldc1 f0, 0(RA)
+    |   load_double f0, CARG1, CARG2, 0(RA)
     |  addu RD, KBASE, RD
     |    lhu TMP2, -4+OFS_RD(PC)
-    |   ldc1 f2, 0(RD)
+    |   load_double f2, CARG3, CARG4, 0(RD)
     |    lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
     |  sltiu AT, TMP0, LJ_TISNUM
     |    decode_RD4b TMP2
@@ -2520,6 +3097,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  beqz AT, >1
     |.endif
     |.   addu TMP2, TMP2, TMP3
+    |.if FPU
     |   c.eq.d f0, f2
     if (vk) {
       |  movf TMP2, r0
@@ -2530,6 +3108,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |1:
       |  addu PC, PC, TMP2
     }
+    |.else
+    |  load_got __ledf2
+    |   sw RD, TEMP_SAVE_1
+    |  call_extern
+    |.  sw TMP2, TEMP_SAVE_2
+    |  lw RD, TEMP_SAVE_1
+    |  lw TMP2, TEMP_SAVE_2
+    if (vk) {
+      |  beqz CRET1, >4
+      |.  nop
+      |  move TMP2, r0
+      |4:
+      |  addu PC, PC, TMP2
+      |1:
+    } else {
+      |  bnez CRET1, >1
+      |.  nop
+      |  move TMP2, r0
+      |1:
+      |  addu PC, PC, TMP2
+    }
+    |.endif
     |  ins_next
     |.if FFI
     |5:
@@ -2588,7 +3188,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  addu PC, PC, TMP2
     } else {
       |  sltiu TMP0, TMP0, LJ_TISTRUECOND
-      |  ldc1 f0, 0(RD)
+      |  load_double1 0(RD)
       if (op == BC_ISTC) {
 	|  beqz TMP0, >1
       } else {
@@ -2598,7 +3198,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |   decode_RD4b TMP2
       |   lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
       |   addu TMP2, TMP2, TMP3
-      |  sdc1 f0, 0(RA)
+      |  store_double1 0(RA)
       |   addu PC, PC, TMP2
       |1:
     }
@@ -2631,9 +3231,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  // RA = dst*8, RD = src*8
     |  addu RD, BASE, RD
     |  addu RA, BASE, RA
-    |  ldc1 f0, 0(RD)
+    |  load_double1 0(RD)
     |  ins_next1
-    |  sdc1 f0, 0(RA)
+    |  store_double1 0(RA)
     |  ins_next2
     break;
   case BC_NOT:
@@ -2653,12 +3253,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  addu CARG3, BASE, RD
     |   addu RA, BASE, RA
     |  lw TMP0, HI(CARG3)
-    |   ldc1 f0, 0(CARG3)
     |  sltiu AT, TMP0, LJ_TISNUM
+    |   load_double f0, CARG1, CARG2, 0(CARG3)
+    |.if FPU
     |  beqz AT, ->vmeta_unm
     |.  neg.d f0, f0
+    |.else
+    |   lui TMP1, 0x8000
+    |   xor CRET1, TMP1, CARG1
+    |  beqz AT, ->vmeta_unm
+    |.   move CRET2, CARG2
+    |.endif
     |  ins_next1
-    |   sdc1 f0, 0(RA)
+    |   store_double f0, CRET1, CRET2, 0(RA)
     |  ins_next2
     break;
   case BC_LEN:
@@ -2672,10 +3279,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.  li AT, LJ_TTAB
     |  lw CRET1, STR:CARG1->len
     |1:
+    |.if FPU
     |  mtc1 CRET1, f0
     |  cvt.d.w f0, f0
+    |.else
+    |  cvti2d CRET1
+    |.endif
     |  ins_next1
-    |  sdc1 f0, 0(RA)
+    |   store_double f0, CRET1, CRET2, 0(RA)
     |  ins_next2
     |2:
     |  bne TMP0, AT, ->vmeta_len
@@ -2717,72 +3328,142 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   addu CARG3, BASE, RB
     |    addu CARG4, KBASE, RC
     |   lw TMP1, HI(CARG3)
-    |   ldc1 f20, 0(CARG3)
-    |    ldc1 f22, 0(CARG4)
-    |   sltiu AT, TMP1, LJ_TISNUM
+    |  sltiu AT, TMP1, LJ_TISNUM
+    |   load_double f20, CARG1, CARG2, 0(CARG3)
+    |   load_double f22, CARG3, CARG4, 0(CARG4)
+    |.if FPU
+    |   beqz AT, ->vmeta_arith
+    |.else
+    |   beqz AT, ->vmeta_arith_vn
+    |.endif
+    |.   addu RA, BASE, RA
     ||  break;
     ||case 1:
     |   addu CARG4, BASE, RB
     |    addu CARG3, KBASE, RC
     |   lw TMP1, HI(CARG4)
-    |   ldc1 f22, 0(CARG4)
-    |    ldc1 f20, 0(CARG3)
-    |   sltiu AT, TMP1, LJ_TISNUM
+    |  sltiu AT, TMP1, LJ_TISNUM
+    |  load_double f20, CARG1, CARG2, 0(CARG3)
+    |   load_double f22, CARG3, CARG4, 0(CARG4)
+    |.if FPU
+    |   beqz AT, ->vmeta_arith
+    |.else
+    |   beqz AT, ->vmeta_arith_nv
+    |.endif
+    |.   addu RA, BASE, RA
     ||  break;
     ||default:
     |   addu CARG3, BASE, RB
     |    addu CARG4, BASE, RC
     |   lw TMP1, HI(CARG3)
     |    lw TMP2, HI(CARG4)
-    |   ldc1 f20, 0(CARG3)
-    |    ldc1 f22, 0(CARG4)
-    |   sltiu AT, TMP1, LJ_TISNUM
-    |   sltiu TMP0, TMP2, LJ_TISNUM
-    |   and AT, AT, TMP0
+    |  sltiu AT, TMP1, LJ_TISNUM
+    |  sltiu TMP0, TMP2, LJ_TISNUM
+    |  and AT, AT, TMP0
+    |  load_double f20, CARG1, CARG2, 0(CARG3)
+    |   load_double f22, CARG3, CARG4, 0(CARG4)
+    |.if FPU
+    |   beqz AT, ->vmeta_arith
+    |.else
+    |   beqz AT, ->vmeta_arith_vv
+    |.endif
+    |.   addu RA, BASE, RA
     ||  break;
     ||}
-    |  beqz AT, ->vmeta_arith
-    |.  addu RA, BASE, RA
     |.endmacro
     |
+    |.macro ins_arithfallback
+    ||switch (vk) {
+    ||case 0:
+    |   b ->vmeta_arith_vn
+    |.  nop
+    ||  break;
+    ||case 1:
+    |   b ->vmeta_arith_nv
+    |.  nop
+    ||  break;
+    ||default:
+    |   b ->vmeta_arith_vv
+    |.  nop
+    ||  break;
+    ||}
+    |.endmacro
+    |
+    |.if FPU
     |.macro fpmod, a, b, c
     |->BC_MODVN_Z:
-    |  bal ->vm_floor			// floor(b/c)
+    |  bal ->vm_floor     // floor(b/c)
     |.  div.d FARG1, b, c
     |  mul.d a, FRET1, c
-    |  sub.d a, b, a			// b - floor(b/c)*c
+    |  sub.d a, b, a      // b - floor(b/c)*c
     |.endmacro
+    |.else
     |
-    |.macro ins_arith, ins
+    |.macro sfpmod			// Soft-float counterpart of fpmod: first - floor(first/second)*second for the pairs CARG1:CARG2 and CARG3:CARG4.
+    |->BC_MODVN_Z:
+    |  load_got __divdf3
+    |  sw CARG1, TEMP_SAVE_1		// Save both operand pairs, they are reloaded for the later calls.
+    |  sw CARG2, TEMP_SAVE_2
+    |  sw CARG3, TEMP_SAVE_3
+    |  call_extern			// __divdf3 on CARG1:CARG2 and CARG3:CARG4.
+    |.  sw CARG4, TEMP_SAVE_4
+    |  move CARG1, CRET1		// The quotient becomes the argument of vm_floor.
+    |  bal ->vm_floor
+    |.  move CARG2, CRET2
+    |  load_got __muldf3
+    |  move CARG1, CRET1		// Value left in CRET1:CRET2 after vm_floor ...
+    |  move CARG2, CRET2
+    |  lw CARG3, TEMP_SAVE_3		// ... times the saved second operand.
+    |  call_extern
+    |.  lw CARG4, TEMP_SAVE_4
+    |  load_got __subdf3
+    |  lw CARG1, TEMP_SAVE_1		// Saved first operand ...
+    |  lw CARG2, TEMP_SAVE_2
+    |  move CARG3, CRET1		// ... minus the product just computed.
+    |  call_extern
+    |.  move CARG4, CRET2
+    |.endmacro
+    |.endif
+    |
+    |.macro ins_arith, intins, fpins, fpcall	// intins is accepted but not referenced in this macro body. fpins is emitted with FPU, otherwise fpcall (a helper symbol for load_got, or sfpmod).
     |  ins_arithpre
-    |.if "ins" == "fpmod_"
-    |  b ->BC_MODVN_Z			// Avoid 3 copies. It's slow anyway.
+    |.if "fpins" == "fpmod_"
+    |  b ->BC_MODVN_Z     // Avoid 3 copies. It's slow anyway.
     |.  nop
     |.else
-    |  ins f0, f20, f22
+    |.if FPU
+    |  fpins f0, f20, f22
+    |.else
+    |.if "fpcall" == "sfpmod"
+    |  sfpmod
+    |.else
+    |  load_got fpcall
+    |  call_extern
+    |.  nop
+    |.endif
+    |.endif
     |  ins_next1
-    |  sdc1 f0, 0(RA)
+    |  store_double1 0(RA)
     |  ins_next2
     |.endif
     |.endmacro
 
   case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
-    |  ins_arith add.d
+    |  ins_arith addu, add.d, __adddf3
     break;
   case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
-    |  ins_arith sub.d
+    |  ins_arith subu, sub.d, __subdf3
     break;
   case BC_MULVN: case BC_MULNV: case BC_MULVV:
-    |  ins_arith mul.d
+    |  ins_arith mult, mul.d, __muldf3
     break;
   case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
-    |  ins_arith div.d
+    |  ins_arith div, div.d, __divdf3
     break;
   case BC_MODVN:
-    |  ins_arith fpmod
-    break;
+    |  ins_arith modi, fpmod, sfpmod	// NOTE(review): this change drops the break statement that used to follow, so BC_MODVN now falls through into the BC_MODNV/BC_MODVV case below and emits its code as well -- confirm this is intended.
   case BC_MODNV: case BC_MODVV:
-    |  ins_arith fpmod_
+    |  ins_arith modi, fpmod_, sfpmod
     break;
   case BC_POW:
     |  decode_RB8a RB, INS
@@ -2792,18 +3473,23 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   addu CARG4, BASE, RC
     |  lw TMP1, HI(CARG3)
     |   lw TMP2, HI(CARG4)
-    |  ldc1 FARG1, 0(CARG3)
-    |   ldc1 FARG2, 0(CARG4)
     |  sltiu AT, TMP1, LJ_TISNUM
     |  sltiu TMP0, TMP2, LJ_TISNUM
     |  and AT, AT, TMP0
     |  load_got pow
     |  beqz AT, ->vmeta_arith
     |.  addu RA, BASE, RA
+    |  load_farg1 0(CARG3)
+    |   load_farg2 0(CARG4)
     |  call_extern
     |.  nop
     |  ins_next1
+    |.if HFABI
     |  sdc1 FRET1, 0(RA)
+    |.else
+    |  sw CRET1, 0(RA)
+    |  sw CRET2, 4(RA)
+    |.endif
     |  ins_next2
     break;
 
@@ -2826,10 +3512,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  bnez CRET1, ->vmeta_binop
     |.  lw BASE, L->base
     |  addu RB, BASE, MULTRES
-    |  ldc1 f0, 0(RB)
+    |  load_double1 0(RB)
     |   addu RA, BASE, RA
     |  ins_next1
-    |   sdc1 f0, 0(RA)			// Copy result from RB to RA.
+    |   store_double1 0(RA)
     |  ins_next2
     break;
 
@@ -2864,20 +3550,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
   case BC_KSHORT:
     |  // RA = dst*8, RD = int16_literal*8
     |  sra RD, INS, 16
-    |  mtc1 RD, f0
     |  addu RA, BASE, RA
+    |.if FPU
+    |  mtc1 RD, f0
     |  cvt.d.w f0, f0
+    |.else
+    |  cvti2d RD
+    |.endif
     |  ins_next1
-    |  sdc1 f0, 0(RA)
+    |   store_double f0, CRET1, CRET2, 0(RA)
     |  ins_next2
     break;
   case BC_KNUM:
     |  // RA = dst*8, RD = num_const*8
     |  addu RD, KBASE, RD
     |   addu RA, BASE, RA
-    |  ldc1 f0, 0(RD)
+    |  load_double1 0(RD)
     |  ins_next1
-    |   sdc1 f0, 0(RA)
+    |   store_double1 0(RA)
     |  ins_next2
     break;
   case BC_KPRI:
@@ -2913,9 +3603,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  lw UPVAL:RB, LFUNC:RD->uvptr
     |  ins_next1
     |  lw TMP1, UPVAL:RB->v
-    |  ldc1 f0, 0(TMP1)
+    |  load_double1 0(TMP1)
     |  addu RA, BASE, RA
-    |  sdc1 f0, 0(RA)
+    |  store_double1 0(RA)
     |  ins_next2
     break;
   case BC_USETV:
@@ -2924,14 +3614,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |    srl RA, RA, 1
     |   addu RD, BASE, RD
     |    addu RA, RA, LFUNC:RB
-    |   ldc1 f0, 0(RD)
+    |   load_double1 0(RD)
     |  lw UPVAL:RB, LFUNC:RA->uvptr
     |  lbu TMP3, UPVAL:RB->marked
     |   lw CARG2, UPVAL:RB->v
     |  andi TMP3, TMP3, LJ_GC_BLACK	// isblack(uv)
     |  lbu TMP0, UPVAL:RB->closed
     |   lw TMP2, HI(RD)
-    |   sdc1 f0, 0(CARG2)
+    |   store_double1 0(CARG2)
     |  li AT, LJ_GC_BLACK|1
     |  or TMP3, TMP3, TMP0
     |  beq TMP3, AT, >2			// Upvalue is closed and black?
@@ -2991,11 +3681,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   srl RA, RA, 1
     |    addu RD, KBASE, RD
     |   addu RA, RA, LFUNC:RB
-    |    ldc1 f0, 0(RD)
+    |    load_double1 0(RD)
     |  lw UPVAL:RB, LFUNC:RA->uvptr
     |  ins_next1
     |  lw TMP1, UPVAL:RB->v
-    |  sdc1 f0, 0(TMP1)
+    |  store_double1 0(TMP1)
     |  ins_next2
     break;
   case BC_USETP:
@@ -3126,13 +3816,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   lw TMP2, HI(CARG3)
     |    lw TAB:RB, LO(CARG2)
     |  li AT, LJ_TTAB
-    |   ldc1 f0, 0(CARG3)
     |  bne TMP1, AT, ->vmeta_tgetv
     |.  addu RA, BASE, RA
     |  sltiu AT, TMP2, LJ_TISNUM
     |  beqz AT, >5
     |.  li AT, LJ_TSTR
-    |
+    |.if FPU
+    |   ldc1 f0, 0(CARG3)
     |  // Convert number key to integer, check for integerness and range.
     |  cvt.w.d f2, f0
     |   lw TMP0, TAB:RB->asize
@@ -3148,9 +3838,51 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  lw TMP0, HI(TMP2)
     |  beq TMP0, TISNIL, >2
     |.  ldc1 f0, 0(TMP2)
+    |.else
+    |   sw RB, TEMP_SAVE_1
+    |   sw CARG2, TEMP_SAVE_3
+    |  load_got __fixdfsi
+    |   lw CARG1, 0(CARG3)
+    |   lw CARG2, 4(CARG3)
+    |  call_extern			// cvt.w.d f2, f0
+    |.  sw RC, TEMP_SAVE_2
+    |  sw CRET1, TEMP_SAVE_4
+    |  cvti2d CRET1			// cvt.d.w f4, f2
+    |  load_got __ledf2
+    |  lw RC, TEMP_SAVE_2
+    |   addu CARG3, BASE, RC
+    |  lw CARG1, 0(CARG3)
+    |  lw CARG2, 4(CARG3)
+    |   move CARG3, CRET1
+    |   move CARG4, CRET2
+    |  call_extern			// c.eq.d f0, f4
+    |.  nop
+    |  lw CARG3, TEMP_SAVE_3
+    |  lw RC, TEMP_SAVE_2
+    |  lw RB, TEMP_SAVE_1
+    |   lw TMP0, TAB:RB->asize
+    |   lw TMP1, TAB:RB->array
+    |  lw TMP2, TEMP_SAVE_4
+    |  lw CARG2, TEMP_SAVE_3		// Restore old CARG2 and CARG3.
+    |   addu CARG3, BASE, RC
+    |  bnez CRET1, >3
+    |.  sltu AT, TMP2, TMP0
+    |  b >4
+    |.  nop
+    |3:
+    |  move AT, r0
+    |4:
+    |   sll TMP2, TMP2, 3
+    |  beqz AT, ->vmeta_tgetv		// Integer key and in array part?
+    |.  addu TMP2, TMP1, TMP2
+    |  lw TMP0, HI(TMP2)
+    |  lw SFT2, 4(TMP2)
+    |  beq TMP0, TISNIL, >2
+    |.  lw SFT1, 0(TMP2)
+    |.endif
     |1:
     |  ins_next1
-    |   sdc1 f0, 0(RA)
+    |   store_double1 0(RA)
     |  ins_next2
     |
     |2:  // Check for __index if table value is nil.
@@ -3246,10 +3978,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.  addu RC, TMP2, RC
     |  lw TMP1, HI(RC)
     |  beq TMP1, TISNIL, >5
-    |.  ldc1 f0, 0(RC)
+    |.  nop
     |1:
+    |  load_double1 0(RC)
     |  ins_next1
-    |   sdc1 f0, 0(RA)
+    |   store_double1 0(RA)
     |  ins_next2
     |
     |5:  // Check for __index if table value is nil.
@@ -3271,20 +4004,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  addu CARG2, BASE, RB
     |   addu CARG3, BASE, RC
     |    lw TAB:CARG1, LO(CARG2)
+    |   lw TMP0, TAB:CARG1->asize
+    |   lw TMP1, TAB:CARG1->array
+    |.if FPU
     |   ldc1 f0, 0(CARG3)
     |  trunc.w.d f2, f0
-    |   lw TMP0, TAB:CARG1->asize
     |  mfc1 CARG2, f2
-    |   lw TMP1, TAB:CARG1->array
+    |.else
+    |  load_got __fixdfsi		// Soft-float replacement for trunc.w.d/mfc1: convert the key at 0(CARG3) to an integer.
+    |  lw CARG1, 0(CARG3)		// NOTE(review): overwrites TAB:CARG1 loaded above, and TMP0/TMP1 are loaded before this call. BC_TSETR saves CARG1 in TEMP_SAVE_1 around the same call -- confirm ->vmeta_tgetr and the range check below do not need the old values.
+    |  call_extern
+    |.  lw CARG2, 4(CARG3)
+    |  move CARG2, CRET1		// Integer key, as mfc1 CARG2, f2 provides on the FPU path.
+    |.endif
     |  sltu AT, CARG2, TMP0
     |   sll TMP2, CARG2, 3
     |  beqz AT, ->vmeta_tgetr		// In array part?
     |.  addu TMP2, TMP1, TMP2
-    |   ldc1 f0, 0(TMP2)
+    |   load_double1 0(TMP2)
     |->BC_TGETR_Z:
     |   addu RA, BASE, RA
     |  ins_next1
-    |   sdc1 f0, 0(RA)
+    |   store_double1 0(RA)
     |  ins_next2
     break;
 
@@ -3299,13 +4040,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   lw TMP2, HI(CARG3)
     |    lw TAB:RB, LO(CARG2)
     |  li AT, LJ_TTAB
-    |   ldc1 f0, 0(CARG3)
     |  bne TMP1, AT, ->vmeta_tsetv
     |.  addu RA, BASE, RA
     |  sltiu AT, TMP2, LJ_TISNUM
     |  beqz AT, >5
     |.  li AT, LJ_TSTR
-    |
+    |.if FPU
+    |   ldc1 f0, 0(CARG3)
     |  // Convert number key to integer, check for integerness and range.
     |  cvt.w.d f2, f0
     |   lw TMP0, TAB:RB->asize
@@ -3326,6 +4067,52 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   andi AT, TMP3, LJ_GC_BLACK	// isblack(table)
     |   bnez AT, >7
     |.  sdc1 f0, 0(TMP1)
+    |.else
+    |  sw RB, TEMP_SAVE_1
+    |  sw RC, TEMP_SAVE_2
+    |  sw CARG2, TEMP_SAVE_3
+    |  load_got __fixdfsi
+    |  lw CARG1, 0(CARG3)
+    |  call_extern			// cvt.w.d f2, f0
+    |.  lw CARG2, 4(CARG3)
+    |  sw CRET1, TEMP_SAVE_4
+    |  cvti2d CRET1			// cvt.d.w f4, f2
+    |  load_got __ledf2
+    |  lw RC, TEMP_SAVE_2
+    |   addu CARG3, BASE, RC
+    |  lw CARG1, 0(CARG3)
+    |  lw CARG2, 4(CARG3)
+    |  move CARG3, CRET1
+    |  call_extern			// c.eq.d f0, f4
+    |.  move CARG4, CRET2
+    |  lw RC, TEMP_SAVE_2
+    |  lw RB, TEMP_SAVE_1
+    |   lw TMP0, TAB:RB->asize
+    |   lw TMP1, TAB:RB->array
+    |  lw TMP2, TEMP_SAVE_4
+    |  lw CARG2, TEMP_SAVE_3		// Restore old CARG2 and CARG3.
+    |   addu CARG3, BASE, RC
+    |  bnez CRET1, >4			// NaN?
+    |.  sltu AT, TMP2, TMP0
+    |  b >6
+    |.  nop
+    |4:
+    |  move AT, r0
+    |6:
+    |   sll TMP2, TMP2, 3
+    |  beqz AT, ->vmeta_tsetv		// Integer key and in array part?
+    |.  addu TMP1, TMP1, TMP2
+    |   lbu TMP3, TAB:RB->marked
+    |  lw TMP0, HI(TMP1)
+    |   lw SFT1, 0(RA)
+    |  beq TMP0, TISNIL, >3
+    |.  lw SFT2, 4(RA)
+    |1:
+    |   andi AT, TMP3, LJ_GC_BLACK	// isblack(table)
+    |   sw SFT1, 0(TMP1)
+    |   bnez AT, >7
+    |.  sw SFT2, 4(TMP1)
+    |.endif
     |2:
     |  ins_next
     |
@@ -3374,7 +4161,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  sll TMP1, TMP1, 3
     |  subu TMP1, TMP0, TMP1
     |  addu NODE:TMP2, NODE:TMP2, TMP1	// node = tab->node + (idx*32-idx*8)
-    |   ldc1 f20, 0(RA)
+    |   load_double f20, SFT1, SFT2, 0(RA)
     |1:
     |  lw CARG1, offsetof(Node, key)+HI(NODE:TMP2)
     |   lw TMP0, offsetof(Node, key)+LO(NODE:TMP2)
@@ -3388,8 +4175,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.    lw TAB:TMP0, TAB:RB->metatable
     |2:
     |  andi AT, TMP3, LJ_GC_BLACK	// isblack(table)
+    |.if FPU
     |  bnez AT, >7
     |.  sdc1 f20, NODE:TMP2->val
+    |.else
+    |  sw SFT1, NODE:TMP2->val.u32.hi
+    |  bnez AT, >7
+    |.  sw SFT2, NODE:TMP2->val.u32.lo
+    |.endif
     |3:
     |  ins_next
     |
@@ -3417,6 +4210,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  beqz TMP0, ->vmeta_tsets		// 'no __newindex' flag NOT set: check.
     |.  li AT, LJ_TSTR
     |6:
+    |.if not FPU
+    |  sw SFT1, TEMP_SAVE_1
+    |  sw SFT2, TEMP_SAVE_2
+    |.endif
     |  load_got lj_tab_newkey
     |  sw STR:RC, LO(CARG3)
     |  sw AT, HI(CARG3)
@@ -3427,8 +4224,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.  move CARG1, L
     |  // Returns TValue *.
     |  lw BASE, L->base
+    |.if FPU
     |  b <3				// No 2nd write barrier needed.
     |.  sdc1 f20, 0(CRET1)
+    |.else
+    |  lw SFT2, TEMP_SAVE_1
+    |  lw SFT3, TEMP_SAVE_2
+    |  sw SFT2, 0(CRET1)
+    |  b <3
+    |.  sw SFT3, 4(CRET1)
+    |.endif
     |
     |7:  // Possible table write barrier for the value. Skip valiswhite check.
     |  barrierback TAB:RB, TMP3, TMP0, <3
@@ -3453,11 +4258,17 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  lw TMP1, HI(RC)
     |   lbu TMP3, TAB:RB->marked
     |  beq TMP1, TISNIL, >5
-    |.  ldc1 f0, 0(RA)
     |1:
-    |  andi AT, TMP3, LJ_GC_BLACK	// isblack(table)
+    |.  andi AT, TMP3, LJ_GC_BLACK	// isblack(table)
+    |  load_double1 0(RA)
+    |.if FPU
     |  bnez AT, >7
     |.  sdc1 f0, 0(RC)
+    |.else
+    |  sw SFT1, 0(RC)
+    |  bnez AT, >7
+    |.  sw SFT2, 4(RC)
+    |.endif
     |2:
     |  ins_next
     |
@@ -3482,12 +4293,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   decode_RDtoRC8 RC, RD
     |  addu CARG1, BASE, RB
     |   addu CARG3, BASE, RC
-    |    lw TAB:CARG2, LO(CARG1)
+    |.if FPU
     |   ldc1 f0, 0(CARG3)
     |  trunc.w.d f2, f0
+    |  mfc1 CARG3, f2
+    |.else
+    |  load_got __fixdfsi
+    |  sw CARG1, TEMP_SAVE_1
+    |   lw CARG1, 0(CARG3)
+    |  call_extern
+    |.   lw CARG2, 4(CARG3)
+    |  lw CARG1, TEMP_SAVE_1
+    |  move CARG3, CRET1
+    |.endif
+    |    lw TAB:CARG2, LO(CARG1)
     |    lbu TMP3, TAB:CARG2->marked
     |   lw TMP0, TAB:CARG2->asize
-    |  mfc1 CARG3, f2
     |   lw TMP1, TAB:CARG2->array
     |  andi AT, TMP3, LJ_GC_BLACK	// isblack(table)
     |  bnez AT, >7
@@ -3495,12 +4316,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |2:
     |  sltu AT, CARG3, TMP0
     |   sll TMP2, CARG3, 3
+    |.if FPU
     |  beqz AT, ->vmeta_tsetr		// In array part?
     |.  ldc1 f20, 0(RA)
     |   addu CRET1, TMP1, TMP2
     |->BC_TSETR_Z:
+    |.else
+    |   lw TMP0, 0(RA)
+    |   lw TMP3, 4(RA)
+    |   sw TMP0, TEMP_SAVE_1
+    |  beqz AT, ->vmeta_tsetr		// In array part?
+    |.   sw TMP3, TEMP_SAVE_2
+    |  addu CRET1, TMP1, TMP2
+    |->BC_TSETR_Z:
+    |   lw TMP0, TEMP_SAVE_1
+    |   lw TMP3, TEMP_SAVE_2
+    |.endif
     |  ins_next1
-    |   sdc1 f20, 0(CRET1)
+    |   store_double f20, TMP0, TMP3, 0(CRET1)
     |  ins_next2
     |
     |7:  // Possible table write barrier for the value. Skip valiswhite check.
@@ -3529,10 +4362,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   addu TMP1, TMP1, CARG1
     |  andi TMP0, TMP3, LJ_GC_BLACK	// isblack(table)
     |3:  // Copy result slots to table.
-    |   ldc1 f0, 0(RA)
+    |   load_double1 0(RA)
     |    addiu RA, RA, 8
     |  sltu AT, RA, TMP2
-    |   sdc1 f0, 0(TMP1)
+    |   store_double1 0(TMP1)
     |  bnez AT, <3
     |.   addiu TMP1, TMP1, 8
     |  bnez TMP0, >7
@@ -3607,10 +4440,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  beqz NARGS8:RC, >3
     |.  move TMP3, NARGS8:RC
     |2:
-    |   ldc1 f0, 0(RA)
+    |   load_double1 0(RA)
     |    addiu RA, RA, 8
     |  addiu TMP3, TMP3, -8
-    |   sdc1 f0, 0(TMP2)
+    |   store_double1 0(TMP2)
     |  bnez TMP3, <2
     |.   addiu TMP2, TMP2, 8
     |3:
@@ -3647,12 +4480,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   li AT, LJ_TFUNC
     |  lw TMP1, -24+HI(BASE)
     |   lw LFUNC:RB, -24+LO(BASE)
-    |    ldc1 f2, -8(BASE)
-    |    ldc1 f0, -16(BASE)
+    |    load_double1 -8(BASE)
+    |    load_double2 -16(BASE)
     |  sw TMP1, HI(BASE)		// Copy callable.
     |   sw LFUNC:RB, LO(BASE)
-    |    sdc1 f2, 16(BASE)		// Copy control var.
-    |    sdc1 f0, 8(BASE)		// Copy state.
+    |    store_double1 16(BASE)		// Copy control var.
+    |    store_double2 8(BASE)		// Copy state.
     |   addiu BASE, BASE, 8
     |  bne TMP1, AT, ->vmeta_call
     |.  li NARGS8:RC, 16		// Iterators get 2 arguments.
@@ -3676,19 +4509,29 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.  sll TMP3, RC, 3
     |  addu TMP3, TMP1, TMP3
     |  lw TMP2, HI(TMP3)
-    |   ldc1 f0, 0(TMP3)
+    |   load_double1 0(TMP3)
+    |.if FPU
     |    mtc1 RC, f2
+    |.else
+    |    move CARG1, RC
+    |.endif
     |     lhu RD, -4+OFS_RD(PC)
     |  beq TMP2, TISNIL, <1		// Skip holes in array part.
     |.  addiu RC, RC, 1
+    |   store_double1 8(RA)
+    |.if FPU
     |    cvt.d.w f2, f2
+    |.else
+    |   load_got __floatsidf		// Soft-float replacement for cvt.d.w: converts the index copied into CARG1 above.
+    |   call_extern			// NOTE(review): RC and RD are read after this call without the TEMP_SAVE save/restore used at other soft-float call sites -- confirm they survive the call.
+    |.   nop
+    |.endif
     |     lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
-    |   sdc1 f0, 8(RA)
+    |   store_double f2, CRET1, CRET2, 0(RA)
     |     decode_RD4b RD
     |     addu RD, RD, TMP3
     |   sw RC, -8+LO(RA)		// Update control var.
     |     addu PC, PC, RD
-    |    sdc1 f2, 0(RA)
     |3:
     |  ins_next
     |
@@ -3704,17 +4547,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |   subu TMP3, TMP3, RB
     |  addu NODE:TMP3, TMP3, TMP2
     |  lw RB, HI(NODE:TMP3)
-    |  ldc1 f0, 0(NODE:TMP3)
+    |  load_double1 0(NODE:TMP3)
     |     lhu RD, -4+OFS_RD(PC)
     |  beq RB, TISNIL, <6		// Skip holes in hash part.
     |.  addiu RC, RC, 1
+    |.if FPU
     |   ldc1 f2, NODE:TMP3->key
+    |.else
+    |   lw SFT3, NODE:TMP3->key.u32.hi
+    |   lw SFT4, NODE:TMP3->key.u32.lo
+    |.endif
     |     lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
-    |  sdc1 f0, 8(RA)
+    |  store_double1 8(RA)
     |    addu RC, RC, TMP0
     |     decode_RD4b RD
     |     addu RD, RD, TMP3
-    |   sdc1 f2, 0(RA)
+    |   store_double2 0(RA)
     |     addu PC, PC, RD
     |  b <3
     |.  sw RC, -8+LO(RA)		// Update control var.
@@ -3794,9 +4642,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  bnez AT, >7
     |.  addiu MULTRES, TMP1, 8
     |6:
-    |  ldc1 f0, 0(RC)
+    |  load_double1 0(RC)
     |   addiu RC, RC, 8
-    |  sdc1 f0, 0(RA)
+    |  store_double1 0(RA)
     |  sltu AT, RC, TMP3
     |  bnez AT, <6			// More vararg slots?
     |.  addiu RA, RA, 8
@@ -3852,10 +4700,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  beqz RC, >3
     |.  subu BASE, TMP2, TMP0
     |2:
-    |   ldc1 f0, 0(RA)
+    |   load_double1 0(RA)
     |    addiu RA, RA, 8
     |  addiu RC, RC, -8
-    |   sdc1 f0, 0(TMP2)
+    |   store_double1 0(TMP2)
     |  bnez RC, <2
     |.   addiu TMP2, TMP2, 8
     |3:
@@ -3896,14 +4744,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  lw INS, -4(PC)
     |   addiu TMP2, BASE, -8
     if (op == BC_RET1) {
-      |  ldc1 f0, 0(RA)
+      |  load_double1 0(RA)
     }
     |  decode_RB8a RB, INS
     |   decode_RA8a RA, INS
     |  decode_RB8b RB
     |   decode_RA8b RA
     if (op == BC_RET1) {
-      |  sdc1 f0, 0(TMP2)
+      |  store_double1 0(TMP2)
     }
     |   subu BASE, TMP2, RA
     |5:
@@ -3928,6 +4776,45 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
   /* -- Loops and branches ------------------------------------------------ */
 
+    |.macro cmp_res, gt		// Conditionally zero TMP1/TMP2 from the preceding comparison: gt == 1 stands in for the movf pair, anything else for the movt pair.
+    |.if gt == 1
+    |.if FPU
+    |  movf TMP1, r0, 0		// f0>f2: TMP1=0
+    |  movf TMP2, r0, 1		// f2>f0: TMP2=0
+    |.else
+    |  li SFT2, 1			// Soft-float: tests CRET1 as left by the __ledf2 call emitted before each use. NOTE(review): only the exact values 1 and -1 are matched -- confirm __ledf2 returns nothing else nonzero (e.g. for NaN operands).
+    |  bne CRET1, SFT2, >1
+    |.  nop
+    |  b >2
+    |.  move TMP1, r0		// CRET1 == 1: TMP1=0 (done in the delay slot).
+    |1:
+    |  li SFT2, -1
+    |  bne CRET1, SFT2, >2
+    |.  nop
+    |  move TMP2, r0		// CRET1 == -1: TMP2=0
+    |2:
+    |.endif
+    |.else
+    |.if FPU
+    |  movt TMP1, r0, 0		// f0<=f2: TMP1=0
+    |  movt TMP2, r0, 1		// f2<=f0: TMP2=0
+    |.else
+    |  bltz CRET1, >3		// f0<f2: TMP1=0
+    |.  nop
+    |  beqz CRET1, >2		// f0==f2: TMP1=TMP2=0
+    |.  li SFT2, 1
+    |  bne SFT2, CRET1, >4	// f0>f2: TMP2=0
+    |.  nop
+    |  b >4
+    |2:				// Label 2 sits on the delay-slot instruction of the branch above.
+    |.  move TMP2, r0		// Runs either as that delay slot (then jumps to 4) or as the target of >2 (then falls into 3).
+    |3:
+    |  move TMP1, r0
+    |4:
+    |.endif
+    |.endif
+    |.endmacro
+
   case BC_FORL:
     |.if JIT
     |  hotloop
@@ -3946,12 +4833,26 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     vk = (op == BC_IFORL || op == BC_JFORL);
     |  addu RA, BASE, RA
     if (vk) {
+      |.if FPU
       |  ldc1 f0, FORL_IDX*8(RA)
       |  ldc1 f4, FORL_STEP*8(RA)
       |  ldc1 f2, FORL_STOP*8(RA)
       |   lw TMP3, FORL_STEP*8+HI(RA)
       |  add.d f0, f0, f4
       |  sdc1 f0, FORL_IDX*8(RA)
+      |.else
+      |  load_got __adddf3
+      |  load_farg1 FORL_IDX*8(RA)
+      |  load_farg2 FORL_STEP*8(RA)
+      |  call_extern
+      |.  sw RD, TEMP_SAVE_1  //save RD
+      |  sw CRET1, FORL_IDX*8(RA)
+      |  sw CRET2, FORL_IDX*8+4(RA)
+      |  load_farg1 FORL_IDX*8(RA)
+      |  load_farg2 FORL_STOP*8(RA)		// f0 and f2
+      |  lw TMP3, FORL_STEP*8+HI(RA)
+      |  lw RD, TEMP_SAVE_1
+      |.endif
     } else {
       |  lw TMP1, FORL_IDX*8+HI(RA)
       |  lw TMP3, FORL_STEP*8+HI(RA)
@@ -3961,25 +4862,41 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  sltiu TMP2, TMP2, LJ_TISNUM
       |  and TMP1, TMP1, TMP0
       |  and TMP1, TMP1, TMP2
+      |.if FPU
       |   ldc1 f0, FORL_IDX*8(RA)
       |  beqz TMP1, ->vmeta_for
       |.  ldc1 f2, FORL_STOP*8(RA)
+      |.else
+      |  beqz TMP1, ->vmeta_for
+      |  load_farg1 FORL_IDX*8(RA)
+      |  load_farg2 FORL_STOP*8(RA)
+      |.endif
     }
     if (op != BC_JFORL) {
       |  srl RD, RD, 1
       |  lui TMP0, (-(BCBIAS_J*4 >> 16) & 65535)
     }
+    |  store_double f0, CARG1, CARG2, FORL_EXT*8(RA)
+    |.if FPU
     |  c.le.d 0, f0, f2
     |  c.le.d 1, f2, f0
-    |  sdc1 f0, FORL_EXT*8(RA)
+    |.else
+    |  sw RD, TEMP_SAVE_1
+    |  load_got __ledf2				// f0<=f2
+    |  call_extern
+    |.  sw TMP0, TEMP_SAVE_2
+    |  lw TMP0, TEMP_SAVE_2
+    |  lw RD, TEMP_SAVE_1
+    |  lw TMP3, FORL_STEP*8+HI(RA)		// Restored step.
+    |.endif
+    |
     if (op == BC_JFORI) {
       |  li TMP1, 1
       |  li TMP2, 1
       |   addu TMP0, RD, TMP0
       |  slt TMP3, TMP3, r0
-      |  movf TMP1, r0, 0
+      |  cmp_res 1
       |   addu PC, PC, TMP0
-      |  movf TMP2, r0, 1
       |   lhu RD, -4+OFS_RD(PC)
       |  movn TMP1, TMP2, TMP3
       |  bnez TMP1, =>BC_JLOOP
@@ -3988,8 +4905,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  li TMP1, 1
       |  li TMP2, 1
       |  slt TMP3, TMP3, r0
-      |  movf TMP1, r0, 0
-      |  movf TMP2, r0, 1
+      |  cmp_res 1
       |  movn TMP1, TMP2, TMP3
       |  bnez TMP1, =>BC_JLOOP
       |.  nop
@@ -3998,11 +4914,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
       |  slt TMP3, TMP3, r0
       |  move TMP2, TMP1
       if (op == BC_FORI) {
-	|  movt TMP1, r0, 0
-	|  movt TMP2, r0, 1
+	| cmp_res 0
       } else {
-	|  movf TMP1, r0, 0
-	|  movf TMP2, r0, 1
+	| cmp_res 1
       }
       |  movn TMP1, TMP2, TMP3
       |  addu PC, PC, TMP1
@@ -4256,8 +5170,10 @@ static void emit_asm_debug(BuildCtx *ctx)
 	fcofs, CFRAME_SIZE);
     for (i = 23; i >= 16; i--)
       fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 26-i);
+#if !LJ_SOFTFP
     for (i = 30; i >= 20; i -= 2)
       fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 42-i);
+#endif
     fprintf(ctx->fp,
 	"\t.align 2\n"
 	".LEFDE0:\n\n");
@@ -4275,6 +5191,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.align 2\n"
 	".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
 #endif
+#if !LJ_NO_UNWIND
     fprintf(ctx->fp, "\t.section .eh_frame,\"aw\",@progbits\n");
     fprintf(ctx->fp,
 	"\t.globl lj_err_unwind_dwarf\n"
@@ -4342,6 +5259,7 @@ static void emit_asm_debug(BuildCtx *ctx)
 	"\t.byte 0xd\n\t.uleb128 0x10\n"
 	"\t.align 2\n"
 	".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
+#endif
 #endif
     break;
   default:
diff --git a/lib/luajit/src/vm_x64.dasc b/lib/luajit/src/vm_x64.dasc
index e7e990ae27..bba89aaf1b 100644
--- a/lib/luajit/src/vm_x64.dasc
+++ b/lib/luajit/src/vm_x64.dasc
@@ -531,7 +531,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  jmp >2
   |
   |->vm_growstack_v:			// Grow stack for vararg Lua function.
-  |  sub RD, 8
+  |  sub RD, 16				// LJ_FR2
   |  jmp >1
   |
   |->vm_growstack_f:			// Grow stack for fixarg Lua function.
diff --git a/src/core/lib.c b/src/core/lib.c
index e1703f71ee..84951da3bf 100644
--- a/src/core/lib.c
+++ b/src/core/lib.c
@@ -77,3 +77,15 @@ void nop()
 {
 }
 
+/* Byte-swap a uint64_t: return b with the order of its eight bytes reversed. */
+uint64_t bswap64 (uint64_t b)
+{
+  return ((((uint64_t) b & (uint64_t) 0x00000000000000ff) << 56) |
+          (((uint64_t) b & (uint64_t) 0x000000000000ff00) << 40) |
+          (((uint64_t) b & (uint64_t) 0x0000000000ff0000) << 24) |
+          (((uint64_t) b & (uint64_t) 0x00000000ff000000) <<  8) |
+          (((uint64_t) b & (uint64_t) 0x000000ff00000000) >>  8) |
+          (((uint64_t) b & (uint64_t) 0x0000ff0000000000) >> 24) |
+          (((uint64_t) b & (uint64_t) 0x00ff000000000000) >> 40) |
+          (((uint64_t) b & (uint64_t) 0xff00000000000000) >> 56));
+}
diff --git a/src/core/lib.h b/src/core/lib.h
index 013bdcfecc..616d12a484 100644
--- a/src/core/lib.h
+++ b/src/core/lib.h
@@ -6,3 +6,4 @@ void full_memory_barrier();
 void prefetch_for_read(const void *address);
 void prefetch_for_write(const void *address);
 unsigned int stat_mtime(const char *path);
+uint64_t bswap64 (uint64_t b);
diff --git a/src/core/lib.lua b/src/core/lib.lua
index eccd097dbc..8ecf6f9df8 100644
--- a/src/core/lib.lua
+++ b/src/core/lib.lua
@@ -351,15 +351,19 @@ end
 -- avoid C function call overhead while using C.xxxx counterparts
 if ffi.abi("be") then
    -- nothing to do
+   function htonll(b) return b end
    function htonl(b) return b end
    function htons(b) return b end
 else
+   function htonll(b) return C.bswap64(b) end -- 64-bit swap calls the C helper bswap64, unlike htonl/htons below which use bswap
    function htonl(b) return bswap(b) end
    function htons(b) return rshift(bswap(b), 16) end
 end
+ntohll = htonll
 ntohl = htonl
 ntohs = htons
 
+
 -- Manipulation of bit fields in uint{8,16,32)_t stored in network
 -- byte order.  Using bit fields in C structs is compiler-dependent
 -- and a little awkward for handling endianness and fields that cross
diff --git a/src/dasm.lua b/src/dasm.lua
index 448eab9cf9..acf8587e5f 100644
--- a/src/dasm.lua
+++ b/src/dasm.lua
@@ -1,5 +1,5 @@
 
---binding to the DynASM encoding engine.
+--Binding to the DynASM encoding engine.
 --Written by Cosmin Apreutesei. Public Domain.
 
 local ffi = require'ffi'
diff --git a/src/dasm_proto.h b/src/dasm_proto.h
index 93ca06533c..d3ba39ab1e 100644
--- a/src/dasm_proto.h
+++ b/src/dasm_proto.h
@@ -13,6 +13,8 @@
 #define DASM_IDENT	"DynASM 1.4.0"
 #define DASM_VERSION	10400	/* 1.4.0 */
 
+#undef DASM_CHECKS
+
 #ifndef Dst_DECL
 #define Dst_DECL	dasm_State **Dst
 #endif
@@ -76,7 +78,8 @@ DASM_FDEF int dasm_getpclabel(Dst_DECL, unsigned int pc);
 /* Optional sanity checker to call between isolated encoding steps. */
 DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch);
 #else
-#define dasm_checkstep(a, b)	0
+/*#define dasm_checkstep(a, b)	0*/
+DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch) {return 0;}
 #endif
 
 
diff --git a/src/dasm_x64.lua b/src/dasm_x64.lua
index 24efbae866..c22ddcfda8 100644
--- a/src/dasm_x64.lua
+++ b/src/dasm_x64.lua
@@ -9,10 +9,11 @@
 ------------------------------------------------------------------------------
 
 --unload dasm_x86 if it's already loaded.
+if not package then package = {loaded = {}} end --for compat. with minilua
 local dasm_x86 = package.loaded.dasm_x86
 package.loaded.dasm_x86 = nil
 
-rawset(_G, 'x64', true) -- Using a global is an ugly, but effective solution.
+x64 = true -- Using a global is an ugly, but effective solution.
 local dasm_x64 = require("dasm_x86")
 
 package.loaded.dasm_x86 = dasm_x86 --put it back
diff --git a/src/dasm_x86.c b/src/dasm_x86.c
index 85376ca7ca..59c3bb63c0 100644
--- a/src/dasm_x86.c
+++ b/src/dasm_x86.c
@@ -1,4 +1,10 @@
-#define DASM_CHECKS
+/*
+  Encoding engine to use with dasm.lua.
+
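+  Compile with (NOTE(review): dasm_proto.h, included below, now #undefs DASM_CHECKS, which may cancel the -D flag shown here; confirm):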
+
+    gcc dasm_x86.c -DDASM_CHECKS -shared -s -o dasm_x86.so
+*/
 
 #include "dasm_extern.h"
 #include "dasm_proto.h"
diff --git a/src/dasm_x86.h b/src/dasm_x86.h
index 175febe0ca..be9c289f02 100644
--- a/src/dasm_x86.h
+++ b/src/dasm_x86.h
@@ -170,7 +170,7 @@ void dasm_put(Dst_DECL, int start, ...)
   dasm_State *D = Dst_REF;
   dasm_ActList p = D->actionlist + start;
   dasm_Section *sec = D->section;
-  int pos = sec->pos, ofs = sec->ofs, mrm = 4;
+  int pos = sec->pos, ofs = sec->ofs, mrm = -1;
   int *b;
 
   if (pos >= sec->epos) {
@@ -193,7 +193,7 @@ void dasm_put(Dst_DECL, int start, ...)
       b[pos++] = n;
       switch (action) {
       case DASM_DISP:
-	if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; }
+	if (n == 0) { if (mrm < 0) mrm = p[-2]; if ((mrm&7) != 5) break; }
       case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob;
       case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */
       case DASM_IMM_D: ofs += 4; break;
@@ -203,10 +203,17 @@ void dasm_put(Dst_DECL, int start, ...)
       case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break;
       case DASM_SPACE: p++; ofs += n; break;
       case DASM_SETLABEL: b[pos-2] = -0x40000000; break;  /* Neg. label ofs. */
-      case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG);
-	if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue;
+      case DASM_VREG: CK((n&-16) == 0 && (n != 4 || (*p>>5) != 2), RANGE_VREG);
+	if (*p < 0x40 && p[1] == DASM_DISP) mrm = n;
+	if (*p < 0x20 && (n&7) == 4) ofs++;
+	switch ((*p++ >> 3) & 3) {
+	case 3: n |= b[pos-3];
+	case 2: n |= b[pos-2];
+	case 1: if (n <= 7) { b[pos-1] |= 0x10; ofs--; }
+	}
+	continue;
       }
-      mrm = 4;
+      mrm = -1;
     } else {
       int *pl, n;
       switch (action) {
@@ -393,7 +400,22 @@ int dasm_encode(Dst_DECL, void *buffer)
 	case DASM_IMM_W: dasmw(n); break;
 	case DASM_VREG: {
 	  int t = *p++;
-	  if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3;
+	  unsigned char *ex = cp - (t&7);
+	  if ((n & 8) && t < 0xa0) {
+	    if (*ex & 0x80) ex[1] ^= 0x20 << (t>>6); else *ex ^= 1 << (t>>6);
+	    n &= 7;
+	  } else if (n & 0x10) {
+	    if (*ex & 0x80) {
+	      *ex = 0xc5; ex[1] = (ex[1] & 0x80) | ex[2]; ex += 2;
+	    }
+	    while (++ex < cp) ex[-1] = *ex;
+	    if (mark) mark--;
+	    cp--;
+	    n &= 7;
+	  }
+	  if (t >= 0xc0) n <<= 4;
+	  else if (t >= 0x40) n <<= 3;
+	  else if (n == 4 && t < 0x20) { cp[-1] ^= n; *cp++ = 0x20; }
 	  cp[-1] ^= n;
 	  break;
 	}
diff --git a/src/dasm_x86.lua b/src/dasm_x86.lua
index e7563d477f..0c11f020ec 100644
--- a/src/dasm_x86.lua
+++ b/src/dasm_x86.lua
@@ -44,7 +44,7 @@ local action_names = {
   -- int arg, 1 buffer pos:
   "DISP",  "IMM_S", "IMM_B", "IMM_W", "IMM_D",  "IMM_WB", "IMM_DB",
   -- action arg (1 byte), int arg, 1 buffer pos (reg/num):
-  "VREG", "SPACE", -- !x64: VREG support NYI.
+  "VREG", "SPACE",
   -- ptrdiff_t arg, 1 buffer pos (address): !x64
   "SETLABEL", "REL_A",
   -- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
@@ -92,6 +92,21 @@ local function init_actionlist()
   secpos = 1
 end
 
+-- VREG kind encodings, pre-shifted by 5 bits.
+local map_vreg = {
+  ["modrm.rm.m"] = 0x00,
+  ["modrm.rm.r"] = 0x20,
+  ["opcode"] =     0x20,
+  ["sib.base"] =   0x20,
+  ["sib.index"] =  0x40,
+  ["modrm.reg"] =  0x80,
+  ["vex.v"] =      0xa0,
+  ["imm.hi"] =     0xc0,
+}
+
+-- Current number of VREG actions contributing to REX/VEX shrinkage.
+local vreg_shrink_count = 0
+
 ------------------------------------------------------------------------------
 
 -- Compute action numbers for action names.
@@ -151,6 +166,21 @@ local function waction(action, a, num)
   if a or num then secpos = secpos + (num or 1) end
 end
 
+-- Emit a VREG action for `vreg` (no-op if nil), followed by one byte: kind code + shrink count*8 + psz.
+local function wvreg(kind, vreg, psz, sk, defer)
+  if not vreg then return end
+  waction("VREG", vreg)
+  local b = assert(map_vreg[kind], "bad vreg kind `"..tostring(kind).."'") -- report the bad kind, not the register
+  if b < (sk or 0) then -- kinds below the threshold `sk` count towards REX/VEX shrinkage
+    vreg_shrink_count = vreg_shrink_count + 1
+  end
+  if not defer then -- last VREG of the group: fold the pending count into the byte and reset it
+    b = b + vreg_shrink_count * 8
+    vreg_shrink_count = 0
+  end
+  wputxb(b + (psz or 0))
+end
+
 -- Add call to embedded DynASM C code.
 local function wcall(func, args)
   if luamode then
@@ -390,6 +420,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
 mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
 map_reg_valid_index[map_archdef.esp] = false
 if x64 then map_reg_valid_index[map_archdef.rsp] = false end
+if x64 then map_reg_needrex[map_archdef.Rb] = true end
 map_archdef["Ra"] = "@"..addrsize
 
 -- FP registers (internally tword sized, but use "f" as operand size).
@@ -527,16 +558,24 @@ local function wputszarg(sz, n)
 end
 
 -- Put multi-byte opcode with operand-size dependent modifications.
-local function wputop(sz, op, rex, vex)
+local function wputop(sz, op, rex, vex, vregr, vregxb)
+  local psz, sk = 0, nil
   if vex then
     local tail
     if vex.m == 1 and band(rex, 11) == 0 then
-      wputb(0xc5)
+      if x64 and vregxb then
+	sk = map_vreg["modrm.reg"]
+      else
+	wputb(0xc5)
       tail = shl(bxor(band(rex, 4), 4), 5)
-    else
+      psz = 3
+      end
+    end
+    if not tail then
       wputb(0xc4)
       wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
       tail = shl(band(rex, 8), 4)
+      psz = 4
     end
     local reg, vreg = 0, nil
     if vex.v then
@@ -546,12 +585,18 @@ local function wputop(sz, op, rex, vex)
     end
     if sz == "y" or vex.l then tail = tail + 4 end
     wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
-    if vreg then waction("VREG", vreg); wputxb(4) end
+    wvreg("vex.v", vreg)
     rex = 0
     if op >= 256 then werror("bad vex opcode") end
+  else
+    if rex ~= 0 then
+      if not x64 then werror("bad operand size") end
+    elseif (vregr or vregxb) and x64 then
+      rex = 0x10
+      sk = map_vreg["vex.v"]
+    end
   end
   local r
-  if rex ~= 0 and not x64 then werror("bad operand size") end
   if sz == "w" then wputb(102) end
   -- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
   if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
@@ -560,20 +605,20 @@ local function wputop(sz, op, rex, vex)
     if rex ~= 0 then
       local opc3 = band(op, 0xffff00)
       if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
-	wputb(64 + band(rex, 15)); rex = 0
+	wputb(64 + band(rex, 15)); rex = 0; psz = 2
       end
     end
-    wputb(shr(op, 16)); op = band(op, 0xffff)
+    wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
   end
   if op >= 256 then
     local b = shr(op, 8)
-    if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end
-    wputb(b)
-    op = band(op, 255)
+    if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
+    wputb(b); op = band(op, 255); psz = psz + 1
   end
-  if rex ~= 0 then wputb(64 + band(rex, 15)) end
+  if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
   if sz == "b" then op = op - 1 end
   wputb(op)
+  return psz, sk
 end
 
 -- Put ModRM or SIB formatted byte.
@@ -583,7 +628,7 @@ local function wputmodrm(m, s, rm, vs, vrm)
 end
 
 -- Put ModRM/SIB plus optional displacement.
-local function wputmrmsib(t, imark, s, vsreg)
+local function wputmrmsib(t, imark, s, vsreg, psz, sk)
   local vreg, vxreg
   local reg, xreg = t.reg, t.xreg
   if reg and reg < 0 then reg = 0; vreg = t.vreg end
@@ -593,8 +638,8 @@ local function wputmrmsib(t, imark, s, vsreg)
   -- Register mode.
   if sub(t.mode, 1, 1) == "r" then
     wputmodrm(3, s, reg)
-    if vsreg then waction("VREG", vsreg); wputxb(2) end
-    if vreg then waction("VREG", vreg); wputxb(0) end
+    wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+    wvreg("modrm.rm.r", vreg, psz+1, sk)
     return
   end
 
@@ -608,21 +653,22 @@ local function wputmrmsib(t, imark, s, vsreg)
       -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
       wputmodrm(0, s, 4)
       if imark == "I" then waction("MARK") end
-      if vsreg then waction("VREG", vsreg); wputxb(2) end
+      wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
       wputmodrm(t.xsc, xreg, 5)
-      if vxreg then waction("VREG", vxreg); wputxb(3) end
+      wvreg("sib.index", vxreg, psz+2, sk)
     else
       -- Pure 32 bit displacement.
       if x64 and tdisp ~= "table" then
 	wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
+	wvreg("modrm.reg", vsreg, psz+1, sk)
 	if imark == "I" then waction("MARK") end
 	wputmodrm(0, 4, 5)
       else
 	riprel = x64
 	wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
+	wvreg("modrm.reg", vsreg, psz+1, sk)
 	if imark == "I" then waction("MARK") end
       end
-      if vsreg then waction("VREG", vsreg); wputxb(2) end
     end
     if riprel then -- Emit rip-relative displacement.
       if match("UWSiI", imark) then
@@ -650,16 +696,16 @@ local function wputmrmsib(t, imark, s, vsreg)
   if xreg or band(reg, 7) == 4 then
     wputmodrm(m or 2, s, 4) -- ModRM.
     if m == nil or imark == "I" then waction("MARK") end
-    if vsreg then waction("VREG", vsreg); wputxb(2) end
+    wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
     wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
-    if vxreg then waction("VREG", vxreg); wputxb(3) end
-    if vreg then waction("VREG", vreg); wputxb(1) end
+    wvreg("sib.index", vxreg, psz+2, sk, vreg)
+    wvreg("sib.base", vreg, psz+2, sk)
   else
     wputmodrm(m or 2, s, reg) -- ModRM.
     if (imark == "I" and (m == 1 or m == 2)) or
        (m == nil and (vsreg or vreg)) then waction("MARK") end
-    if vsreg then waction("VREG", vsreg); wputxb(2) end
-    if vreg then waction("VREG", vreg); wputxb(1) end
+    wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+    wvreg("modrm.rm.m", vreg, psz+1, sk)
   end
 
   -- Put displacement.
@@ -1184,7 +1230,7 @@ local map_op = {
   shrd_3 =	"mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:",
 
   rdtsc_0 =	"0F31", -- P1+
-  rdpmc_0 =	"0F33",
+  rdpmc_0 =	"0F33", -- P6+
   cpuid_0 =	"0FA2", -- P1+
 
   -- floating point ops
@@ -1327,46 +1373,14 @@ local map_op = {
   movups_2 =	"rmo:0F10rM|mro:0F11Rm",
   orpd_2 =	"rmo:660F56rM",
   orps_2 =	"rmo:0F56rM",
-  packssdw_2 =	"rmo:660F6BrM",
-  packsswb_2 =	"rmo:660F63rM",
-  packuswb_2 =	"rmo:660F67rM",
-  paddb_2 =	"rmo:660FFCrM",
-  paddd_2 =	"rmo:660FFErM",
-  paddq_2 =	"rmo:660FD4rM",
-  paddsb_2 =	"rmo:660FECrM",
-  paddsw_2 =	"rmo:660FEDrM",
-  paddusb_2 =	"rmo:660FDCrM",
-  paddusw_2 =	"rmo:660FDDrM",
-  paddw_2 =	"rmo:660FFDrM",
-  pand_2 =	"rmo:660FDBrM",
-  pandn_2 =	"rmo:660FDFrM",
   pause_0 =	"F390",
-  pavgb_2 =	"rmo:660FE0rM",
-  pavgw_2 =	"rmo:660FE3rM",
-  pcmpeqb_2 =	"rmo:660F74rM",
-  pcmpeqd_2 =	"rmo:660F76rM",
-  pcmpeqw_2 =	"rmo:660F75rM",
-  pcmpgtb_2 =	"rmo:660F64rM",
-  pcmpgtd_2 =	"rmo:660F66rM",
-  pcmpgtw_2 =	"rmo:660F65rM",
   pextrw_3 =	"rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
   pinsrw_3 =	"rri/od:660FC4rMU|rxi/ow:",
-  pmaddwd_2 =	"rmo:660FF5rM",
-  pmaxsw_2 =	"rmo:660FEErM",
-  pmaxub_2 =	"rmo:660FDErM",
-  pminsw_2 =	"rmo:660FEArM",
-  pminub_2 =	"rmo:660FDArM",
   pmovmskb_2 =	"rr/do:660FD7rM",
-  pmulhuw_2 =	"rmo:660FE4rM",
-  pmulhw_2 =	"rmo:660FE5rM",
-  pmullw_2 =	"rmo:660FD5rM",
-  pmuludq_2 =	"rmo:660FF4rM",
-  por_2 =	"rmo:660FEBrM",
   prefetchnta_1 = "xb:n0F180m",
   prefetcht0_1 = "xb:n0F181m",
   prefetcht1_1 = "xb:n0F182m",
   prefetcht2_1 = "xb:n0F183m",
-  psadbw_2 =	"rmo:660FF6rM",
   pshufd_3 =	"rmio:660F70rMU",
   pshufhw_3 =	"rmio:F30F70rMU",
   pshuflw_3 =	"rmio:F20F70rMU",
@@ -1380,23 +1394,6 @@ local map_op = {
   psrldq_2 =	"rio:660F733mU",
   psrlq_2 =	"rmo:660FD3rM|rio:660F732mU",
   psrlw_2 =	"rmo:660FD1rM|rio:660F712mU",
-  psubb_2 =	"rmo:660FF8rM",
-  psubd_2 =	"rmo:660FFArM",
-  psubq_2 =	"rmo:660FFBrM",
-  psubsb_2 =	"rmo:660FE8rM",
-  psubsw_2 =	"rmo:660FE9rM",
-  psubusb_2 =	"rmo:660FD8rM",
-  psubusw_2 =	"rmo:660FD9rM",
-  psubw_2 =	"rmo:660FF9rM",
-  punpckhbw_2 =	"rmo:660F68rM",
-  punpckhdq_2 =	"rmo:660F6ArM",
-  punpckhqdq_2 = "rmo:660F6DrM",
-  punpckhwd_2 =	"rmo:660F69rM",
-  punpcklbw_2 =	"rmo:660F60rM",
-  punpckldq_2 =	"rmo:660F62rM",
-  punpcklqdq_2 = "rmo:660F6CrM",
-  punpcklwd_2 =	"rmo:660F61rM",
-  pxor_2 =	"rmo:660FEFrM",
   rcpps_2 =	"rmo:0F53rM",
   rcpss_2 =	"rro:F30F53rM|rx/od:",
   rsqrtps_2 =	"rmo:0F52rM",
@@ -1640,6 +1637,12 @@ local map_op = {
 
   -- AVX, AVX2 integer ops
   -- In general, xmm requires AVX, ymm requires AVX2.
+  vaesdec_3 =  "rrmo:660F38VDErM",
+  vaesdeclast_3 = "rrmo:660F38VDFrM",
+  vaesenc_3 =  "rrmo:660F38VDCrM",
+  vaesenclast_3 = "rrmo:660F38VDDrM",
+  vaesimc_2 =  "rmo:660F38uDBrM",
+  vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
   vlddqu_2 =	"rxoy:F20FuF0rM",
   vmaskmovdqu_2 = "rro:660FuF7rM",
   vmovdqa_2 =	"rmoy:660Fu6FrM|mroy:660Fu7FRm",
@@ -1880,10 +1883,11 @@ local function dopattern(pat, args, sz, op, needrex)
       if t.xreg and t.xreg > 7 then rex = rex + 2 end
       if s > 7 then rex = rex + 4 end
       if needrex then rex = rex + 16 end
-      wputop(szov, opcode, rex, vex); opcode = nil
+      local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
+      opcode = nil
       local imark = sub(pat, -1) -- Force a mark (ugly).
       -- Put ModRM/SIB with regno/last digit as spare.
-      wputmrmsib(t, imark, s, addin and addin.vreg)
+      wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
       addin = nil
     elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
       local b = band(opcode, 255); opcode = shr(opcode, 8)
@@ -1910,8 +1914,8 @@ local function dopattern(pat, args, sz, op, needrex)
 	if szov == "q" and rex == 0 then rex = rex + 8 end
 	if needrex then rex = rex + 16 end
 	if addin and addin.reg == -1 then
-	  wputop(szov, opcode - 7, rex, vex)
-	  waction("VREG", addin.vreg); wputxb(0)
+	  local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
+	  wvreg("opcode", addin.vreg, psz, sk)
 	else
 	  if addin and addin.reg > 7 then rex = rex + 1 end
 	  wputop(szov, opcode, rex, vex)
@@ -1955,7 +1959,7 @@ local function dopattern(pat, args, sz, op, needrex)
 	  local reg = a.reg
 	  if reg < 0 then
 	    wputb(0)
-	    waction("VREG", a.vreg); wputxb(5)
+	    wvreg("imm.hi", a.vreg)
 	  else
 	    wputb(shl(reg, 4))
 	  end
@@ -2107,8 +2111,8 @@ if x64 then
 	rex = a.reg > 7 and 9 or 8
       end
     end
-    wputop(sz, opcode, rex)
-    if vreg then waction("VREG", vreg); wputxb(0) end
+    local psz, sk = wputop(sz, opcode, rex, nil, vreg)
+    wvreg("opcode", vreg, psz, sk)
     if luamode then
       waction("IMM_D", format("ffi.cast(\"uintptr_t\", %s) %% 2^32", op64))
       waction("IMM_D", format("ffi.cast(\"uintptr_t\", %s) / 2^32", op64))
diff --git a/src/dynasm.lua b/src/dynasm.lua
index 10d93c0f8f..586e2a13dd 100644
--- a/src/dynasm.lua
+++ b/src/dynasm.lua
@@ -1141,14 +1141,13 @@ local function setlang(infile)
       g_opt.comment = "--|"
       g_opt.endcomment = ""
     end
+    -- Set initial defines only available in Lua mode.
+    local ffi = require("ffi")
+    map_def.ARCH = ffi.arch          --for `.arch ARCH`
+    map_def[upper(ffi.arch)] = 1     --for `.if X86 ...`
+    map_def.OS = ffi.os              --for `.if OS == 'Windows'`
+    map_def[upper(ffi.os)] = 1       --for `.if WINDOWS ...`
   end
-
-  -- Set initial defines only available in Lua mode.
-  local ffi = require'ffi'
-  map_def.ARCH = ffi.arch          --for `.arch ARCH`
-  map_def[upper(ffi.arch)] = 1     --for `.if X86 ...`
-  map_def.OS = ffi.os              --for `.if OS == 'Windows'`
-  map_def[upper(ffi.os)] = 1       --for `.if WINDOWS ...`
 end
 
 -- Parse arguments.
diff --git a/src/lib/ipsec/.images/esp.png b/src/lib/ipsec/.images/esp.png
new file mode 100644
index 0000000000..09c165442b
Binary files /dev/null and b/src/lib/ipsec/.images/esp.png differ
diff --git a/src/lib/ipsec/README.md b/src/lib/ipsec/README.md
new file mode 100644
index 0000000000..a4a3f6eac0
--- /dev/null
+++ b/src/lib/ipsec/README.md
@@ -0,0 +1,45 @@
+### IPsec/ESP (lib.ipsec.esp)
+
+The `lib.ipsec.esp` module contains two classes `esp_v6_encrypt` and
+`esp_v6_decrypt` which implement packet encryption and
+decryption with IPsec ESP using the AES-GCM-128 cipher in IPv6 transport
+mode. Packets are encrypted with the key and salt provided to the classes'
+constructors. These classes do not implement any key exchange protocol.
+
+The encrypt class accepts IPv6 packets and inserts a new [ESP
+header](https://en.wikipedia.org/wiki/IPsec#Encapsulating_Security_Payload)
+between the outer IPv6 header and the inner protocol header (e.g. TCP,
+UDP, L2TPv3) and also encrypts the contents of the inner protocol
+header. The decrypt class does the reverse: it decrypts the inner
+protocol header and removes the ESP protocol header.
+
+References:
+
+- [IPsec Wikipedia page](https://en.wikipedia.org/wiki/IPsec).
+- [RFC 4106](https://tools.ietf.org/html/rfc4106) on using AES-GCM with IPsec ESP.
+- [LISP Data-Plane Confidentiality](https://tools.ietf.org/html/draft-ietf-lisp-crypto-02) example of a software layer above these apps that includes key exchange.
+
+— Method **esp_v6_encrypt:new** *config*
+
+— Method **esp_v6_decrypt:new** *config*
+
+Returns a new encryption/decryption context respectively. *Config* must
+be a table with the following keys:
+
+* `mode` - Encryption mode (string). The only accepted value is the
+  string `"aes-128-gcm"`.
+* `keymat` - Hex string containing 16 bytes of key material as specified
+  in RFC 4106.
+* `salt` - Hex string containing four bytes of salt as specified in
+  RFC 4106.
+
+— Method **esp_v6_encrypt:encapsulate** *packet*
+
+Returns a freshly allocated packet that is the encrypted and encapsulated
+version of *packet*.
+
+— Method **esp_v6_decrypt:decapsulate** *packet*
+
+Returns a freshly allocated packet that is the decrypted and decapsulated
+version of *packet* or `nil` if authentication failed. The contents of
+*packet* are destroyed in the process.
diff --git a/src/lib/ipsec/aes_128_gcm.lua b/src/lib/ipsec/aes_128_gcm.lua
new file mode 100644
index 0000000000..e9677c4f04
--- /dev/null
+++ b/src/lib/ipsec/aes_128_gcm.lua
@@ -0,0 +1,104 @@
+module(..., package.seeall)
+local ffi = require("ffi")
+local C = ffi.C
+local ASM = require("lib.ipsec.aes_128_gcm_avx")
+local header = require("lib.protocol.header")
+local lib = require("core.lib")
+local ntohl, htonl, htonll = lib.ntohl, lib.htonl, lib.htonll
+
+
+-- IV pseudo header
+
+local iv = subClass(header)
+
+-- Class variables
+iv._name = "iv"
+iv:init(
+   {
+      [1] = ffi.typeof[[
+            struct {
+               uint8_t salt[4];
+               uint64_t iv;
+               uint32_t padding;
+            } __attribute__((packed, aligned(16)))
+      ]]
+   })
+
+-- Class methods
+
+function iv:new (salt) -- salt: source of the 4 bytes copied into the header's salt field
+   local o = iv:superClass().new(self)
+   local h = o:header()
+   o:salt(salt)
+   h.padding = htonl(0x1) -- presumably the initial 32-bit GCM block counter, not padding -- TODO confirm
+   return o
+end
+
+-- Instance methods
+
+function iv:salt (salt) -- setter when `salt` is given, getter otherwise
+   local h = self:header()
+   if salt ~= nil then
+      ffi.copy(h.salt, salt, 4) -- always copies exactly 4 bytes
+   else
+      return h.salt -- the header's salt field
+   end
+end
+
+function iv:iv (iv) -- setter when `iv` is given, getter otherwise; the parameter shadows the class `iv`
+   local h = self:header()
+   if iv ~= nil then
+      h.iv = htonll(iv) -- stored after htonll conversion
+   else
+      return self:header_ptr()+4, 8 -- pointer just past the 4-byte salt, and the field length (8)
+   end
+end
+
+
+-- AES-128-GCM wrapper
+
+local function u8_ptr (ptr) return ffi.cast("uint8_t *", ptr) end
+
+local aes_128_gcm = {}
+
+function aes_128_gcm:new (keymat, salt) -- keymat, salt: hex strings of 32 and 8 digits respectively
+   assert(keymat and #keymat == 32, "Need 16 bytes of key material.")
+   assert(salt and #salt == 8, "Need 4 bytes of salt.")
+   local o = {}
+   o.keymat = ffi.new("uint8_t[16]")
+   ffi.copy(o.keymat, lib.hexundump(keymat, 16), 16)
+   o.iv = iv:new(lib.hexundump(salt, 4))
+   -- Compute subkey (H). NOTE(review): hash_subkey is never written before being passed to precomp -- confirm H is derived from the key
+   o.hash_subkey = ffi.new("uint8_t[?] __attribute__((aligned(16)))", 128)
+   o.gcm_data = ffi.new("gcm_data[1] __attribute__((aligned(16)))")
+   ASM.aes_keyexp_128_enc_avx(o.keymat, o.gcm_data[0].expanded_keys) -- expand the key into gcm_data
+   ASM.aesni_gcm_precomp_avx_gen4(o.gcm_data, o.hash_subkey)
+   o.blocksize = 128
+   o.auth_size = 16 -- tag length in bytes handed to the ASM routines
+   o.auth_buf = ffi.new("uint8_t[?]", o.auth_size) -- scratch buffer that decrypt() compares against
+   o.aad_size = 16
+   return setmetatable(o, {__index=aes_128_gcm})
+end
+
+function aes_128_gcm:encrypt (out_ptr, payload, length, esp) -- output goes to out_ptr, the tag to payload+length
+   self.iv:iv(esp:seq_no()) -- the ESP sequence number becomes the 64-bit IV field
+   ASM.aesni_gcm_enc_avx_gen4(self.gcm_data,
+                            out_ptr,
+                            payload, length,
+                            u8_ptr(self.iv:header_ptr()),
+                            u8_ptr(esp:header_ptr()), esp:sizeof(), -- ESP header and its size (presumably the AAD)
+                            payload + length, self.auth_size) -- tag destination and tag size
+end
+
+function aes_128_gcm:decrypt (out_ptr, ciphertext, length, esp) -- returns true iff the computed tag matches the received one
+   self.iv:iv(esp:seq_no()) -- the ESP sequence number becomes the 64-bit IV field
+   ASM.aesni_gcm_dec_avx_gen4(self.gcm_data,
+                            out_ptr,
+                            ciphertext, length,
+                            u8_ptr(self.iv:header_ptr()),
+                            u8_ptr(esp:header_ptr()), esp:sizeof(),
+                            self.auth_buf, self.auth_size) -- computed tag is written to auth_buf
+   return C.memcmp(self.auth_buf, ciphertext + length, self.auth_size) == 0 -- NOTE(review): memcmp may not be constant-time -- confirm acceptable
+end
+
+return aes_128_gcm
diff --git a/src/lib/ipsec/aes_128_gcm_avx.dasl b/src/lib/ipsec/aes_128_gcm_avx.dasl
new file mode 100644
index 0000000000..bd341a7b63
--- /dev/null
+++ b/src/lib/ipsec/aes_128_gcm_avx.dasl
@@ -0,0 +1,498 @@
+-- Selected AES GCM routines, based heavily on the Intel IPsec code from:
+-- https://github.com/lukego/intel-ipsec/blob/master/code/avx2/gcm_avx_gen4.asm
+-- https://github.com/lukego/intel-ipsec/blob/master/code/gcm_defines.asm
+-- https://github.com/lukego/intel-ipsec/blob/master/code/aes_keyexp_128.asm
+
+local dasm = require("dasm")
+local ffi = require("ffi")
+
+ffi.cdef[[
+typedef struct gcm_data
+{
+  uint8_t expanded_keys[16*11];
+  uint8_t shifted_hkey_1[16];
+  uint8_t shifted_hkey_2[16];
+  uint8_t shifted_hkey_3[16];
+  uint8_t shifted_hkey_4[16];
+  uint8_t shifted_hkey_5[16];
+  uint8_t shifted_hkey_6[16];
+  uint8_t shifted_hkey_7[16];
+  uint8_t shifted_hkey_8[16];
+  uint8_t shifted_hkey_1_k[16];
+  uint8_t shifted_hkey_2_k[16];
+  uint8_t shifted_hkey_3_k[16];
+  uint8_t shifted_hkey_4_k[16];
+  uint8_t shifted_hkey_5_k[16];
+  uint8_t shifted_hkey_6_k[16];
+  uint8_t shifted_hkey_7_k[16];
+  uint8_t shifted_hkey_8_k[16];
+} gcm_data;
+]]
+
+|.arch x64
+|.actionlist actions
+|.globalnames globalnames
+
+|.define arg1, rdi
+|.define arg2, rsi
+|.define arg3, rdx
+|.define arg4, rcx
+|.define arg5, r8
+|.define arg6, r9
+|.define arg7, [r14 + 32 + 8*1]
+|.define arg8, [r14 + 32 + 8*2]
+|.define arg9, [r14 + 32 + 8*3]
+
+local function ghash_tail(Dst, gh, t1, t2, t3)
+  | vmovdqa xmm(t3), [->poly2]
+  | vpclmulqdq xmm(t2), xmm(t3), xmm(gh), 0x01; vpslldq xmm(t2), xmm(t2), 8; vpxor xmm(gh), xmm(gh), xmm(t2)
+  | vpclmulqdq xmm(t2), xmm(t3), xmm(gh), 0x00; vpsrldq xmm(t2), xmm(t2), 4
+  | vpclmulqdq xmm(gh), xmm(t3), xmm(gh), 0x10; vpslldq xmm(gh), xmm(gh), 4; vpxor xmm(gh), xmm(gh), xmm(t2)
+  | vpxor xmm(gh), xmm(gh), xmm(t1)
+end
+
+local function ghash_mul(Dst, gh, hk, t1, t2, t3)
+  | vpclmulqdq xmm(t1), xmm(gh), xmm(hk), 0x11
+  | vpclmulqdq xmm(t2), xmm(gh), xmm(hk), 0x00
+  | vpclmulqdq xmm(t3), xmm(gh), xmm(hk), 0x01
+  | vpclmulqdq xmm(gh), xmm(gh), xmm(hk), 0x10
+  | vpxor xmm(gh), xmm(gh), xmm(t3)
+
+  | vpsrldq xmm(t3), xmm(gh), 8
+  | vpslldq xmm(gh), xmm(gh), 8
+  | vpxor xmm(t1), xmm(t1), xmm(t3)
+  | vpxor xmm(gh), xmm(gh), xmm(t2)
+  || ghash_tail(Dst, gh, t1, t2, t3)
+end
+
+local function almost_encrypt_8(Dst, initial, ctr, t_key, operation, loop_idx, before_round)
+  local prev = ctr
+  for i = initial, 8 do
+    if loop_idx == "in_order" then
+      | vpaddd xmm(i), xmm(prev), [->one]
+    else
+      | vpaddd xmm(i), xmm(prev), [->onef]
+    end
+    prev = i
+  end
+  if prev ~= ctr then
+    | vmovdqa xmm(ctr), xmm(prev)
+  end
+  if loop_idx == "in_order" then
+    for i = initial, 8 do
+      | vpshufb xmm(i), xmm(i), [->shuf_mask]
+    end
+  end
+
+  | vmovdqa xmm(t_key), [arg1+16*0]
+  for i = initial, 8 do
+    | vpxor xmm(i), xmm(i), xmm(t_key)
+  end
+  for j = 1, 9 do
+    before_round(j)
+    | vmovdqa xmm(t_key), [arg1+16*j]
+    for i = initial, 8 do
+      | vaesenc xmm(i), xmm(i), xmm(t_key)
+    end
+  end
+  before_round(10)
+end
+
+local function encrypt_8(Dst, initial, t, ctr, t_key, operation)
+  almost_encrypt_8(Dst, initial, ctr, t_key, operation, "in_order", function() end)
+
+  | vmovdqa xmm(t_key), [arg1+16*10]
+  for i = initial, 8 do
+    | vaesenclast xmm(i), xmm(i), xmm(t_key)
+  end
+
+  for i = initial, 8 do
+    | vmovdqu xmm(t), [arg3 + r11 + 16*(i-initial)]
+    | vpxor xmm(i), xmm(i), xmm(t)
+    | vmovdqu [arg2 + r11 + 16*(i-initial)], xmm(i)
+    if operation == "dec" then
+      | vmovdqa xmm(i), xmm(t)
+    end 
+    | vpshufb xmm(i), xmm(i), [->shuf_mask]
+  end
+  | add r11, (9-initial)*16
+end
+
+local function initial_blocks(Dst, num_initial_blocks, t, ctr, t_key, operation)
+  local i = 8 - num_initial_blocks
+  | mov r10, arg6
+  | mov r12, arg7
+  | mov r11, r12
+
+  | vpxor xmm(i), xmm(i), xmm(i)
+  |1:
+  | vmovd xmm(t[1]), dword [r10]
+  | vpslldq xmm(t[1]), xmm(t[1]), 12
+  | vpsrldq xmm(i), xmm(i), 4
+  | vpxor xmm(i), xmm(i), xmm(t[1])
+  | add r10, 4
+  | sub r12, 4
+  | jg <1
+  | cmp r11, 16
+  | je >3
+  | mov r12, 16
+  |2:
+  | vpsrldq xmm(i), xmm(i), 4
+  | sub r12, 4
+  | cmp r12, r11
+  | jg <2
+  |3:
+
+  | vpshufb xmm(i), xmm(i), [->shuf_mask]
+  | xor r11, r11
+  | mov rax, arg5
+  | vmovdqu xmm(ctr), [rax]
+  | vpshufb xmm(ctr), xmm(ctr), [->shuf_mask]
+  || encrypt_8(Dst, 9-num_initial_blocks, t[1], ctr, t_key, operation)
+
+  local prev
+  | vmovdqu xmm(t[2]), [arg1 + 16*11]
+  for j = 8-num_initial_blocks, 8 do
+    if prev then
+      | vpxor xmm(j), xmm(j), xmm(prev)
+    end
+    ghash_mul(Dst, j, t[2], t[1], t[3], t[4])
+    prev = j
+  end
+
+  | vmovdqa [rsp], xmm8
+  | vmovdqa xmm(t[3]), xmm8
+  | cmp r13, 128
+  | jl >9
+  || encrypt_8(Dst, 1, t[1], ctr, t_key, operation)
+  | vpxor xmm1, xmm1, [rsp]
+  |9:
+end
+
+local function mulqdqxor(Dst, out, qdq1, qdq2, qdqI, xor) -- vpclmulqdq into `out`, or into `xor` and then out ^= xor
+  | vpclmulqdq xmm(xor or out), xmm(qdq1), xmm(qdq2), qdqI
+  if xor then -- accumulate the product held in the temporary into `out`
+    | vpxor xmm(out), xmm(out), xmm(xor)
+  end
+end
+
+local function ghash_8_encrypt_8_parallel(Dst, t, ctr, loop_idx, operation)
+  | add r15b, 8
+  | vmovdqa xmm(t[2]), xmm1
+  for i = 2, 8 do
+    | vmovdqa [rsp + 16*(i-1)], xmm(i)
+  end
+
+  almost_encrypt_8(Dst, 1, ctr, t[1], operation, loop_idx, function(round)
+    if round >= 3 then
+      | vmovdqa xmm(t[5]), [arg1 + 16*(21-round)]
+      local xor
+      if round > 3 then
+        | vmovdqa xmm(t[2]), [rsp + 16*(round-3)]
+        xor = t[3]
+      end
+      mulqdqxor(Dst, t[4], t[2], t[5], 0x11, xor)
+      mulqdqxor(Dst, t[7], t[2], t[5], 0x00, xor)
+      mulqdqxor(Dst, t[6], t[2], t[5], 0x01, xor)
+      mulqdqxor(Dst, t[6], t[2], t[5], 0x10, t[3])
+    end
+  end)
+
+  | vmovdqa xmm(t[5]), [arg1+16*10]
+  for j = 1, 8 do
+    local i = j - 1
+    | vpxor xmm(t[2]), xmm(t[5]), [arg3 + r11 + 16*i]
+    if operation == "enc" then
+      | vaesenclast xmm(j), xmm(j), xmm(t[2])
+      | vmovdqu [arg2 + r11 + 16*i], xmm(j)
+    else
+      | vaesenclast xmm(t[3]), xmm(j), xmm(t[2])
+      | vmovdqu xmm(j), [arg3 + r11 + 16*i]
+      | vmovdqu [arg2 + r11 + 16*i], xmm(t[3])
+    end
+    | vpshufb xmm(j), xmm(j), [->shuf_mask]
+  end
+
+  | vpslldq xmm(t[3]), xmm(t[6]), 8
+  | vpsrldq xmm(t[6]), xmm(t[6]), 8
+  | vpxor xmm(t[7]), xmm(t[7]), xmm(t[3])
+  | vpxor xmm(t[1]), xmm(t[4]), xmm(t[6])
+  || ghash_tail(Dst, t[7], t[1], t[2], t[3])
+  | vpxor xmm1, xmm1, xmm(t[7])
+  | add r11, 128
+  | sub r13, 128
+end
+
+local function ghash_last_8(Dst, t)
+  for i = 1, 8 do
+    | vmovdqa xmm(t[5]), [arg1 + 16*(19-i)]
+    | vpshufd xmm(t[2]), xmm(i), 0x4e
+    | vpshufd xmm(t[3]), xmm(t[5]), 0x4e
+    | vpxor xmm(t[2]), xmm(t[2]), xmm(i)
+    | vpxor xmm(t[3]), xmm(t[3]), xmm(t[5])
+    mulqdqxor(Dst, t[6], i, t[5], 0x11, i ~= 1 and t[4])
+    mulqdqxor(Dst, t[7], i, t[5], 0x00, i ~= 1 and t[4])
+    mulqdqxor(Dst, 1, t[2], t[3], 0x00, i ~= 1 and t[4])
+  end
+  | vpxor xmm1, xmm1, xmm(t[6])
+  | vpxor xmm(t[2]), xmm1, xmm(t[7])
+
+  | vpslldq xmm(t[4]), xmm(t[2]), 8
+  | vpsrldq xmm(t[2]), xmm(t[2]), 8
+  | vpxor xmm(t[7]), xmm(t[7]), xmm(t[4])
+  | vpxor xmm(t[6]), xmm(t[6]), xmm(t[2])
+  || ghash_tail(Dst, t[7], t[6], t[2], t[3])
+  | vmovdqa xmm14, xmm15
+end
+
+local function encrypt_single_block(Dst, x) -- transform xmm(x) in place using the 11 round keys at [arg1]
+  | vpxor xmm(x), xmm(x), [arg1+16*0]
+  for i = 1, 9 do -- nine vaesenc rounds with keys 1..9
+    | vaesenc xmm(x), xmm(x), [arg1+16*i]
+  end
+  | vaesenclast xmm(x), xmm(x), [arg1+16*10]
+end
+
+local function prologue(Dst) -- push r12..r15, keep the old rsp in r14, reserve 128 bytes and align rsp down to 64
+  for i = 12, 15 do -- pushed in ascending order; epilogue pops in reverse
+    | push Rq(i)
+  end
+  | mov r14, rsp
+  | sub rsp, 16*8
+  | and rsp, -64
+end
+
+local function epilogue(Dst) -- restore rsp from r14, pop r15..r12 and return
+  | mov rsp, r14
+  for i = 15, 12, -1 do -- reverse of the push order used by prologue
+    | pop Rq(i)
+  end
+  | ret
+end
+
+local function gcm_enc_dec(Dst, operation, pc)
+  prologue(Dst)
+
+  | mov r13, arg4
+  | and r13, -16
+  | mov r12, r13
+  | shr r12, 4
+  | and r12, 7
+  | jz =>pc+0
+  for i = 7, 2, -1 do
+    | cmp r12, i
+    | je =>pc+i
+  end
+  | jmp =>pc+1
+  for i = 7, 0, -1 do
+    |=>pc+i:
+    || initial_blocks(Dst, i, {12, 13, 14, 15}, 9, 0, operation)
+    if i ~= 0 then
+      | sub r13, 16*i
+      | jmp >8
+    end
+  end
+
+  |8:
+  | cmp r13, 0
+  | je >1
+  | sub r13, 128
+  | je >2
+  | vmovd r15d, xmm9
+  | and r15d, 255
+  | vpshufb xmm9, xmm9, [->shuf_mask]
+  |3:
+  | cmp r15b, 255-8
+  | jg >4
+  || ghash_8_encrypt_8_parallel(Dst, {0, 10, 11, 12, 13, 14, 15}, 9, "out_order", operation)
+  | jne <3
+  | vpshufb xmm9, xmm9, [->shuf_mask]
+  | jmp >2
+  |4:
+  | vpshufb xmm9, xmm9, [->shuf_mask]
+  || ghash_8_encrypt_8_parallel(Dst, {0, 10, 11, 12, 13, 14, 15}, 9, "in_order", operation)
+  | vpshufb xmm9, xmm9, [->shuf_mask]
+  | jne <3
+  | vpshufb xmm9, xmm9, [->shuf_mask]
+  |2:
+  || ghash_last_8(Dst, {0, 10, 11, 12, 13, 14, 15})
+  |1:
+
+  | mov r13, arg4
+  | and r13, 15
+  | je >1
+
+  | vpaddd xmm9, xmm9, [->one]
+  | vpshufb xmm9, xmm9, [->shuf_mask]
+  || encrypt_single_block(Dst, 9)
+
+  | sub r11, 16
+  | add r11, r13
+  | vmovdqu xmm1, [arg3 + r11]
+  | lea r12, [->all_f]
+  | sub r12, r13
+  | vmovdqu xmm2, [r12]
+  | vpshufb xmm1, xmm1, xmm2
+
+  if operation == "dec" then
+    | vmovdqa xmm2, xmm1
+  end
+  | vpxor xmm9, xmm9, xmm1
+  | vmovdqu xmm1, [r12 + 16]
+  | vpand xmm9, xmm9, xmm1
+  if operation == "dec" then
+    | vpand xmm2, xmm2, xmm1
+  else
+    | vmovdqa xmm2, xmm9
+  end
+  | vpshufb xmm2, xmm2, [->shuf_mask]
+  | vpxor xmm14, xmm14, xmm2
+  || ghash_mul(Dst, 14, 13, 0, 10, 11)
+  | sub r11, r13
+  | add r11, 16
+
+  | vmovd rax, xmm9
+  | cmp r13, 8
+  | jle >2
+  | mov [arg2 + r11], rax
+  | add r11, 8
+  | vpsrldq xmm9, xmm9, 8
+  | vmovd rax, xmm9
+  | sub r13, 8
+  |2:
+  | mov byte [arg2 + r11], al
+  | add r11, 1
+  | shr rax, 8
+  | sub r13, 1
+  | jne <2
+
+  |1:
+  | mov r12, arg7
+  | shl r12, 3
+  | vmovd xmm15, r12d
+
+  | shl arg4, 3
+  | vmovd xmm1, arg4
+  | vpslldq xmm15, xmm15, 8
+  | vpxor xmm15, xmm15, xmm1
+        
+  | vpxor xmm14, xmm14, xmm15
+  || ghash_mul(Dst, 14, 13, 0, 10, 11)
+  | vpshufb xmm14, xmm14, [->shuf_mask]
+  | mov rax, arg5
+  | vmovdqu xmm9, [rax]
+  || encrypt_single_block(Dst, 9)
+  | vpxor xmm9, xmm9, xmm14
+
+  | mov r10, arg8
+  | mov r11, arg9
+  | cmp r11, 16
+  | je >3
+  | cmp r11, 12
+  | je >2
+  | vmovd rax, xmm9
+  | mov [r10], rax
+  | jmp >4
+  |2:
+  | vmovd rax, xmm9
+  | mov [r10], rax
+  | vpsrldq xmm9, xmm9, 8
+  | vmovd eax, xmm9
+  | mov [r10 + 8], eax
+  | jmp >4       
+  |3:
+  | vmovdqu [r10], xmm9
+  |4:
+
+  epilogue(Dst)
+end
+
+local function precompute(Dst)
+  prologue(Dst)
+
+  | vmovdqu xmm6, [arg2]
+  | vpshufb xmm6, xmm6, [->shuf_mask]
+  | vmovdqa xmm2, xmm6
+  | vpsllq xmm6, xmm6, 1
+  | vpsrlq xmm2, xmm2, 63
+  | vmovdqa xmm1, xmm2
+  | vpslldq xmm2, xmm2, 8 
+  | vpsrldq xmm1, xmm1, 8
+  | vpor xmm6, xmm6, xmm2     
+  | vpshufd xmm2, xmm1, 0x24
+  | vpcmpeqd xmm2, xmm2, [->two_one]
+  | vpand xmm2, xmm2, [->poly]
+  | vpxor xmm6, xmm6, xmm2
+  | vmovdqa [arg1 + 16*11], xmm6
+
+  | vmovdqa xmm4, xmm6
+  for i = 2, 8 do
+    || ghash_mul(Dst, 4, 6, 0, 1, 2)
+    | vmovdqa [arg1 + 16*(10+i)], xmm4
+  end
+
+  epilogue(Dst)
+end
+
+local function keyexp(Dst) -- load 16 bytes from [arg1] and store 11 round keys at [arg2 + 16*i], i = 0..10
+  | vmovdqu xmm1, [arg1]
+  | vmovdqa [arg2], xmm1
+  | vpxor xmm3, xmm3, xmm3
+  for i = 1, 10 do -- immediate is 2^(i-1) for i < 9 (1..128), then 27 and 54 for i = 9, 10
+    | vaeskeygenassist xmm2, xmm1, i < 9 and 2^(i-1) or 27*(i-8)
+    | vpshufd xmm2, xmm2, 0xff
+    | vshufps xmm3, xmm3, xmm1, 0x10
+    | vpxor xmm1, xmm1, xmm3
+    | vshufps xmm3, xmm3, xmm1, 0x8c
+    | vpxor xmm1, xmm1, xmm3
+    | vpxor xmm1, xmm1, xmm2
+    | vmovdqa [arg2 + 16*i], xmm1
+  end
+  | ret
+end
+
+local function generator(Dst)
+  Dst:growpc(16)
+
+  -- Functions
+  |->aesni_gcm_precomp_avx_gen4:
+  || precompute(Dst)
+  |.align 16
+  |->aes_keyexp_128_enc_avx:
+  || keyexp(Dst)
+  |.align 16
+  |->aesni_gcm_enc_avx_gen4:
+  || gcm_enc_dec(Dst, "enc", 0)
+  |.align 16
+  |->aesni_gcm_dec_avx_gen4:
+  || gcm_enc_dec(Dst, "dec", 8)
+
+  -- Data
+  |.align 64
+  |->poly:;    .dword          1, 0, 0, 0xC2000000
+  |->poly2:;   .dword 0xC2000000, 1, 0, 0xC2000000
+  |->two_one:; .dword          1, 0, 0,          1
+  |->shuf_mask:
+  for i = 15, 0, -1 do
+    |.byte i
+  end
+  for i = 0, 15 do
+    |.byte i
+  end
+  |->all_f:; .dword -1, -1, -1,   -1
+  |          .dword  0,  0,  0,    0
+  |->one:;   .dword  1,  0,  0,    0
+  |->onef:;  .dword  0,  0,  0, 2^24
+end
+
+local Dst, globals = dasm.new(actions, nil, nil, 1 + #globalnames)
+generator(Dst)
+local mcode, size = Dst:build()
+local entry = dasm.globals(globals, globalnames)
+local fn_t = ffi.typeof("void(*)(gcm_data*, uint8_t*, const uint8_t*, uint64_t, uint8_t*, const uint8_t*, uint64_t, uint8_t*, uint64_t)")
+return setmetatable({
+  aes_keyexp_128_enc_avx = ffi.cast("void(*)(void*, void*)", entry.aes_keyexp_128_enc_avx),
+  aesni_gcm_precomp_avx_gen4 = ffi.cast("void(*)(gcm_data*, uint8_t*)", entry.aesni_gcm_precomp_avx_gen4),
+  aesni_gcm_enc_avx_gen4 = ffi.cast(fn_t, entry.aesni_gcm_enc_avx_gen4),
+  aesni_gcm_dec_avx_gen4 = ffi.cast(fn_t, entry.aesni_gcm_dec_avx_gen4),
+}, {_anchor = mcode})
diff --git a/src/lib/ipsec/esp.lua b/src/lib/ipsec/esp.lua
new file mode 100644
index 0000000000..36fb473401
--- /dev/null
+++ b/src/lib/ipsec/esp.lua
@@ -0,0 +1,149 @@
+module(..., package.seeall)
+local datagram = require("lib.protocol.datagram")
+local ethernet = require("lib.protocol.ethernet")
+local esp = require("lib.protocol.esp")
+local esp_tail = require("lib.protocol.esp_tail")
+local aes_128_gcm = require("lib.ipsec.aes_128_gcm")
+local lib = require("core.lib")
+local ffi = require("ffi")
+
+
+local esp_nh = 50 -- next-header value identifying ESP, https://tools.ietf.org/html/rfc4303#section-2
+local esp_length = esp:sizeof() -- byte size of the ESP header as reported by lib.protocol.esp
+local esp_tail_length = esp_tail:sizeof() -- byte size of the ESP tail as reported by lib.protocol.esp_tail
+
+function esp_v6_new (conf)
+   -- State shared by encryptor and decryptor: an aes_128_gcm object and a counter starting at 0.
+   assert(conf.mode == "aes-128-gcm", "Only supports aes-128-gcm.")
+   return { seq_no = 0, aes_128_gcm = aes_128_gcm:new(conf.keymat, conf.salt) }
+end
+
+
+local esp_v6_encrypt = {}
+-- Build an encryptor from conf (validated by esp_v6_new); methods resolve via esp_v6_encrypt.
+function esp_v6_encrypt:new (conf)
+   local o = esp_v6_new(conf)
+   -- encrypt() reads up to blocksize pad bytes and auth_size placeholder bytes from pad_buf.
+   o.pad_buf = ffi.new("uint8_t[?]", math.max(o.aes_128_gcm.blocksize, o.aes_128_gcm.auth_size))
+   o.esp_buf = ffi.new("uint8_t[?]", o.aes_128_gcm.aad_size)
+   o.esp = esp:new_from_mem(o.esp_buf, esp_length) -- Fix me https://tools.ietf.org/html/rfc4303#section-3.3.3
+   o.esp:spi(0x0) -- Fix me, set esp:spi value.
+   o.esp_tail = esp_tail:new({})
+   return setmetatable(o, {__index=esp_v6_encrypt})
+end
+
+function esp_v6_encrypt:next_seq_no () -- bump the outgoing counter, then return the new value
+   local bumped = self.seq_no + 1
+   self.seq_no = bumped
+   return bumped
+end
+
+-- Wrap payload in a fresh packet (ESP header, payload, padding, tail, tag room) and encrypt it.
+function esp_v6_encrypt:encrypt (nh, payload, length)
+   local gcm, p = self.aes_128_gcm, packet.allocate()
+   self.esp:seq_no(self:next_seq_no())
+   packet.append(p, self.esp:header_ptr(), esp_length)
+   packet.append(p, payload, length)
+   -- Pad payload + tail to a whole number of blocks. The outer modulo yields 0 instead of a
+   -- full block when already aligned, so at most blocksize-1 bytes are read from pad_buf.
+   local pad_length = (gcm.blocksize - (length + esp_tail_length) % gcm.blocksize) % gcm.blocksize
+   packet.append(p, self.pad_buf, pad_length)
+   self.esp_tail:next_header(nh)
+   self.esp_tail:pad_length(pad_length)
+   packet.append(p, self.esp_tail:header_ptr(), esp_tail_length)
+   packet.append(p, self.pad_buf, gcm.auth_size) -- tag room; NOTE(review): needs pad_buf >= auth_size
+   local data = packet.data(p) + esp_length -- encrypted in place: source and destination coincide
+   gcm:encrypt(data, data, length + pad_length + esp_tail_length, self.esp)
+   return p
+end
+
+-- Encrypt what follows the two leading headers of p; re-push both (IP rewritten) on the result.
+function esp_v6_encrypt:encapsulate (p)
+   local dgram = datagram:new(p, ethernet)
+   local eth_hdr = dgram:parse_match()
+   local ip_hdr = dgram:parse_match()
+   local out = datagram:new(self:encrypt(ip_hdr:next_header(), dgram:payload()))
+   local _, esp_bytes = out:payload()
+   ip_hdr:next_header(esp_nh)
+   ip_hdr:payload_length(esp_bytes)
+   out:push(ip_hdr)
+   out:push(eth_hdr)
+   return out:packet()
+end
+
+
+local esp_v6_decrypt = {}
+-- Build a decryptor from conf; caches the per-packet overhead and the minimum ESP payload size.
+function esp_v6_decrypt:new (conf)
+   local dec = esp_v6_new(conf)
+   dec.esp_overhead_size = esp_length + dec.aes_128_gcm.auth_size -- ESP header + auth_size bytes
+   dec.min_payload_length = dec.aes_128_gcm.blocksize + dec.esp_overhead_size -- plus one block
+   return setmetatable(dec, {__index=esp_v6_decrypt})
+end
+
+function esp_v6_decrypt:check_seq_no (seq_no) -- true iff seq_no is newer than all accepted so far
+   if seq_no <= self.seq_no then return false end -- duplicate/replay: must not advance the state
+   self.seq_no = seq_no -- remember the highest accepted number, so gaps from loss are harmless
+   return true
+end
+
+-- Decrypt an ESP payload in place. On success returns the sequence number, a packet created
+-- from the cleartext bytes and the tail's next-header value; otherwise returns nothing.
+function esp_v6_decrypt:decrypt (payload, length)
+   local gcm = self.aes_128_gcm
+   if length < self.min_payload_length then return end -- shorter than overhead + one block
+   if (length - self.esp_overhead_size) % gcm.blocksize ~= 0 then return end -- partial block
+   local body = payload + esp_length
+   local body_length = length - esp_length - gcm.auth_size
+   local hdr = esp:new_from_mem(payload, esp_length)
+   if not gcm:decrypt(body, body, body_length, hdr) then return end
+   local tail = esp_tail:new_from_mem(body + body_length - esp_tail_length, esp_tail_length)
+   local cleartext_length = body_length - tail:pad_length() - esp_tail_length
+   local p = packet.from_pointer(body, cleartext_length)
+   return hdr:seq_no(), p, tail:next_header()
+end
+
+-- Counterpart of encapsulate: when the IP header announces ESP, decrypt the payload and return
+-- a new packet with both leading headers (next header and payload length rewritten) pushed in
+-- front of the cleartext. Returns nothing for non-ESP, undecryptable or rejected packets.
+function esp_v6_decrypt:decapsulate (p)
+   local dgram = datagram:new(p, ethernet)
+   local eth_hdr = dgram:parse_match()
+   local ip_hdr = dgram:parse_match()
+   if ip_hdr:next_header() ~= esp_nh then return end
+   local seq_no, cleartext, nh = self:decrypt(dgram:payload())
+   if not (cleartext and self:check_seq_no(seq_no)) then return end
+   local out = datagram:new(cleartext)
+   ip_hdr:next_header(nh)
+   ip_hdr:payload_length(packet.length(cleartext))
+   out:push(ip_hdr)
+   out:push(eth_hdr)
+   return out:packet()
+end
+
+
+-- Round trip: encapsulate then decapsulate one packet; print the verdict, exit 1 on mismatch.
+function selftest ()
+   local C = require("ffi").C
+   local ipv6 = require("lib.protocol.ipv6")
+   local conf = { mode = "aes-128-gcm", salt = "00112233",
+                  keymat = "00112233445566778899AABBCCDDEEFF" }
+   local enc, dec = esp_v6_encrypt:new(conf), esp_v6_decrypt:new(conf)
+   local payload = packet.from_string(
+[[abcdefghijklmnopqrstuvwxyz
+ABCDEFGHIJKLMNOPQRSTUVWXYZ
+0123456789]]
+   )
+   local d = datagram:new(payload)
+   d:push(ipv6:new({}))
+   d:push(ethernet:new({type=0x86dd}))
+   local p = d:packet()
+   local p2 = dec:decapsulate(enc:encapsulate(p))
+   -- Compare through the packet accessors so the check is independent of struct packet's layout.
+   if p2 and packet.length(p2) == packet.length(p) and C.memcmp(packet.data(p), packet.data(p2), packet.length(p)) == 0 then
+      print("selftest passed")
+   else
+      print("integrity check failed")
+      os.exit(1)
+   end
+end
diff --git a/src/lib/protocol/esp.lua b/src/lib/protocol/esp.lua
new file mode 100644
index 0000000000..2395feb3bf
--- /dev/null
+++ b/src/lib/protocol/esp.lua
@@ -0,0 +1,51 @@
+module(..., package.seeall)
+local ffi = require("ffi")
+local header = require("lib.protocol.header")
+local lib = require("core.lib")
+local ntohl, htonl = lib.ntohl, lib.htonl
+local ntohll, htonll = lib.ntohll, lib.htonll
+
+local esp = subClass(header) -- ESP header class, created as a subclass of lib.protocol.header
+-- NOTE(review): seq_no is 64 bits here; RFC 4303 carries a 32-bit sequence number. Confirm.
+-- Class variables
+esp._name = "esp"
+esp:init( -- hands the packed header layout (spi, seq_no) to the header base class
+   {
+      [1] = ffi.typeof[[
+            struct {
+               uint32_t spi;
+               uint64_t seq_no;
+            } __attribute__((packed))
+      ]]
+   })
+
+-- Class methods
+
+function esp:new (config) -- a nil config field makes the accessor take its read-only branch
+   local hdr = esp:superClass().new(self)
+   hdr:spi(config.spi)
+   hdr:seq_no(config.seq_no)
+   return hdr
+end
+
+-- Instance methods
+
+-- Accessor: esp:spi(v) stores htonl(v) in the header; esp:spi() returns ntohl of the field.
+function esp:spi (spi)
+   local hdr = self:header()
+   if spi == nil then
+      return(ntohl(hdr.spi))
+   end
+   hdr.spi = htonl(spi)
+end
+
+-- Accessor: esp:seq_no(v) stores htonll(v) in the header; esp:seq_no() returns ntohll of it.
+function esp:seq_no (seq_no)
+   local hdr = self:header()
+   if seq_no == nil then
+      return(ntohll(hdr.seq_no))
+   end
+   hdr.seq_no = htonll(seq_no)
+end
+
+return esp
diff --git a/src/lib/protocol/esp_tail.lua b/src/lib/protocol/esp_tail.lua
new file mode 100644
index 0000000000..d8cadfee3a
--- /dev/null
+++ b/src/lib/protocol/esp_tail.lua
@@ -0,0 +1,48 @@
+module(..., package.seeall)
+local ffi = require("ffi")
+local header = require("lib.protocol.header")
+
+local esp_tail = subClass(header) -- ESP tail class, created as a subclass of lib.protocol.header
+
+-- Class variables
+esp_tail._name = "esp_tail"
+esp_tail:init( -- hands the packed two-byte layout (pad_length, next_header) to the base class
+   {
+      [1] = ffi.typeof[[
+            struct {
+               uint8_t pad_length;
+               uint8_t next_header;
+            } __attribute__((packed))
+      ]]
+   })
+
+-- Class methods
+
+function esp_tail:new (config) -- a nil config field makes the accessor take its read-only branch
+   local tail = esp_tail:superClass().new(self)
+   tail:pad_length(config.pad_length)
+   tail:next_header(config.next_header)
+   return tail
+end
+
+-- Instance methods
+
+-- Accessor: with an argument store it in the pad_length byte; without one return that byte.
+function esp_tail:pad_length (length)
+   local tail = self:header()
+   if length == nil then
+      return tail.pad_length
+   end
+   tail.pad_length = length
+end
+
+-- Accessor: with an argument store it in the next_header byte; without one return that byte.
+function esp_tail:next_header (next_header)
+   local tail = self:header()
+   if next_header == nil then
+      return tail.next_header
+   end
+   tail.next_header = next_header
+end
+
+return esp_tail