diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/lib/luajit/doc/changes.html b/lib/luajit/doc/changes.html
index 125b58b4ca..826cd2436b 100644
--- a/lib/luajit/doc/changes.html
+++ b/lib/luajit/doc/changes.html
@@ -113,6 +113,7 @@
LuaJIT 2.1.0-beta1 — 2015-08-25
x64: Add separate port of the interpreter to LJ_GC64 mode.
x86/x64: Drop internal x87 math functions. Use libm functions.
x86: Remove x87 support from interpreter. SSE2 is mandatory now.
+x86/x64: Add support for AES-NI, AVX and AVX2 to DynASM.
PPC/e500: Drop support for this architecture.
FFI library:
@@ -123,6 +124,7 @@ LuaJIT 2.1.0-beta1 — 2015-08-25
FFI: Compile lightuserdata to void * conversion.
FFI: Compile ffi.gc(cdata, nil), too.
FFI: Add ffi.typeinfo().
+FFI: Add ssize_t declaration.
diff --git a/lib/luajit/doc/ext_ffi_semantics.html b/lib/luajit/doc/ext_ffi_semantics.html
index 889d44d823..f65fe8f36d 100644
--- a/lib/luajit/doc/ext_ffi_semantics.html
+++ b/lib/luajit/doc/ext_ffi_semantics.html
@@ -185,6 +185,8 @@ C Language Support
uint16_t, uint32_t, uint64_t,
intptr_t, uintptr_t.
+From <unistd.h> (POSIX): ssize_t.
+
You're encouraged to use these types in preference to
diff --git a/lib/luajit/doc/install.html b/lib/luajit/doc/install.html
index b5df697b67..a4cc721512 100644
--- a/lib/luajit/doc/install.html
+++ b/lib/luajit/doc/install.html
@@ -114,30 +114,30 @@
Installation
x86 (32 bit) |
-GCC 4.x GCC 3.4 |
-GCC 4.x GCC 3.4 |
-GCC 4.x GCC 3.4 |
+GCC 4.2+ |
+GCC 4.2+ |
+XCode 5.0+ Clang |
MSVC, MSVC/EE WinSDK MinGW, Cygwin |
x64 (64 bit) |
-GCC 4.x |
+GCC 4.2+ |
ORBIS (PS4) |
-GCC 4.x |
+XCode 5.0+ Clang |
MSVC + SDK v7.0 WinSDK v7.0 Durango (Xbox One) |
ARMv5+ ARM9E+ |
GCC 4.2+ |
GCC 4.2+ PSP2 (PS VITA) |
-GCC 4.2+ |
+XCode 5.0+ Clang |
|
ARM64 |
GCC 4.8+ |
|
-Clang 3.5+ |
+XCode 6.0+ Clang 3.5+ |
|
@@ -442,8 +442,7 @@ Cross-compiling LuaJIT
make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF"
-You can cross-compile for iOS 3.0+ (iPhone/iPad) using the » iOS SDK.
-The environment variables need to match the iOS SDK version:
+You can cross-compile for iOS 3.0+ (iPhone/iPad) using the » iOS SDK:
Note: the JIT compiler is disabled for iOS, because regular iOS Apps
@@ -453,13 +452,18 @@
Cross-compiling LuaJIT
Or use Android. :-p
-IXCODE=`xcode-select -print-path`
-ISDK=$IXCODE/Platforms/iPhoneOS.platform/Developer
-ISDKVER=iPhoneOS6.0.sdk
-ISDKP=$ISDK/usr/bin/
-ISDKF="-arch armv7 -isysroot $ISDK/SDKs/$ISDKVER"
-make HOST_CC="gcc -m32 -arch i386" CROSS=$ISDKP TARGET_FLAGS="$ISDKF" \
- TARGET_SYS=iOS
+# iOS/ARM (32 bit)
+ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
+ICC=$(xcrun --sdk iphoneos --find clang)
+ISDKF="-arch armv7 -isysroot $ISDKP"
+make HOST_CC="clang -m32 -arch i386" CROSS="$(dirname $ICC)/" \
+ TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
+
+# iOS/ARM64
+ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
+ICC=$(xcrun --sdk iphoneos --find clang)
+ISDKF="-arch arm64 -isysroot $ISDKP"
+make CROSS="$(dirname $ICC)/" TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
Cross-compiling for consoles
diff --git a/lib/luajit/dynasm/dasm_arm.lua b/lib/luajit/dynasm/dasm_arm.lua
index 90a259c5c3..6a1d1d5195 100644
--- a/lib/luajit/dynasm/dasm_arm.lua
+++ b/lib/luajit/dynasm/dasm_arm.lua
@@ -9,9 +9,9 @@
local _info = {
arch = "arm",
description = "DynASM ARM module",
- version = "1.3.0",
- vernum = 10300,
- release = "2011-05-05",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/lib/luajit/dynasm/dasm_arm64.lua b/lib/luajit/dynasm/dasm_arm64.lua
index 9766e475b0..c1e3a81b11 100644
--- a/lib/luajit/dynasm/dasm_arm64.lua
+++ b/lib/luajit/dynasm/dasm_arm64.lua
@@ -9,9 +9,9 @@
local _info = {
arch = "arm",
description = "DynASM ARM64 module",
- version = "1.3.0",
- vernum = 10300,
- release = "2014-12-03",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/lib/luajit/dynasm/dasm_mips.lua b/lib/luajit/dynasm/dasm_mips.lua
index ae0dbd7a9b..ef383431cd 100644
--- a/lib/luajit/dynasm/dasm_mips.lua
+++ b/lib/luajit/dynasm/dasm_mips.lua
@@ -9,9 +9,9 @@
local _info = {
arch = "mips",
description = "DynASM MIPS module",
- version = "1.3.0",
- vernum = 10300,
- release = "2012-01-23",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/lib/luajit/dynasm/dasm_ppc.lua b/lib/luajit/dynasm/dasm_ppc.lua
index 278f09526d..1e9bccaeb8 100644
--- a/lib/luajit/dynasm/dasm_ppc.lua
+++ b/lib/luajit/dynasm/dasm_ppc.lua
@@ -11,9 +11,9 @@
local _info = {
arch = "ppc",
description = "DynASM PPC module",
- version = "1.3.0",
- vernum = 10300,
- release = "2015-01-14",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
diff --git a/lib/luajit/dynasm/dasm_proto.h b/lib/luajit/dynasm/dasm_proto.h
index a8bc6fd285..93ca06533c 100644
--- a/lib/luajit/dynasm/dasm_proto.h
+++ b/lib/luajit/dynasm/dasm_proto.h
@@ -10,8 +10,8 @@
#include
#include
-#define DASM_IDENT "DynASM 1.3.0"
-#define DASM_VERSION 10300 /* 1.3.0 */
+#define DASM_IDENT "DynASM 1.4.0"
+#define DASM_VERSION 10400 /* 1.4.0 */
#ifndef Dst_DECL
#define Dst_DECL dasm_State **Dst
diff --git a/lib/luajit/dynasm/dasm_x86.h b/lib/luajit/dynasm/dasm_x86.h
deleted file mode 100644
index 652e8c99b0..0000000000
--- a/lib/luajit/dynasm/dasm_x86.h
+++ /dev/null
@@ -1,471 +0,0 @@
-/*
-** DynASM x86 encoding engine.
-** Copyright (C) 2005-2015 Mike Pall. All rights reserved.
-** Released under the MIT license. See dynasm.lua for full copyright notice.
-*/
-
-#include
-#include
-#include
-#include
-
-#define DASM_ARCH "x86"
-
-#ifndef DASM_EXTERN
-#define DASM_EXTERN(a,b,c,d) 0
-#endif
-
-/* Action definitions. DASM_STOP must be 255. */
-enum {
- DASM_DISP = 233,
- DASM_IMM_S, DASM_IMM_B, DASM_IMM_W, DASM_IMM_D, DASM_IMM_WB, DASM_IMM_DB,
- DASM_VREG, DASM_SPACE, DASM_SETLABEL, DASM_REL_A, DASM_REL_LG, DASM_REL_PC,
- DASM_IMM_LG, DASM_IMM_PC, DASM_LABEL_LG, DASM_LABEL_PC, DASM_ALIGN,
- DASM_EXTERN, DASM_ESC, DASM_MARK, DASM_SECTION, DASM_STOP
-};
-
-/* Maximum number of section buffer positions for a single dasm_put() call. */
-#define DASM_MAXSECPOS 25
-
-/* DynASM encoder status codes. Action list offset or number are or'ed in. */
-#define DASM_S_OK 0x00000000
-#define DASM_S_NOMEM 0x01000000
-#define DASM_S_PHASE 0x02000000
-#define DASM_S_MATCH_SEC 0x03000000
-#define DASM_S_RANGE_I 0x11000000
-#define DASM_S_RANGE_SEC 0x12000000
-#define DASM_S_RANGE_LG 0x13000000
-#define DASM_S_RANGE_PC 0x14000000
-#define DASM_S_RANGE_VREG 0x15000000
-#define DASM_S_UNDEF_L 0x21000000
-#define DASM_S_UNDEF_PC 0x22000000
-
-/* Macros to convert positions (8 bit section + 24 bit index). */
-#define DASM_POS2IDX(pos) ((pos)&0x00ffffff)
-#define DASM_POS2BIAS(pos) ((pos)&0xff000000)
-#define DASM_SEC2POS(sec) ((sec)<<24)
-#define DASM_POS2SEC(pos) ((pos)>>24)
-#define DASM_POS2PTR(D, pos) (D->sections[DASM_POS2SEC(pos)].rbuf + (pos))
-
-/* Action list type. */
-typedef const unsigned char *dasm_ActList;
-
-/* Per-section structure. */
-typedef struct dasm_Section {
- int *rbuf; /* Biased buffer pointer (negative section bias). */
- int *buf; /* True buffer pointer. */
- size_t bsize; /* Buffer size in bytes. */
- int pos; /* Biased buffer position. */
- int epos; /* End of biased buffer position - max single put. */
- int ofs; /* Byte offset into section. */
-} dasm_Section;
-
-/* Core structure holding the DynASM encoding state. */
-struct dasm_State {
- size_t psize; /* Allocated size of this structure. */
- dasm_ActList actionlist; /* Current actionlist pointer. */
- int *lglabels; /* Local/global chain/pos ptrs. */
- size_t lgsize;
- int *pclabels; /* PC label chains/pos ptrs. */
- size_t pcsize;
- void **globals; /* Array of globals (bias -10). */
- dasm_Section *section; /* Pointer to active section. */
- size_t codesize; /* Total size of all code sections. */
- int maxsection; /* 0 <= sectionidx < maxsection. */
- int status; /* Status code. */
- dasm_Section sections[1]; /* All sections. Alloc-extended. */
-};
-
-/* The size of the core structure depends on the max. number of sections. */
-#define DASM_PSZ(ms) (sizeof(dasm_State)+(ms-1)*sizeof(dasm_Section))
-
-
-/* Initialize DynASM state. */
-void dasm_init(Dst_DECL, int maxsection)
-{
- dasm_State *D;
- size_t psz = 0;
- int i;
- Dst_REF = NULL;
- DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection));
- D = Dst_REF;
- D->psize = psz;
- D->lglabels = NULL;
- D->lgsize = 0;
- D->pclabels = NULL;
- D->pcsize = 0;
- D->globals = NULL;
- D->maxsection = maxsection;
- for (i = 0; i < maxsection; i++) {
- D->sections[i].buf = NULL; /* Need this for pass3. */
- D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i);
- D->sections[i].bsize = 0;
- D->sections[i].epos = 0; /* Wrong, but is recalculated after resize. */
- }
-}
-
-/* Free DynASM state. */
-void dasm_free(Dst_DECL)
-{
- dasm_State *D = Dst_REF;
- int i;
- for (i = 0; i < D->maxsection; i++)
- if (D->sections[i].buf)
- DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize);
- if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize);
- if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize);
- DASM_M_FREE(Dst, D, D->psize);
-}
-
-/* Setup global label array. Must be called before dasm_setup(). */
-void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl)
-{
- dasm_State *D = Dst_REF;
- D->globals = gl - 10; /* Negative bias to compensate for locals. */
- DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int));
-}
-
-/* Grow PC label array. Can be called after dasm_setup(), too. */
-void dasm_growpc(Dst_DECL, unsigned int maxpc)
-{
- dasm_State *D = Dst_REF;
- size_t osz = D->pcsize;
- DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc*sizeof(int));
- memset((void *)(((unsigned char *)D->pclabels)+osz), 0, D->pcsize-osz);
-}
-
-/* Setup encoder. */
-void dasm_setup(Dst_DECL, const void *actionlist)
-{
- dasm_State *D = Dst_REF;
- int i;
- D->actionlist = (dasm_ActList)actionlist;
- D->status = DASM_S_OK;
- D->section = &D->sections[0];
- memset((void *)D->lglabels, 0, D->lgsize);
- if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize);
- for (i = 0; i < D->maxsection; i++) {
- D->sections[i].pos = DASM_SEC2POS(i);
- D->sections[i].ofs = 0;
- }
-}
-
-
-#ifdef DASM_CHECKS
-#define CK(x, st) \
- do { if (!(x)) { \
- D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0)
-#define CKPL(kind, st) \
- do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \
- D->status=DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0)
-#else
-#define CK(x, st) ((void)0)
-#define CKPL(kind, st) ((void)0)
-#endif
-
-/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */
-void dasm_put(Dst_DECL, int start, ...)
-{
- va_list ap;
- dasm_State *D = Dst_REF;
- dasm_ActList p = D->actionlist + start;
- dasm_Section *sec = D->section;
- int pos = sec->pos, ofs = sec->ofs, mrm = 4;
- int *b;
-
- if (pos >= sec->epos) {
- DASM_M_GROW(Dst, int, sec->buf, sec->bsize,
- sec->bsize + 2*DASM_MAXSECPOS*sizeof(int));
- sec->rbuf = sec->buf - DASM_POS2BIAS(pos);
- sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos);
- }
-
- b = sec->rbuf;
- b[pos++] = start;
-
- va_start(ap, start);
- while (1) {
- int action = *p++;
- if (action < DASM_DISP) {
- ofs++;
- } else if (action <= DASM_REL_A) {
- int n = va_arg(ap, int);
- b[pos++] = n;
- switch (action) {
- case DASM_DISP:
- if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; }
- case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob;
- case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */
- case DASM_IMM_D: ofs += 4; break;
- case DASM_IMM_S: CK(((n+128)&-256) == 0, RANGE_I); goto ob;
- case DASM_IMM_B: CK((n&-256) == 0, RANGE_I); ob: ofs++; break;
- case DASM_IMM_WB: if (((n+128)&-256) == 0) goto ob;
- case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break;
- case DASM_SPACE: p++; ofs += n; break;
- case DASM_SETLABEL: b[pos-2] = -0x40000000; break; /* Neg. label ofs. */
- case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG);
- if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue;
- }
- mrm = 4;
- } else {
- int *pl, n;
- switch (action) {
- case DASM_REL_LG:
- case DASM_IMM_LG:
- n = *p++; pl = D->lglabels + n;
- /* Bkwd rel or global. */
- if (n <= 246) { CK(n>=10||*pl<0, RANGE_LG); CKPL(lg, LG); goto putrel; }
- pl -= 246; n = *pl;
- if (n < 0) n = 0; /* Start new chain for fwd rel if label exists. */
- goto linkrel;
- case DASM_REL_PC:
- case DASM_IMM_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
- putrel:
- n = *pl;
- if (n < 0) { /* Label exists. Get label pos and store it. */
- b[pos] = -n;
- } else {
- linkrel:
- b[pos] = n; /* Else link to rel chain, anchored at label. */
- *pl = pos;
- }
- pos++;
- ofs += 4; /* Maximum offset needed. */
- if (action == DASM_REL_LG || action == DASM_REL_PC)
- b[pos++] = ofs; /* Store pass1 offset estimate. */
- break;
- case DASM_LABEL_LG: pl = D->lglabels + *p++; CKPL(lg, LG); goto putlabel;
- case DASM_LABEL_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC);
- putlabel:
- n = *pl; /* n > 0: Collapse rel chain and replace with label pos. */
- while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos; }
- *pl = -pos; /* Label exists now. */
- b[pos++] = ofs; /* Store pass1 offset estimate. */
- break;
- case DASM_ALIGN:
- ofs += *p++; /* Maximum alignment needed (arg is 2**n-1). */
- b[pos++] = ofs; /* Store pass1 offset estimate. */
- break;
- case DASM_EXTERN: p += 2; ofs += 4; break;
- case DASM_ESC: p++; ofs++; break;
- case DASM_MARK: mrm = p[-2]; break;
- case DASM_SECTION:
- n = *p; CK(n < D->maxsection, RANGE_SEC); D->section = &D->sections[n];
- case DASM_STOP: goto stop;
- }
- }
- }
-stop:
- va_end(ap);
- sec->pos = pos;
- sec->ofs = ofs;
-}
-#undef CK
-
-/* Pass 2: Link sections, shrink branches/aligns, fix label offsets. */
-int dasm_link(Dst_DECL, size_t *szp)
-{
- dasm_State *D = Dst_REF;
- int secnum;
- int ofs = 0;
-
-#ifdef DASM_CHECKS
- *szp = 0;
- if (D->status != DASM_S_OK) return D->status;
- {
- int pc;
- for (pc = 0; pc*sizeof(int) < D->pcsize; pc++)
- if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc;
- }
-#endif
-
- { /* Handle globals not defined in this translation unit. */
- int idx;
- for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) {
- int n = D->lglabels[idx];
- /* Undefined label: Collapse rel chain and replace with marker (< 0). */
- while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; }
- }
- }
-
- /* Combine all code sections. No support for data sections (yet). */
- for (secnum = 0; secnum < D->maxsection; secnum++) {
- dasm_Section *sec = D->sections + secnum;
- int *b = sec->rbuf;
- int pos = DASM_SEC2POS(secnum);
- int lastpos = sec->pos;
-
- while (pos != lastpos) {
- dasm_ActList p = D->actionlist + b[pos++];
- while (1) {
- int op, action = *p++;
- switch (action) {
- case DASM_REL_LG: p++; op = p[-3]; goto rel_pc;
- case DASM_REL_PC: op = p[-2]; rel_pc: {
- int shrink = op == 0xe9 ? 3 : ((op&0xf0) == 0x80 ? 4 : 0);
- if (shrink) { /* Shrinkable branch opcode? */
- int lofs, lpos = b[pos];
- if (lpos < 0) goto noshrink; /* Ext global? */
- lofs = *DASM_POS2PTR(D, lpos);
- if (lpos > pos) { /* Fwd label: add cumulative section offsets. */
- int i;
- for (i = secnum; i < DASM_POS2SEC(lpos); i++)
- lofs += D->sections[i].ofs;
- } else {
- lofs -= ofs; /* Bkwd label: unfix offset. */
- }
- lofs -= b[pos+1]; /* Short branch ok? */
- if (lofs >= -128-shrink && lofs <= 127) ofs -= shrink; /* Yes. */
- else { noshrink: shrink = 0; } /* No, cannot shrink op. */
- }
- b[pos+1] = shrink;
- pos += 2;
- break;
- }
- case DASM_SPACE: case DASM_IMM_LG: case DASM_VREG: p++;
- case DASM_DISP: case DASM_IMM_S: case DASM_IMM_B: case DASM_IMM_W:
- case DASM_IMM_D: case DASM_IMM_WB: case DASM_IMM_DB:
- case DASM_SETLABEL: case DASM_REL_A: case DASM_IMM_PC: pos++; break;
- case DASM_LABEL_LG: p++;
- case DASM_LABEL_PC: b[pos++] += ofs; break; /* Fix label offset. */
- case DASM_ALIGN: ofs -= (b[pos++]+ofs)&*p++; break; /* Adjust ofs. */
- case DASM_EXTERN: p += 2; break;
- case DASM_ESC: p++; break;
- case DASM_MARK: break;
- case DASM_SECTION: case DASM_STOP: goto stop;
- }
- }
- stop: (void)0;
- }
- ofs += sec->ofs; /* Next section starts right after current section. */
- }
-
- D->codesize = ofs; /* Total size of all code sections */
- *szp = ofs;
- return DASM_S_OK;
-}
-
-#define dasmb(x) *cp++ = (unsigned char)(x)
-#ifndef DASM_ALIGNED_WRITES
-#define dasmw(x) \
- do { *((unsigned short *)cp) = (unsigned short)(x); cp+=2; } while (0)
-#define dasmd(x) \
- do { *((unsigned int *)cp) = (unsigned int)(x); cp+=4; } while (0)
-#else
-#define dasmw(x) do { dasmb(x); dasmb((x)>>8); } while (0)
-#define dasmd(x) do { dasmw(x); dasmw((x)>>16); } while (0)
-#endif
-
-/* Pass 3: Encode sections. */
-int dasm_encode(Dst_DECL, void *buffer)
-{
- dasm_State *D = Dst_REF;
- unsigned char *base = (unsigned char *)buffer;
- unsigned char *cp = base;
- int secnum;
-
- /* Encode all code sections. No support for data sections (yet). */
- for (secnum = 0; secnum < D->maxsection; secnum++) {
- dasm_Section *sec = D->sections + secnum;
- int *b = sec->buf;
- int *endb = sec->rbuf + sec->pos;
-
- while (b != endb) {
- dasm_ActList p = D->actionlist + *b++;
- unsigned char *mark = NULL;
- while (1) {
- int action = *p++;
- int n = (action >= DASM_DISP && action <= DASM_ALIGN) ? *b++ : 0;
- switch (action) {
- case DASM_DISP: if (!mark) mark = cp; {
- unsigned char *mm = mark;
- if (*p != DASM_IMM_DB && *p != DASM_IMM_WB) mark = NULL;
- if (n == 0) { int mrm = mm[-1]&7; if (mrm == 4) mrm = mm[0]&7;
- if (mrm != 5) { mm[-1] -= 0x80; break; } }
- if (((n+128) & -256) != 0) goto wd; else mm[-1] -= 0x40;
- }
- case DASM_IMM_S: case DASM_IMM_B: wb: dasmb(n); break;
- case DASM_IMM_DB: if (((n+128)&-256) == 0) {
- db: if (!mark) mark = cp; mark[-2] += 2; mark = NULL; goto wb;
- } else mark = NULL;
- case DASM_IMM_D: wd: dasmd(n); break;
- case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL;
- case DASM_IMM_W: dasmw(n); break;
- case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; }
- case DASM_REL_LG: p++; if (n >= 0) goto rel_pc;
- b++; n = (int)(ptrdiff_t)D->globals[-n];
- case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */
- case DASM_REL_PC: rel_pc: {
- int shrink = *b++;
- int *pb = DASM_POS2PTR(D, n); if (*pb < 0) { n = pb[1]; goto rel_a; }
- n = *pb - ((int)(cp-base) + 4-shrink);
- if (shrink == 0) goto wd;
- if (shrink == 4) { cp--; cp[-1] = *cp-0x10; } else cp[-1] = 0xeb;
- goto wb;
- }
- case DASM_IMM_LG:
- p++; if (n < 0) { n = (int)(ptrdiff_t)D->globals[-n]; goto wd; }
- case DASM_IMM_PC: {
- int *pb = DASM_POS2PTR(D, n);
- n = *pb < 0 ? pb[1] : (*pb + (int)(ptrdiff_t)base);
- goto wd;
- }
- case DASM_LABEL_LG: {
- int idx = *p++;
- if (idx >= 10)
- D->globals[idx] = (void *)(base + (*p == DASM_SETLABEL ? *b : n));
- break;
- }
- case DASM_LABEL_PC: case DASM_SETLABEL: break;
- case DASM_SPACE: { int fill = *p++; while (n--) *cp++ = fill; break; }
- case DASM_ALIGN:
- n = *p++;
- while (((cp-base) & n)) *cp++ = 0x90; /* nop */
- break;
- case DASM_EXTERN: n = DASM_EXTERN(Dst, cp, p[1], *p); p += 2; goto wd;
- case DASM_MARK: mark = cp; break;
- case DASM_ESC: action = *p++;
- default: *cp++ = action; break;
- case DASM_SECTION: case DASM_STOP: goto stop;
- }
- }
- stop: (void)0;
- }
- }
-
- if (base + D->codesize != cp) /* Check for phase errors. */
- return DASM_S_PHASE;
- return DASM_S_OK;
-}
-
-/* Get PC label offset. */
-int dasm_getpclabel(Dst_DECL, unsigned int pc)
-{
- dasm_State *D = Dst_REF;
- if (pc*sizeof(int) < D->pcsize) {
- int pos = D->pclabels[pc];
- if (pos < 0) return *DASM_POS2PTR(D, -pos);
- if (pos > 0) return -1; /* Undefined. */
- }
- return -2; /* Unused or out of range. */
-}
-
-#ifdef DASM_CHECKS
-/* Optional sanity checker to call between isolated encoding steps. */
-int dasm_checkstep(Dst_DECL, int secmatch)
-{
- dasm_State *D = Dst_REF;
- if (D->status == DASM_S_OK) {
- int i;
- for (i = 1; i <= 9; i++) {
- if (D->lglabels[i] > 0) { D->status = DASM_S_UNDEF_L|i; break; }
- D->lglabels[i] = 0;
- }
- }
- if (D->status == DASM_S_OK && secmatch >= 0 &&
- D->section != &D->sections[secmatch])
- D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections);
- return D->status;
-}
-#endif
-
diff --git a/lib/luajit/dynasm/dasm_x86.lua b/lib/luajit/dynasm/dasm_x86.lua
index 7ca061d22f..60f5211a33 100644
--- a/lib/luajit/dynasm/dasm_x86.lua
+++ b/lib/luajit/dynasm/dasm_x86.lua
@@ -11,9 +11,9 @@ local x64 = x64
local _info = {
arch = x64 and "x64" or "x86",
description = "DynASM x86/x64 module",
- version = "1.3.0",
- vernum = 10300,
- release = "2011-05-05",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
license = "MIT",
}
@@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl
local _s = string
local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char
local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub
-local concat, sort = table.concat, table.sort
+local concat, sort, remove = table.concat, table.sort, table.remove
local bit = bit or require("bit")
-local band, shl, shr = bit.band, bit.lshift, bit.rshift
+local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift
-- Inherited tables and callbacks.
local g_opt, g_arch
@@ -41,7 +41,7 @@ local action_names = {
-- int arg, 1 buffer pos:
"DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB",
-- action arg (1 byte), int arg, 1 buffer pos (reg/num):
- "VREG", "SPACE", -- !x64: VREG support NYI.
+ "VREG", "SPACE",
-- ptrdiff_t arg, 1 buffer pos (address): !x64
"SETLABEL", "REL_A",
-- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
@@ -83,6 +83,21 @@ local actargs = { 0 }
-- Current number of section buffer positions for dasm_put().
local secpos = 1
+-- VREG kind encodings, pre-shifted by 5 bits. Keys are the kind names given to wvreg().
+local map_vreg = { -- every value is a multiple of 0x20, i.e. bits 0..4 are clear
+ ["modrm.rm.m"] = 0x00,
+ ["modrm.rm.r"] = 0x20, -- same encoding as "opcode" and "sib.base"
+ ["opcode"] = 0x20,
+ ["sib.base"] = 0x20,
+ ["sib.index"] = 0x40,
+ ["modrm.reg"] = 0x80, -- 0x60 is not assigned to any kind
+ ["vex.v"] = 0xa0,
+ ["imm.hi"] = 0xc0,
+}
+
+-- Current number of VREG actions contributing to REX/VEX shrinkage.
+local vreg_shrink_count = 0 -- bumped by wvreg(); folded in (times 8) and reset on a non-deferred call
+
------------------------------------------------------------------------------
-- Compute action numbers for action names.
@@ -134,6 +149,21 @@ local function waction(action, a, num)
if a or num then secpos = secpos + (num or 1) end
end
+-- Optionally add a VREG action.
+--   kind:  key into map_vreg selecting the encoding for this register.
+--   vreg:  value handed to waction("VREG", ...); when nil/false nothing
+--          is emitted and the function returns immediately.
+--   psz:   added to the emitted kind byte (treated as 0 when omitted).
+--   sk:    threshold; a kind whose encoding is below it increments
+--          vreg_shrink_count (treated as 0 when omitted, i.e. never).
+--   defer: when truthy, the accumulated shrink count is kept for a later
+--          call instead of being folded into this action's byte.
+local function wvreg(kind, vreg, psz, sk, defer)
+  if not vreg then return end
+  waction("VREG", vreg)
+  -- Report the kind that failed the lookup, not the register expression.
+  local b = assert(map_vreg[kind], "bad vreg kind `"..kind.."'")
+  if b < (sk or 0) then
+    vreg_shrink_count = vreg_shrink_count + 1
+  end
+  if not defer then
+    -- Fold the pending shrink count into this byte and start over.
+    b = b + vreg_shrink_count * 8
+    vreg_shrink_count = 0
+  end
+  wputxb(b + (psz or 0))
+end
+
-- Add call to embedded DynASM C code.
local function wcall(func, args)
wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true)
@@ -299,7 +329,7 @@ local function mkrmap(sz, cl, names)
local iname = format("@%s%x%s", sz, i, needrex and "R" or "")
if needrex then map_reg_needrex[iname] = true end
local name
- if sz == "o" then name = format("xmm%d", i)
+ if sz == "o" or sz == "y" then name = format("%s%d", cl, i)
elseif sz == "f" then name = format("st%d", i)
else name = format("r%d%s", i, sz == addrsize and "" or sz) end
map_archdef[name] = iname
@@ -326,6 +356,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
map_reg_valid_index[map_archdef.esp] = false
if x64 then map_reg_valid_index[map_archdef.rsp] = false end
+if x64 then map_reg_needrex[map_archdef.Rb] = true end
map_archdef["Ra"] = "@"..addrsize
-- FP registers (internally tword sized, but use "f" as operand size).
@@ -334,21 +365,24 @@ mkrmap("f", "Rf")
-- SSE registers (oword sized, but qword and dword accessible).
mkrmap("o", "xmm")
+-- AVX registers (yword sized, but oword, qword and dword accessible).
+mkrmap("y", "ymm")
+
-- Operand size prefixes to codes.
local map_opsize = {
- byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t",
- aword = addrsize,
+ byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y",
+ tword = "t", aword = addrsize,
}
-- Operand size code to number.
local map_opsizenum = {
- b = 1, w = 2, d = 4, q = 8, o = 16, t = 10,
+ b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10,
}
-- Operand size code to name.
local map_opsizename = {
- b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword",
- f = "fpword",
+ b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword",
+ t = "tword", f = "fpword",
}
-- Valid index register scale factors.
@@ -460,9 +494,45 @@ local function wputszarg(sz, n)
end
-- Put multi-byte opcode with operand-size dependent modifications.
-local function wputop(sz, op, rex)
+local function wputop(sz, op, rex, vex, vregr, vregxb)
+ local psz, sk = 0, nil
+ if vex then
+ local tail
+ if vex.m == 1 and band(rex, 11) == 0 then
+ if x64 and vregxb then
+ sk = map_vreg["modrm.reg"]
+ else
+ wputb(0xc5)
+ tail = shl(bxor(band(rex, 4), 4), 5)
+ psz = 3
+ end
+ end
+ if not tail then
+ wputb(0xc4)
+ wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
+ tail = shl(band(rex, 8), 4)
+ psz = 4
+ end
+ local reg, vreg = 0, nil
+ if vex.v then
+ reg = vex.v.reg
+ if not reg then werror("bad vex operand") end
+ if reg < 0 then reg = 0; vreg = vex.v.vreg end
+ end
+ if sz == "y" or vex.l then tail = tail + 4 end
+ wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
+ wvreg("vex.v", vreg)
+ rex = 0
+ if op >= 256 then werror("bad vex opcode") end
+ else
+ if rex ~= 0 then
+ if not x64 then werror("bad operand size") end
+ elseif (vregr or vregxb) and x64 then
+ rex = 0x10
+ sk = map_vreg["vex.v"]
+ end
+ end
local r
- if rex ~= 0 and not x64 then werror("bad operand size") end
if sz == "w" then wputb(102) end
-- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
@@ -471,20 +541,20 @@ local function wputop(sz, op, rex)
if rex ~= 0 then
local opc3 = band(op, 0xffff00)
if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
- wputb(64 + band(rex, 15)); rex = 0
+ wputb(64 + band(rex, 15)); rex = 0; psz = 2
end
end
- wputb(shr(op, 16)); op = band(op, 0xffff)
+ wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
end
if op >= 256 then
local b = shr(op, 8)
- if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end
- wputb(b)
- op = band(op, 255)
+ if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
+ wputb(b); op = band(op, 255); psz = psz + 1
end
- if rex ~= 0 then wputb(64 + band(rex, 15)) end
+ if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
if sz == "b" then op = op - 1 end
wputb(op)
+ return psz, sk
end
-- Put ModRM or SIB formatted byte.
@@ -494,7 +564,7 @@ local function wputmodrm(m, s, rm, vs, vrm)
end
-- Put ModRM/SIB plus optional displacement.
-local function wputmrmsib(t, imark, s, vsreg)
+local function wputmrmsib(t, imark, s, vsreg, psz, sk)
local vreg, vxreg
local reg, xreg = t.reg, t.xreg
if reg and reg < 0 then reg = 0; vreg = t.vreg end
@@ -504,8 +574,8 @@ local function wputmrmsib(t, imark, s, vsreg)
-- Register mode.
if sub(t.mode, 1, 1) == "r" then
wputmodrm(3, s, reg)
- if vsreg then waction("VREG", vsreg); wputxb(2) end
- if vreg then waction("VREG", vreg); wputxb(0) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+ wvreg("modrm.rm.r", vreg, psz+1, sk)
return
end
@@ -519,21 +589,22 @@ local function wputmrmsib(t, imark, s, vsreg)
-- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
wputmodrm(0, s, 4)
if imark == "I" then waction("MARK") end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
wputmodrm(t.xsc, xreg, 5)
- if vxreg then waction("VREG", vxreg); wputxb(3) end
+ wvreg("sib.index", vxreg, psz+2, sk)
else
-- Pure 32 bit displacement.
if x64 and tdisp ~= "table" then
wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
+ wvreg("modrm.reg", vsreg, psz+1, sk)
if imark == "I" then waction("MARK") end
wputmodrm(0, 4, 5)
else
riprel = x64
wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
+ wvreg("modrm.reg", vsreg, psz+1, sk)
if imark == "I" then waction("MARK") end
end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
end
if riprel then -- Emit rip-relative displacement.
if match("UWSiI", imark) then
@@ -561,16 +632,16 @@ local function wputmrmsib(t, imark, s, vsreg)
if xreg or band(reg, 7) == 4 then
wputmodrm(m or 2, s, 4) -- ModRM.
if m == nil or imark == "I" then waction("MARK") end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
- if vxreg then waction("VREG", vxreg); wputxb(3) end
- if vreg then waction("VREG", vreg); wputxb(1) end
+ wvreg("sib.index", vxreg, psz+2, sk, vreg)
+ wvreg("sib.base", vreg, psz+2, sk)
else
wputmodrm(m or 2, s, reg) -- ModRM.
if (imark == "I" and (m == 1 or m == 2)) or
(m == nil and (vsreg or vreg)) then waction("MARK") end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
- if vreg then waction("VREG", vreg); wputxb(1) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+ wvreg("modrm.rm.m", vreg, psz+1, sk)
end
-- Put displacement.
@@ -881,9 +952,15 @@ end
-- "m"/"M" generates ModRM/SIB from the 1st/2nd operand.
-- The spare 3 bits are either filled with the last hex digit or
-- the result from a previous "r"/"R". The opcode is restored.
+-- "u" Use VEX encoding, vvvv unused.
+-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is
+-- removed from the list used by future characters).
+-- "L" Force VEX.L
--
-- All of the following characters force a flush of the opcode:
-- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand.
+-- "s" stores a 4 bit immediate from the last register operand,
+-- followed by 4 zero bits.
-- "S" stores a signed 8 bit immediate from the last operand.
-- "U" stores an unsigned 8 bit immediate from the last operand.
-- "W" stores an unsigned 16 bit immediate from the last operand.
@@ -1081,10 +1158,11 @@ local map_op = {
btr_2 = "mrqdw:0FB3Rm|miqdw:0FBA6mU",
bts_2 = "mrqdw:0FABRm|miqdw:0FBA5mU",
- shld_3 = "mriqdw:0FA4RmU|mrCqdw:0FA5Rm",
- shrd_3 = "mriqdw:0FACRmU|mrCqdw:0FADRm",
+ shld_3 = "mriqdw:0FA4RmU|mrC/qq:0FA5Rm|mrC/dd:|mrC/ww:",
+ shrd_3 = "mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:",
rdtsc_0 = "0F31", -- P1+
+ rdpmc_0 = "0F33", -- P6+
cpuid_0 = "0FA2", -- P1+
-- floating point ops
@@ -1190,7 +1268,7 @@ local map_op = {
cvtsi2sd_2 = "rm/od:F20F2ArM|rm/oq:F20F2ArXM",
cvtsi2ss_2 = "rm/od:F30F2ArM|rm/oq:F30F2ArXM",
cvtss2sd_2 = "rro:F30F5ArM|rx/od:",
- cvtss2si_2 = "rr/do:F20F2CrM|rr/qo:|rxd:|rx/qd:",
+ cvtss2si_2 = "rr/do:F30F2DrM|rr/qo:|rxd:|rx/qd:",
cvttpd2dq_2 = "rmo:660FE6rM",
cvttps2dq_2 = "rmo:F30F5BrM",
cvttsd2si_2 = "rr/do:F20F2CrM|rr/qo:|rx/dq:|rxq:",
@@ -1225,46 +1303,14 @@ local map_op = {
movups_2 = "rmo:0F10rM|mro:0F11Rm",
orpd_2 = "rmo:660F56rM",
orps_2 = "rmo:0F56rM",
- packssdw_2 = "rmo:660F6BrM",
- packsswb_2 = "rmo:660F63rM",
- packuswb_2 = "rmo:660F67rM",
- paddb_2 = "rmo:660FFCrM",
- paddd_2 = "rmo:660FFErM",
- paddq_2 = "rmo:660FD4rM",
- paddsb_2 = "rmo:660FECrM",
- paddsw_2 = "rmo:660FEDrM",
- paddusb_2 = "rmo:660FDCrM",
- paddusw_2 = "rmo:660FDDrM",
- paddw_2 = "rmo:660FFDrM",
- pand_2 = "rmo:660FDBrM",
- pandn_2 = "rmo:660FDFrM",
pause_0 = "F390",
- pavgb_2 = "rmo:660FE0rM",
- pavgw_2 = "rmo:660FE3rM",
- pcmpeqb_2 = "rmo:660F74rM",
- pcmpeqd_2 = "rmo:660F76rM",
- pcmpeqw_2 = "rmo:660F75rM",
- pcmpgtb_2 = "rmo:660F64rM",
- pcmpgtd_2 = "rmo:660F66rM",
- pcmpgtw_2 = "rmo:660F65rM",
- pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nrMU", -- Mem op: SSE4.1 only.
+ pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
- pmaddwd_2 = "rmo:660FF5rM",
- pmaxsw_2 = "rmo:660FEErM",
- pmaxub_2 = "rmo:660FDErM",
- pminsw_2 = "rmo:660FEArM",
- pminub_2 = "rmo:660FDArM",
pmovmskb_2 = "rr/do:660FD7rM",
- pmulhuw_2 = "rmo:660FE4rM",
- pmulhw_2 = "rmo:660FE5rM",
- pmullw_2 = "rmo:660FD5rM",
- pmuludq_2 = "rmo:660FF4rM",
- por_2 = "rmo:660FEBrM",
prefetchnta_1 = "xb:n0F180m",
prefetcht0_1 = "xb:n0F181m",
prefetcht1_1 = "xb:n0F182m",
prefetcht2_1 = "xb:n0F183m",
- psadbw_2 = "rmo:660FF6rM",
pshufd_3 = "rmio:660F70rMU",
pshufhw_3 = "rmio:F30F70rMU",
pshuflw_3 = "rmio:F20F70rMU",
@@ -1278,23 +1324,6 @@ local map_op = {
psrldq_2 = "rio:660F733mU",
psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
- psubb_2 = "rmo:660FF8rM",
- psubd_2 = "rmo:660FFArM",
- psubq_2 = "rmo:660FFBrM",
- psubsb_2 = "rmo:660FE8rM",
- psubsw_2 = "rmo:660FE9rM",
- psubusb_2 = "rmo:660FD8rM",
- psubusw_2 = "rmo:660FD9rM",
- psubw_2 = "rmo:660FF9rM",
- punpckhbw_2 = "rmo:660F68rM",
- punpckhdq_2 = "rmo:660F6ArM",
- punpckhqdq_2 = "rmo:660F6DrM",
- punpckhwd_2 = "rmo:660F69rM",
- punpcklbw_2 = "rmo:660F60rM",
- punpckldq_2 = "rmo:660F62rM",
- punpcklqdq_2 = "rmo:660F6CrM",
- punpcklwd_2 = "rmo:660F61rM",
- pxor_2 = "rmo:660FEFrM",
rcpps_2 = "rmo:0F53rM",
rcpss_2 = "rro:F30F53rM|rx/od:",
rsqrtps_2 = "rmo:0F52rM",
@@ -1352,7 +1381,7 @@ local map_op = {
dpps_3 = "rmio:660F3A40rMU",
extractps_3 = "mri/do:660F3A17RmU|rri/qo:660F3A17RXmU",
insertps_3 = "rrio:660F3A41rMU|rxi/od:",
- movntdqa_2 = "rmo:660F382ArM",
+ movntdqa_2 = "rxo:660F382ArM",
mpsadbw_3 = "rmio:660F3A42rMU",
packusdw_2 = "rmo:660F382BrM",
pblendvb_3 = "rmRo:660F3810rM",
@@ -1412,6 +1441,238 @@ local map_op = {
movntsd_2 = "xr/qo:nF20F2BRm",
movntss_2 = "xr/do:F30F2BRm",
-- popcnt is also in SSE4.2
+
+ -- AES-NI
+ aesdec_2 = "rmo:660F38DErM",
+ aesdeclast_2 = "rmo:660F38DFrM",
+ aesenc_2 = "rmo:660F38DCrM",
+ aesenclast_2 = "rmo:660F38DDrM",
+ aesimc_2 = "rmo:660F38DBrM",
+ aeskeygenassist_3 = "rmio:660F3ADFrMU",
+ pclmulqdq_3 = "rmio:660F3A44rMU",
+
+ -- AVX FP ops
+ vaddsubpd_3 = "rrmoy:660FVD0rM",
+ vaddsubps_3 = "rrmoy:F20FVD0rM",
+ vandpd_3 = "rrmoy:660FV54rM",
+ vandps_3 = "rrmoy:0FV54rM",
+ vandnpd_3 = "rrmoy:660FV55rM",
+ vandnps_3 = "rrmoy:0FV55rM",
+ vblendpd_4 = "rrmioy:660F3AV0DrMU",
+ vblendps_4 = "rrmioy:660F3AV0CrMU",
+ vblendvpd_4 = "rrmroy:660F3AV4BrMs",
+ vblendvps_4 = "rrmroy:660F3AV4ArMs",
+ vbroadcastf128_2 = "rx/yo:660F38u1ArM",
+ vcmppd_4 = "rrmioy:660FVC2rMU",
+ vcmpps_4 = "rrmioy:0FVC2rMU",
+ vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:",
+ vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:",
+ vcomisd_2 = "rro:660Fu2FrM|rx/oq:",
+ vcomiss_2 = "rro:0Fu2FrM|rx/od:",
+ vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:",
+ vcvtdq2ps_2 = "rmoy:0Fu5BrM",
+ vcvtpd2dq_2 = "rmoy:F20FuE6rM",
+ vcvtpd2ps_2 = "rmoy:660Fu5ArM",
+ vcvtps2dq_2 = "rmoy:660Fu5BrM",
+ vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:",
+ vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:",
+ vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:",
+ vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM",
+ vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM",
+ vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:",
+ vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:",
+ vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM",
+ vcvttps2dq_2 = "rmoy:F30Fu5BrM",
+ vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:",
+ vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:",
+ vdppd_4 = "rrmio:660F3AV41rMU",
+ vdpps_4 = "rrmioy:660F3AV40rMU",
+ vextractf128_3 = "mri/oy:660F3AuL19RmU",
+ vextractps_3 = "mri/do:660F3Au17RmU",
+ vhaddpd_3 = "rrmoy:660FV7CrM",
+ vhaddps_3 = "rrmoy:F20FV7CrM",
+ vhsubpd_3 = "rrmoy:660FV7DrM",
+ vhsubps_3 = "rrmoy:F20FV7DrM",
+ vinsertf128_4 = "rrmi/yyo:660F3AV18rMU",
+ vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:",
+ vldmxcsr_1 = "xd:0FuAE2m",
+ vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm",
+ vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm",
+ vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm",
+ vmovaps_2 = "rmoy:0Fu28rM|mroy:0Fu29Rm",
+ vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:",
+ vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm",
+ vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:",
+ vmovhlps_3 = "rrro:0FV12rM",
+ vmovhpd_2 = "xr/qo:660Fu17Rm",
+ vmovhpd_3 = "rrx/ooq:660FV16rM",
+ vmovhps_2 = "xr/qo:0Fu17Rm",
+ vmovhps_3 = "rrx/ooq:0FV16rM",
+ vmovlhps_3 = "rrro:0FV16rM",
+ vmovlpd_2 = "xr/qo:660Fu13Rm",
+ vmovlpd_3 = "rrx/ooq:660FV12rM",
+ vmovlps_2 = "xr/qo:0Fu13Rm",
+ vmovlps_3 = "rrx/ooq:0FV12rM",
+ vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM",
+ vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM",
+ vmovntpd_2 = "xroy:660Fu2BRm",
+ vmovntps_2 = "xroy:0Fu2BRm",
+ vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm",
+ vmovsd_3 = "rrro:F20FV10rM",
+ vmovshdup_2 = "rmoy:F30Fu16rM",
+ vmovsldup_2 = "rmoy:F30Fu12rM",
+ vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm",
+ vmovss_3 = "rrro:F30FV10rM",
+ vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm",
+ vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm",
+ vorpd_3 = "rrmoy:660FV56rM",
+ vorps_3 = "rrmoy:0FV56rM",
+ vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU",
+ vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU",
+ vperm2f128_4 = "rrmiy:660F3AV06rMU",
+ vptestpd_2 = "rmoy:660F38u0FrM",
+ vptestps_2 = "rmoy:660F38u0ErM",
+ vrcpps_2 = "rmoy:0Fu53rM",
+ vrcpss_3 = "rrro:F30FV53rM|rrx/ood:",
+ vrsqrtps_2 = "rmoy:0Fu52rM",
+ vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:",
+ vroundpd_3 = "rmioy:660F3Au09rMU", -- 2 reg/mem operands + imm: VEX.vvvv unused.
+ vroundps_3 = "rmioy:660F3Au08rMU", -- Ditto (`V' would steal the source operand).
+ vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:",
+ vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:",
+ vshufpd_4 = "rrmioy:660FVC6rMU",
+ vshufps_4 = "rrmioy:0FVC6rMU",
+ vsqrtps_2 = "rmoy:0Fu51rM",
+ vsqrtss_2 = "rro:F30Fu51rM|rx/od:",
+ vsqrtpd_2 = "rmoy:660Fu51rM",
+ vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:",
+ vstmxcsr_1 = "xd:0FuAE3m",
+ vucomisd_2 = "rro:660Fu2ErM|rx/oq:",
+ vucomiss_2 = "rro:0Fu2ErM|rx/od:",
+ vunpckhpd_3 = "rrmoy:660FV15rM",
+ vunpckhps_3 = "rrmoy:0FV15rM",
+ vunpcklpd_3 = "rrmoy:660FV14rM",
+ vunpcklps_3 = "rrmoy:0FV14rM",
+ vxorpd_3 = "rrmoy:660FV57rM",
+ vxorps_3 = "rrmoy:0FV57rM",
+ vzeroall_0 = "0FuL77",
+ vzeroupper_0 = "0Fu77",
+
+ -- AVX2 FP ops
+ vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:",
+ vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:",
+ -- *vgather* (!vsib)
+ vpermpd_3 = "rmiy:660F3AuX01rMU",
+ vpermps_3 = "rrmy:660F38V16rM",
+
+ -- AVX, AVX2 integer ops
+ -- In general, xmm requires AVX, ymm requires AVX2.
+ vaesdec_3 = "rrmo:660F38VDErM",
+ vaesdeclast_3 = "rrmo:660F38VDFrM",
+ vaesenc_3 = "rrmo:660F38VDCrM",
+ vaesenclast_3 = "rrmo:660F38VDDrM",
+ vaesimc_2 = "rmo:660F38uDBrM",
+ vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
+ vlddqu_2 = "rxoy:F20FuF0rM",
+ vmaskmovdqu_2 = "rro:660FuF7rM",
+ vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
+ vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm",
+ vmovntdq_2 = "xroy:660FuE7Rm",
+ vmovntdqa_2 = "rxoy:660F38u2ArM",
+ vmpsadbw_4 = "rrmioy:660F3AV42rMU",
+ vpabsb_2 = "rmoy:660F38u1CrM",
+ vpabsd_2 = "rmoy:660F38u1ErM",
+ vpabsw_2 = "rmoy:660F38u1DrM",
+ vpackusdw_3 = "rrmoy:660F38V2BrM",
+ vpalignr_4 = "rrmioy:660F3AV0FrMU",
+ vpblendvb_4 = "rrmroy:660F3AV4CrMs",
+ vpblendw_4 = "rrmioy:660F3AV0ErMU",
+ vpclmulqdq_4 = "rrmio:660F3AV44rMU",
+ vpcmpeqq_3 = "rrmoy:660F38V29rM",
+ vpcmpestri_3 = "rmio:660F3Au61rMU",
+ vpcmpestrm_3 = "rmio:660F3Au60rMU",
+ vpcmpgtq_3 = "rrmoy:660F38V37rM",
+ vpcmpistri_3 = "rmio:660F3Au63rMU",
+ vpcmpistrm_3 = "rmio:660F3Au62rMU",
+ vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:",
+ vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU",
+ vpextrd_3 = "mri/do:660F3Au16RmU",
+ vpextrq_3 = "mri/qo:660F3Au16RmU",
+ vphaddw_3 = "rrmoy:660F38V01rM",
+ vphaddd_3 = "rrmoy:660F38V02rM",
+ vphaddsw_3 = "rrmoy:660F38V03rM",
+ vphminposuw_2 = "rmo:660F38u41rM",
+ vphsubw_3 = "rrmoy:660F38V05rM",
+ vphsubd_3 = "rrmoy:660F38V06rM",
+ vphsubsw_3 = "rrmoy:660F38V07rM",
+ vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:",
+ vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:",
+ vpinsrd_4 = "rrmi/ood:660F3AV22rMU",
+ vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU",
+ vpmaddubsw_3 = "rrmoy:660F38V04rM",
+ vpmaxsb_3 = "rrmoy:660F38V3CrM",
+ vpmaxsd_3 = "rrmoy:660F38V3DrM",
+ vpmaxuw_3 = "rrmoy:660F38V3ErM",
+ vpmaxud_3 = "rrmoy:660F38V3FrM",
+ vpminsb_3 = "rrmoy:660F38V38rM",
+ vpminsd_3 = "rrmoy:660F38V39rM",
+ vpminuw_3 = "rrmoy:660F38V3ArM",
+ vpminud_3 = "rrmoy:660F38V3BrM",
+ vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM",
+ vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:",
+ vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:",
+ vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:",
+ vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:",
+ vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:",
+ vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:",
+ vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:",
+ vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:",
+ vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:",
+ vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:",
+ vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:",
+ vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:",
+ vpmuldq_3 = "rrmoy:660F38V28rM",
+ vpmulhrsw_3 = "rrmoy:660F38V0BrM",
+ vpmulld_3 = "rrmoy:660F38V40rM",
+ vpshufb_3 = "rrmoy:660F38V00rM",
+ vpshufd_3 = "rmioy:660Fu70rMU",
+ vpshufhw_3 = "rmioy:F30Fu70rMU",
+ vpshuflw_3 = "rmioy:F20Fu70rMU",
+ vpsignb_3 = "rrmoy:660F38V08rM",
+ vpsignw_3 = "rrmoy:660F38V09rM",
+ vpsignd_3 = "rrmoy:660F38V0ArM",
+ vpslldq_3 = "rrioy:660Fv737mU",
+ vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU",
+ vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU",
+ vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU",
+ vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU",
+ vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU",
+ vpsrldq_3 = "rrioy:660Fv733mU",
+ vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU",
+ vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU",
+ vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU",
+ vptest_2 = "rmoy:660F38u17rM",
+
+ -- AVX2 integer ops
+ vbroadcasti128_2 = "rx/yo:660F38u5ArM",
+ vinserti128_4 = "rrmi/yyo:660F3AV38rMU",
+ vextracti128_3 = "mri/oy:660F3AuL39RmU",
+ vpblendd_4 = "rrmioy:660F3AV02rMU",
+ vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:",
+ vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:",
+ vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:",
+ vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:",
+ vpermd_3 = "rrmy:660F38V36rM",
+ vpermq_3 = "rmiy:660F3AuX00rMU",
+ -- *vpgather* (!vsib)
+ vperm2i128_4 = "rrmiy:660F3AV46rMU",
+ vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm",
+ vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm",
+ vpsllvd_3 = "rrmoy:660F38V47rM",
+ vpsllvq_3 = "rrmoy:660F38VX47rM",
+ vpsravd_3 = "rrmoy:660F38V46rM",
+ vpsrlvd_3 = "rrmoy:660F38V45rM",
+ vpsrlvq_3 = "rrmoy:660F38VX45rM",
}
------------------------------------------------------------------------------
@@ -1462,28 +1723,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do
map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+
end
--- SSE FP arithmetic ops.
+-- SSE / AVX FP arithmetic ops.
for name,n in pairs{ sqrt = 1, add = 8, mul = 9,
sub = 12, min = 13, div = 14, max = 15 } do
map_op[name.."ps_2"] = format("rmo:0F5%XrM", n)
map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n)
map_op[name.."pd_2"] = format("rmo:660F5%XrM", n)
map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n)
+ if n ~= 1 then
+ map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n)
+ map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n)
+ map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n)
+ map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n)
+ end
+end
+
+-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf).
+for name,n in pairs{
+ paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4,
+ paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B,
+ packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC,
+ paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0,
+ pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76,
+ pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66,
+ pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE,
+ pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA,
+ pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5,
+ pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8,
+ psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8,
+ psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9,
+ punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A,
+ punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61,
+ punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF
+} do
+ map_op[name.."_2"] = format("rmo:660F%02XrM", n)
+ map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n)
end
------------------------------------------------------------------------------
+local map_vexarg = { u = false, v = 1, V = 2 }
+
-- Process pattern string.
local function dopattern(pat, args, sz, op, needrex)
- local digit, addin
+ local digit, addin, vex
local opcode = 0
local szov = sz
local narg = 1
local rex = 0
-- Limit number of section buffer positions used by a single dasm_put().
- -- A single opcode needs a maximum of 5 positions.
- if secpos+5 > maxsecpos then wflush() end
+ -- A single opcode needs a maximum of 6 positions.
+ if secpos+6 > maxsecpos then wflush() end
-- Process each character.
for c in gmatch(pat.."|", ".") do
@@ -1497,6 +1788,8 @@ local function dopattern(pat, args, sz, op, needrex)
szov = nil
elseif c == "X" then -- Force REX.W.
rex = 8
+ elseif c == "L" then -- Force VEX.L.
+ vex.l = true
elseif c == "r" then -- Merge 1st operand regno. into opcode.
addin = args[1]; opcode = opcode + (addin.reg % 8)
if narg < 2 then narg = 2 end
@@ -1520,21 +1813,42 @@ local function dopattern(pat, args, sz, op, needrex)
if t.xreg and t.xreg > 7 then rex = rex + 2 end
if s > 7 then rex = rex + 4 end
if needrex then rex = rex + 16 end
- wputop(szov, opcode, rex); opcode = nil
+ local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
+ opcode = nil
local imark = sub(pat, -1) -- Force a mark (ugly).
-- Put ModRM/SIB with regno/last digit as spare.
- wputmrmsib(t, imark, s, addin and addin.vreg)
+ wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
addin = nil
+ elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
+ local b = band(opcode, 255); opcode = shr(opcode, 8)
+ local m = 1
+ if b == 0x38 then m = 2
+ elseif b == 0x3a then m = 3 end
+ if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end
+ if b ~= 0x0f then
+ werror("expected `0F', `0F38', or `0F3A' to precede `"..c..
+ "' in pattern `"..pat.."' for `"..op.."'")
+ end
+ local v = map_vexarg[c]
+ if v then v = remove(args, v) end
+ b = band(opcode, 255)
+ local p = 0
+ if b == 0x66 then p = 1
+ elseif b == 0xf3 then p = 2
+ elseif b == 0xf2 then p = 3 end
+ if p ~= 0 then opcode = shr(opcode, 8) end
+ if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end
+ vex = { m = m, p = p, v = v }
else
if opcode then -- Flush opcode.
if szov == "q" and rex == 0 then rex = rex + 8 end
if needrex then rex = rex + 16 end
if addin and addin.reg == -1 then
- wputop(szov, opcode - 7, rex)
- waction("VREG", addin.vreg); wputxb(0)
+ local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
+ wvreg("opcode", addin.vreg, psz, sk)
else
if addin and addin.reg > 7 then rex = rex + 1 end
- wputop(szov, opcode, rex)
+ wputop(szov, opcode, rex, vex)
end
opcode = nil
end
@@ -1571,6 +1885,14 @@ local function dopattern(pat, args, sz, op, needrex)
else
wputlabel("REL_", imm, 2)
end
+ elseif c == "s" then
+ local reg = a.reg
+ if reg < 0 then
+ wputb(0)
+ wvreg("imm.hi", a.vreg)
+ else
+ wputb(shl(reg, 4))
+ end
else
werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'")
end
@@ -1647,11 +1969,14 @@ map_op[".template__"] = function(params, template, nparams)
if pat == "" then pat = lastpat else lastpat = pat end
if matchtm(tm, args) then
local prefix = sub(szm, 1, 1)
- if prefix == "/" then -- Match both operand sizes.
- if args[1].opsize == sub(szm, 2, 2) and
- args[2].opsize == sub(szm, 3, 3) then
- dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
- return
+ if prefix == "/" then -- Exactly match leading operand sizes.
+ for i = #szm,1,-1 do
+ if i == 1 then
+ dopattern(pat, args, sz, params.op, needrex) -- Process pattern.
+ return
+ elseif args[i-1].opsize ~= sub(szm, i, i) then
+ break
+ end
end
else -- Match common operand size.
local szp = sz
@@ -1716,8 +2041,8 @@ if x64 then
rex = a.reg > 7 and 9 or 8
end
end
- wputop(sz, opcode, rex)
- if vreg then waction("VREG", vreg); wputxb(0) end
+ local psz, sk = wputop(sz, opcode, rex, nil, vreg)
+ wvreg("opcode", vreg, psz, sk)
waction("IMM_D", format("(unsigned int)(%s)", op64))
waction("IMM_D", format("(unsigned int)((%s)>>32)", op64))
end
diff --git a/lib/luajit/dynasm/dynasm.lua b/lib/luajit/dynasm/dynasm.lua
index fffda7513c..145fb0cc6d 100644
--- a/lib/luajit/dynasm/dynasm.lua
+++ b/lib/luajit/dynasm/dynasm.lua
@@ -10,9 +10,9 @@
local _info = {
name = "DynASM",
description = "A dynamic assembler for code generation engines",
- version = "1.3.0",
- vernum = 10300,
- release = "2011-05-05",
+ version = "1.4.0",
+ vernum = 10400,
+ release = "2015-10-18",
author = "Mike Pall",
url = "http://luajit.org/dynasm.html",
license = "MIT",
diff --git a/lib/luajit/src/Makefile b/lib/luajit/src/Makefile
index 532da6e94d..9845f6a0e2 100644
--- a/lib/luajit/src/Makefile
+++ b/lib/luajit/src/Makefile
@@ -24,11 +24,13 @@ NODOTABIVER= 51
# removing the '#' in front of them. Make sure you force a full recompile
# with "make clean", followed by "make" if you change any options.
#
+DEFAULT_CC = gcc
+#
# LuaJIT builds as a native 32 or 64 bit binary by default.
-CC= gcc
+CC= $(DEFAULT_CC)
#
# Use this if you want to force a 32 bit build on a 64 bit multilib OS.
-#CC= gcc -m32
+#CC= $(DEFAULT_CC) -m32
#
# Since the assembler part does NOT maintain a frame pointer, it's pointless
# to slow down the C part by not omitting it. Debugging, tracebacks and
@@ -147,6 +149,29 @@ XCFLAGS=
# You probably don't need to change anything below this line!
##############################################################################
+##############################################################################
+# Host system detection.
+##############################################################################
+
+ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM))
+ HOST_SYS= Windows
+ HOST_RM= del
+else
+ HOST_SYS:= $(shell uname -s)
+ ifneq (,$(findstring MINGW,$(HOST_SYS)))
+ HOST_SYS= Windows
+ HOST_MSYS= mingw
+ endif
+ ifneq (,$(findstring CYGWIN,$(HOST_SYS)))
+ HOST_SYS= Windows
+ HOST_MSYS= cygwin
+ endif
+ # Use Clang for OSX host.
+ ifeq (Darwin,$(HOST_SYS))
+ DEFAULT_CC= clang
+ endif
+endif
+
##############################################################################
# Flags and options for host and target.
##############################################################################
@@ -268,24 +293,9 @@ ifneq (,$(LMULTILIB))
endif
##############################################################################
-# System detection.
+# Target system detection.
##############################################################################
-ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM))
- HOST_SYS= Windows
- HOST_RM= del
-else
- HOST_SYS:= $(shell uname -s)
- ifneq (,$(findstring MINGW,$(HOST_SYS)))
- HOST_SYS= Windows
- HOST_MSYS= mingw
- endif
- ifneq (,$(findstring CYGWIN,$(HOST_SYS)))
- HOST_SYS= Windows
- HOST_MSYS= cygwin
- endif
-endif
-
TARGET_SYS?= $(HOST_SYS)
ifeq (Windows,$(TARGET_SYS))
TARGET_STRIP+= --strip-unneeded
diff --git a/lib/luajit/src/host/buildvm_asm.c b/lib/luajit/src/host/buildvm_asm.c
index 9b7ae53a26..9b1194259a 100644
--- a/lib/luajit/src/host/buildvm_asm.c
+++ b/lib/luajit/src/host/buildvm_asm.c
@@ -261,11 +261,20 @@ void emit_asm(BuildCtx *ctx)
#if LJ_TARGET_ARM && defined(__GNUC__) && !LJ_NO_UNWIND
/* This should really be moved into buildvm_arm.dasc. */
+#if LJ_ARCH_HASFPU
+ fprintf(ctx->fp,
+ ".fnstart\n"
+ ".save {r5, r6, r7, r8, r9, r10, r11, lr}\n"
+ ".vsave {d8-d15}\n"
+ ".save {r4}\n"
+ ".pad #28\n");
+#else
fprintf(ctx->fp,
".fnstart\n"
".save {r4, r5, r6, r7, r8, r9, r10, r11, lr}\n"
".pad #28\n");
#endif
+#endif
#if LJ_TARGET_MIPS
fprintf(ctx->fp, ".set nomips16\n.abicalls\n.set noreorder\n.set nomacro\n");
#endif
diff --git a/lib/luajit/src/jit/dis_x86.lua b/lib/luajit/src/jit/dis_x86.lua
index 6bc38066fe..a7c05ed6d5 100644
--- a/lib/luajit/src/jit/dis_x86.lua
+++ b/lib/luajit/src/jit/dis_x86.lua
@@ -15,13 +15,12 @@
-- Intel and AMD manuals. The supported instruction set is quite extensive
-- and reflects what a current generation Intel or AMD CPU implements in
-- 32 bit and 64 bit mode. Yes, this includes MMX, SSE, SSE2, SSE3, SSSE3,
--- SSE4.1, SSE4.2, SSE4a and even privileged and hypervisor (VMX/SVM)
--- instructions.
+-- SSE4.1, SSE4.2, SSE4a, AVX, AVX2 and even privileged and hypervisor
+-- (VMX/SVM) instructions.
--
-- Notes:
-- * The (useless) a16 prefix, 3DNow and pre-586 opcodes are unsupported.
-- * No attempt at optimization has been made -- it's fast enough for my needs.
--- * The public API may change when more architectures are added.
------------------------------------------------------------------------------
local type = type
@@ -78,7 +77,7 @@ local map_opc1_32 = {
"movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi",
"movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI",
--Cx
-"shift!Bmu","shift!Vmu","retBw","ret","$lesVrm","$ldsVrm","movBmi","movVmi",
+"shift!Bmu","shift!Vmu","retBw","ret","vex*3$lesVrm","vex*2$ldsVrm","movBmi","movVmi",
"enterBwu","leave","retfBw","retf","int3","intBu","into","iretVS",
--Dx
"shift!Bm1","shift!Vm1","shift!Bmc","shift!Vmc","aamBu","aadBu","salc","xlatb",
@@ -103,7 +102,7 @@ local map_opc1_64 = setmetatable({
[0x44]="rex*r", [0x45]="rex*rb", [0x46]="rex*rx", [0x47]="rex*rxb",
[0x48]="rex*w", [0x49]="rex*wb", [0x4a]="rex*wx", [0x4b]="rex*wxb",
[0x4c]="rex*wr", [0x4d]="rex*wrb", [0x4e]="rex*wrx", [0x4f]="rex*wrxb",
- [0x82]=false, [0x9a]=false, [0xc4]=false, [0xc5]=false, [0xce]=false,
+ [0x82]=false, [0x9a]=false, [0xc4]="vex*3", [0xc5]="vex*2", [0xce]=false,
[0xd4]=false, [0xd5]=false, [0xd6]=false, [0xea]=false,
}, { __index = map_opc1_32 })
@@ -114,12 +113,12 @@ local map_opc2 = {
[0]="sldt!Dmp","sgdt!Ump","larVrm","lslVrm",nil,"syscall","clts","sysret",
"invd","wbinvd",nil,"ud1",nil,"$prefetch!Bm","femms","3dnowMrmu",
--1x
-"movupsXrm|movssXrm|movupdXrm|movsdXrm",
-"movupsXmr|movssXmr|movupdXmr|movsdXmr",
+"movupsXrm|movssXrvm|movupdXrm|movsdXrvm",
+"movupsXmr|movssXmvr|movupdXmr|movsdXmvr",
"movhlpsXrm$movlpsXrm|movsldupXrm|movlpdXrm|movddupXrm",
"movlpsXmr||movlpdXmr",
-"unpcklpsXrm||unpcklpdXrm",
-"unpckhpsXrm||unpckhpdXrm",
+"unpcklpsXrvm||unpcklpdXrvm",
+"unpckhpsXrvm||unpckhpdXrvm",
"movlhpsXrm$movhpsXrm|movshdupXrm|movhpdXrm",
"movhpsXmr||movhpdXmr",
"$prefetcht!Bm","hintnopVm","hintnopVm","hintnopVm",
@@ -128,7 +127,7 @@ local map_opc2 = {
"movUmx$","movUmy$","movUxm$","movUym$","movUmz$",nil,"movUzm$",nil,
"movapsXrm||movapdXrm",
"movapsXmr||movapdXmr",
-"cvtpi2psXrMm|cvtsi2ssXrVmt|cvtpi2pdXrMm|cvtsi2sdXrVmt",
+"cvtpi2psXrMm|cvtsi2ssXrvVmt|cvtpi2pdXrMm|cvtsi2sdXrvVmt",
"movntpsXmr|movntssXmr|movntpdXmr|movntsdXmr",
"cvttps2piMrXm|cvttss2siVrXm|cvttpd2piMrXm|cvttsd2siVrXm",
"cvtps2piMrXm|cvtss2siVrXm|cvtpd2piMrXm|cvtsd2siVrXm",
@@ -144,27 +143,27 @@ local map_opc2 = {
"cmovlVrm","cmovgeVrm","cmovleVrm","cmovgVrm",
--5x
"movmskpsVrXm$||movmskpdVrXm$","sqrtpsXrm|sqrtssXrm|sqrtpdXrm|sqrtsdXrm",
-"rsqrtpsXrm|rsqrtssXrm","rcppsXrm|rcpssXrm",
-"andpsXrm||andpdXrm","andnpsXrm||andnpdXrm",
-"orpsXrm||orpdXrm","xorpsXrm||xorpdXrm",
-"addpsXrm|addssXrm|addpdXrm|addsdXrm","mulpsXrm|mulssXrm|mulpdXrm|mulsdXrm",
-"cvtps2pdXrm|cvtss2sdXrm|cvtpd2psXrm|cvtsd2ssXrm",
+"rsqrtpsXrm|rsqrtssXrvm","rcppsXrm|rcpssXrvm",
+"andpsXrvm||andpdXrvm","andnpsXrvm||andnpdXrvm",
+"orpsXrvm||orpdXrvm","xorpsXrvm||xorpdXrvm",
+"addpsXrvm|addssXrvm|addpdXrvm|addsdXrvm","mulpsXrvm|mulssXrvm|mulpdXrvm|mulsdXrvm",
+"cvtps2pdXrm|cvtss2sdXrvm|cvtpd2psXrm|cvtsd2ssXrvm",
"cvtdq2psXrm|cvttps2dqXrm|cvtps2dqXrm",
-"subpsXrm|subssXrm|subpdXrm|subsdXrm","minpsXrm|minssXrm|minpdXrm|minsdXrm",
-"divpsXrm|divssXrm|divpdXrm|divsdXrm","maxpsXrm|maxssXrm|maxpdXrm|maxsdXrm",
+"subpsXrvm|subssXrvm|subpdXrvm|subsdXrvm","minpsXrvm|minssXrvm|minpdXrvm|minsdXrvm",
+"divpsXrvm|divssXrvm|divpdXrvm|divsdXrvm","maxpsXrvm|maxssXrvm|maxpdXrvm|maxsdXrvm",
--6x
-"punpcklbwPrm","punpcklwdPrm","punpckldqPrm","packsswbPrm",
-"pcmpgtbPrm","pcmpgtwPrm","pcmpgtdPrm","packuswbPrm",
-"punpckhbwPrm","punpckhwdPrm","punpckhdqPrm","packssdwPrm",
-"||punpcklqdqXrm","||punpckhqdqXrm",
+"punpcklbwPrvm","punpcklwdPrvm","punpckldqPrvm","packsswbPrvm",
+"pcmpgtbPrvm","pcmpgtwPrvm","pcmpgtdPrvm","packuswbPrvm",
+"punpckhbwPrvm","punpckhwdPrvm","punpckhdqPrvm","packssdwPrvm",
+"||punpcklqdqXrvm","||punpckhqdqXrvm",
"movPrVSm","movqMrm|movdquXrm|movdqaXrm",
--7x
-"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pmu",
-"pshiftd!Pmu","pshiftq!Mmu||pshiftdq!Xmu",
-"pcmpeqbPrm","pcmpeqwPrm","pcmpeqdPrm","emms|",
+"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pvmu",
+"pshiftd!Pvmu","pshiftq!Mvmu||pshiftdq!Xvmu",
+"pcmpeqbPrvm","pcmpeqwPrvm","pcmpeqdPrvm","emms*|",
"vmreadUmr||extrqXmuu$|insertqXrmuu$","vmwriteUrm||extrqXrm$|insertqXrm$",
nil,nil,
-"||haddpdXrm|haddpsXrm","||hsubpdXrm|hsubpsXrm",
+"||haddpdXrvm|haddpsXrvm","||hsubpdXrvm|hsubpsXrvm",
"movVSmMr|movqXrm|movVSmXr","movqMmr|movdquXmr|movdqaXmr",
--8x
"joVj","jnoVj","jbVj","jnbVj","jzVj","jnzVj","jbeVj","jaVj",
@@ -182,27 +181,27 @@ nil,nil,
"bsfVrm","bsrVrm|lzcntVrm|bsrWrm","movsxVrBmt","movsxVrWmt",
--Cx
"xaddBmr","xaddVmr",
-"cmppsXrmu|cmpssXrmu|cmppdXrmu|cmpsdXrmu","$movntiVmr|",
-"pinsrwPrWmu","pextrwDrPmu",
-"shufpsXrmu||shufpdXrmu","$cmpxchg!Qmp",
+"cmppsXrvmu|cmpssXrvmu|cmppdXrvmu|cmpsdXrvmu","$movntiVmr|",
+"pinsrwPrvWmu","pextrwDrPmu",
+"shufpsXrvmu||shufpdXrvmu","$cmpxchg!Qmp",
"bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR",
--Dx
-"||addsubpdXrm|addsubpsXrm","psrlwPrm","psrldPrm","psrlqPrm",
-"paddqPrm","pmullwPrm",
+"||addsubpdXrvm|addsubpsXrvm","psrlwPrvm","psrldPrvm","psrlqPrvm",
+"paddqPrvm","pmullwPrvm",
"|movq2dqXrMm|movqXmr|movdq2qMrXm$","pmovmskbVrMm||pmovmskbVrXm",
-"psubusbPrm","psubuswPrm","pminubPrm","pandPrm",
-"paddusbPrm","padduswPrm","pmaxubPrm","pandnPrm",
+"psubusbPrvm","psubuswPrvm","pminubPrvm","pandPrvm",
+"paddusbPrvm","padduswPrvm","pmaxubPrvm","pandnPrvm",
--Ex
-"pavgbPrm","psrawPrm","psradPrm","pavgwPrm",
-"pmulhuwPrm","pmulhwPrm",
+"pavgbPrvm","psrawPrvm","psradPrvm","pavgwPrvm",
+"pmulhuwPrvm","pmulhwPrvm",
"|cvtdq2pdXrm|cvttpd2dqXrm|cvtpd2dqXrm","$movntqMmr||$movntdqXmr",
-"psubsbPrm","psubswPrm","pminswPrm","porPrm",
-"paddsbPrm","paddswPrm","pmaxswPrm","pxorPrm",
+"psubsbPrvm","psubswPrvm","pminswPrvm","porPrvm",
+"paddsbPrvm","paddswPrvm","pmaxswPrvm","pxorPrvm",
--Fx
-"|||lddquXrm","psllwPrm","pslldPrm","psllqPrm",
-"pmuludqPrm","pmaddwdPrm","psadbwPrm","maskmovqMrm||maskmovdquXrm$",
-"psubbPrm","psubwPrm","psubdPrm","psubqPrm",
-"paddbPrm","paddwPrm","padddPrm","ud",
+"|||lddquXrm","psllwPrvm","pslldPrvm","psllqPrvm",
+"pmuludqPrvm","pmaddwdPrvm","psadbwPrvm","maskmovqMrm||maskmovdquXrm$",
+"psubbPrvm","psubwPrvm","psubdPrvm","psubqPrvm",
+"paddbPrvm","paddwPrvm","padddPrvm","ud",
}
assert(map_opc2[255] == "ud")
@@ -210,49 +209,70 @@ assert(map_opc2[255] == "ud")
local map_opc3 = {
["38"] = { -- [66] 0f 38 xx
--0x
-[0]="pshufbPrm","phaddwPrm","phadddPrm","phaddswPrm",
-"pmaddubswPrm","phsubwPrm","phsubdPrm","phsubswPrm",
-"psignbPrm","psignwPrm","psigndPrm","pmulhrswPrm",
-nil,nil,nil,nil,
+[0]="pshufbPrvm","phaddwPrvm","phadddPrvm","phaddswPrvm",
+"pmaddubswPrvm","phsubwPrvm","phsubdPrvm","phsubswPrvm",
+"psignbPrvm","psignwPrvm","psigndPrvm","pmulhrswPrvm",
+"||permilpsXrvm","||permilpdXrvm",nil,nil,
--1x
"||pblendvbXrma",nil,nil,nil,
-"||blendvpsXrma","||blendvpdXrma",nil,"||ptestXrm",
-nil,nil,nil,nil,
+"||blendvpsXrma","||blendvpdXrma","||permpsXrvm","||ptestXrm",
+"||broadcastssXrm","||broadcastsdXrm","||broadcastf128XrlXm",nil,
"pabsbPrm","pabswPrm","pabsdPrm",nil,
--2x
"||pmovsxbwXrm","||pmovsxbdXrm","||pmovsxbqXrm","||pmovsxwdXrm",
"||pmovsxwqXrm","||pmovsxdqXrm",nil,nil,
-"||pmuldqXrm","||pcmpeqqXrm","||$movntdqaXrm","||packusdwXrm",
-nil,nil,nil,nil,
+"||pmuldqXrvm","||pcmpeqqXrvm","||$movntdqaXrm","||packusdwXrvm",
+"||maskmovpsXrvm","||maskmovpdXrvm","||maskmovpsXmvr","||maskmovpdXmvr",
--3x
"||pmovzxbwXrm","||pmovzxbdXrm","||pmovzxbqXrm","||pmovzxwdXrm",
-"||pmovzxwqXrm","||pmovzxdqXrm",nil,"||pcmpgtqXrm",
-"||pminsbXrm","||pminsdXrm","||pminuwXrm","||pminudXrm",
-"||pmaxsbXrm","||pmaxsdXrm","||pmaxuwXrm","||pmaxudXrm",
+"||pmovzxwqXrm","||pmovzxdqXrm","||permdXrvm","||pcmpgtqXrvm",
+"||pminsbXrvm","||pminsdXrvm","||pminuwXrvm","||pminudXrvm",
+"||pmaxsbXrvm","||pmaxsdXrvm","||pmaxuwXrvm","||pmaxudXrvm",
--4x
-"||pmulddXrm","||phminposuwXrm",
+"||pmulldXrvm","||phminposuwXrm",nil,nil,
+nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm",
+--5x
+[0x58] = "||pbroadcastdXrlXm",[0x59] = "||pbroadcastqXrlXm",
+[0x5a] = "||broadcasti128XrlXm",
+--7x
+[0x78] = "||pbroadcastbXrlXm",[0x79] = "||pbroadcastwXrlXm",
+--8x
+[0x8c] = "||pmaskmovXrvVSm",
+[0x8e] = "||pmaskmovVSmXvr",
+--Dx
+[0xdc] = "||aesencXrvm", [0xdd] = "||aesenclastXrvm",
+[0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm",
--Fx
[0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt",
},
["3a"] = { -- [66] 0f 3a xx
--0x
-[0x00]=nil,nil,nil,nil,nil,nil,nil,nil,
-"||roundpsXrmu","||roundpdXrmu","||roundssXrmu","||roundsdXrmu",
-"||blendpsXrmu","||blendpdXrmu","||pblendwXrmu","palignrPrmu",
+[0x00]="||permqXrmu","||permpdXrmu","||pblenddXrvmu",nil,
+"||permilpsXrmu","||permilpdXrmu","||perm2f128Xrvmu",nil,
+"||roundpsXrmu","||roundpdXrmu","||roundssXrvmu","||roundsdXrvmu",
+"||blendpsXrvmu","||blendpdXrvmu","||pblendwXrvmu","palignrPrvmu",
--1x
nil,nil,nil,nil,
"||pextrbVmXru","||pextrwVmXru","||pextrVmSXru","||extractpsVmXru",
-nil,nil,nil,nil,nil,nil,nil,nil,
+"||insertf128XrvlXmu","||extractf128XlXmYru",nil,nil,
+nil,nil,nil,nil,
--2x
-"||pinsrbXrVmu","||insertpsXrmu","||pinsrXrVmuS",nil,
+"||pinsrbXrvVmu","||insertpsXrvmu","||pinsrXrvVmuS",nil,
+--3x
+[0x38] = "||inserti128Xrvmu",[0x39] = "||extracti128XlXmYru",
--4x
-[0x40] = "||dppsXrmu",
-[0x41] = "||dppdXrmu",
-[0x42] = "||mpsadbwXrmu",
+[0x40] = "||dppsXrvmu",
+[0x41] = "||dppdXrvmu",
+[0x42] = "||mpsadbwXrvmu",
+[0x44] = "||pclmulqdqXrvmu",
+[0x46] = "||perm2i128Xrvmu",
+[0x4a] = "||blendvpsXrvmb",[0x4b] = "||blendvpdXrvmb",
+[0x4c] = "||pblendvbXrvmb",
--6x
[0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu",
[0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu",
+[0xdf] = "||aeskeygenassistXrmu",
},
}
@@ -356,17 +376,19 @@ local map_regs = {
"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }, -- No x64 ext!
X = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" },
+ Y = { "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
+ "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" },
}
local map_segregs = { "es", "cs", "ss", "ds", "fs", "gs", "segr6", "segr7" }
-- Maps for size names.
local map_sz2n = {
- B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16,
+ B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16, Y = 32,
}
local map_sz2prefix = {
B = "byte", W = "word", D = "dword",
Q = "qword",
- M = "qword", X = "xword",
+ M = "qword", X = "xword", Y = "yword",
F = "dword", G = "qword", -- No need for sizes/register names for these two.
}
@@ -389,10 +411,13 @@ local function putop(ctx, text, operands)
if ctx.rep then text = ctx.rep.." "..text; ctx.rep = false end
if ctx.rex then
local t = (ctx.rexw and "w" or "")..(ctx.rexr and "r" or "")..
- (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")
- if t ~= "" then text = "rex."..t.." "..text end
+ (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "")..
+ (ctx.vexl and "l" or "")
+ if ctx.vexv and ctx.vexv ~= 0 then t = t.."v"..ctx.vexv end
+ if t ~= "" then text = ctx.rex.."."..t.." "..text
+ elseif ctx.rex == "vex" then text = "v"..text end
ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
- ctx.rex = false
+ ctx.rex = false; ctx.vexl = false; ctx.vexv = false
end
if ctx.seg then
local text2, n = gsub(text, "%[", "["..ctx.seg..":")
@@ -407,6 +432,7 @@ local function putop(ctx, text, operands)
end
ctx.out(format("%08x %s%s\n", ctx.addr+ctx.start, hex, text))
ctx.mrm = false
+ ctx.vexv = false
ctx.start = pos
ctx.imm = nil
end
@@ -415,7 +441,7 @@ end
local function clearprefixes(ctx)
ctx.o16 = false; ctx.seg = false; ctx.lock = false; ctx.rep = false
ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false
- ctx.rex = false; ctx.a32 = false
+ ctx.rex = false; ctx.a32 = false; ctx.vexl = false
end
-- Fallback for incomplete opcodes at the end.
@@ -452,9 +478,9 @@ end
-- Process pattern string and generate the operands.
local function putpat(ctx, name, pat)
local operands, regs, sz, mode, sp, rm, sc, rx, sdisp
- local code, pos, stop = ctx.code, ctx.pos, ctx.stop
+ local code, pos, stop, vexl = ctx.code, ctx.pos, ctx.stop, ctx.vexl
- -- Chars used: 1DFGIMPQRSTUVWXacdfgijmoprstuwxyz
+ -- Chars used: 1BDFGIMPQRSTUVWXYabcdfgijlmoprstuvwxyz
for p in gmatch(pat, ".") do
local x = nil
if p == "V" or p == "U" then
@@ -469,11 +495,13 @@ local function putpat(ctx, name, pat)
elseif p == "B" then
sz = "B"
regs = ctx.rex and map_regs.B64 or map_regs.B
- elseif match(p, "[WDQMXFG]") then
+ elseif match(p, "[WDQMXYFG]") then
sz = p
+ if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end
regs = map_regs[sz]
elseif p == "P" then
sz = ctx.o16 and "X" or "M"; ctx.o16 = false
+ if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end
regs = map_regs[sz]
elseif p == "S" then
name = name..lower(sz)
@@ -486,6 +514,10 @@ local function putpat(ctx, name, pat)
local imm = getimm(ctx, pos, 1); if not imm then return end
x = format("0x%02x", imm)
pos = pos+1
+ elseif p == "b" then -- Register operand encoded in an immediate byte.
+ local imm = getimm(ctx, pos, 1); if not imm then return end
+ x = regs[(imm-imm%16)/16+1] -- High nibble selects the register; drop the low nibble so the index stays integral.
+ pos = pos+1
elseif p == "w" then
local imm = getimm(ctx, pos, 2); if not imm then return end
x = format("0x%x", imm)
@@ -618,8 +650,13 @@ local function putpat(ctx, name, pat)
else
x = "CR"..sp
end
+ elseif p == "v" then
+ if ctx.vexv then
+ x = regs[ctx.vexv+1]; ctx.vexv = false
+ end
elseif p == "y" then x = "DR"..sp
elseif p == "z" then x = "TR"..sp
+ elseif p == "l" then vexl = false
elseif p == "t" then
else
error("bad pattern `"..pat.."'")
@@ -694,7 +731,7 @@ map_act = {
B = putpat, W = putpat, D = putpat, Q = putpat,
V = putpat, U = putpat, T = putpat,
M = putpat, X = putpat, P = putpat,
- F = putpat, G = putpat,
+ F = putpat, G = putpat, Y = putpat,
-- Collect prefixes.
[":"] = function(ctx, name, pat)
@@ -755,15 +792,68 @@ map_act = {
-- REX prefix.
rex = function(ctx, name, pat)
- if ctx.rex then return unknown(ctx) end -- Only 1 REX prefix allowed.
+ if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed.
for p in gmatch(pat, ".") do ctx["rex"..p] = true end
- ctx.rex = true
+ ctx.rex = "rex"
+ end,
+
+ -- VEX prefix. Reads two payload bytes when pat is "3", otherwise one.
+ vex = function(ctx, name, pat)
+ if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed.
+ ctx.rex = "vex"
+ local pos = ctx.pos
+ if ctx.mrm then -- NOTE(review): presumably un-reads an already fetched ModRM byte; confirm.
+ ctx.mrm = nil
+ pos = pos-1
+ end
+ local b = byte(ctx.code, pos, pos)
+ if not b then return incomplete(ctx) end
+ pos = pos+1
+ if b < 128 then ctx.rexr = true end -- Bit 7 clear sets the R extension.
+ local m = 1 -- Opcode map selector; stays 1 unless pat is "3".
+ if pat == "3" then
+ m = b%32; b = (b-m)/32 -- Low 5 bits: opcode map selector.
+ local nb = b%2; b = (b-nb)/2
+ if nb == 0 then ctx.rexb = true end -- Bit 5 clear sets the B extension.
+ local nx = b%2; b = (b-nx)/2
+ if nx == 0 then ctx.rexx = true end -- Bit 6 clear sets the X extension.
+ b = byte(ctx.code, pos, pos)
+ if not b then return incomplete(ctx) end
+ pos = pos+1
+ if b >= 128 then ctx.rexw = true end -- Bit 7 of the second byte sets W.
+ end
+ ctx.pos = pos
+ local map
+ if m == 1 then map = map_opc2
+ elseif m == 2 then map = map_opc3["38"]
+ elseif m == 3 then map = map_opc3["3a"]
+ else return unknown(ctx) end
+ local p = b%4; b = (b-p)/4 -- Low 2 bits: implied prefix (1 = o16, 2 = rep, 3 = repne).
+ if p == 1 then ctx.o16 = "o16"
+ elseif p == 2 then ctx.rep = "rep"
+ elseif p == 3 then ctx.rep = "repne" end
+ local l = b%2; b = (b-l)/2 -- Next bit: sets ctx.vexl when non-zero.
+ if l ~= 0 then ctx.vexl = true end
+ ctx.vexv = (-1-b)%16 -- Complement of the next 4 bits, giving 0..15.
+ return dispatchmap(ctx, map)
end,
-- Special case for nop with REX prefix.
nop = function(ctx, name, pat)
return dispatch(ctx, ctx.rex and pat or "nop")
end,
+
+ -- Special case for 0F 77: "emms" without VEX, else "zeroall" (vexl set) or "zeroupper".
+ emms = function(ctx, name, pat)
+ if ctx.rex ~= "vex" then
+ return putop(ctx, "emms")
+ elseif ctx.vexl then
+ ctx.vexl = false -- Clear first so putop does not emit the "l" flag.
+ return putop(ctx, "zeroall")
+ else
+ return putop(ctx, "zeroupper")
+ end
+ end,
}
------------------------------------------------------------------------------
diff --git a/lib/luajit/src/jit/dump.lua b/lib/luajit/src/jit/dump.lua
index c52d0f217e..b1cdcfe294 100644
--- a/lib/luajit/src/jit/dump.lua
+++ b/lib/luajit/src/jit/dump.lua
@@ -75,9 +75,6 @@ local bcline, disass
-- Active flag, output file handle and dump mode.
local active, out, dumpmode
--- Information about traces that is remembered for future reference.
-local info = {}
-
------------------------------------------------------------------------------
local symtabmt = { __index = false }
@@ -553,7 +550,6 @@ local function dump_trace(what, tr, func, pc, otr, oex)
if dumpmode.m then dump_mcode(tr) end
end
if what == "start" then
- info[tr] = { func = func, pc = pc, otr = otr, oex = oex }
if dumpmode.H then out:write('\n') end
out:write("---- TRACE ", tr, " ", what)
if otr then out:write(" ", otr, "/", oex) end
@@ -575,6 +571,7 @@ local function dump_trace(what, tr, func, pc, otr, oex)
end
if dumpmode.H then out:write("
\n\n") else out:write("\n") end
else
+ if what == "flush" then symtab, nexitsym = {}, 0 end
out:write("---- TRACE ", what, "\n\n")
end
out:flush()
@@ -705,7 +702,6 @@ end
return {
on = dumpon,
off = dumpoff,
- start = dumpon, -- For -j command line option.
- info = info
+ start = dumpon -- For -j command line option.
}
diff --git a/lib/luajit/src/jit/p.lua b/lib/luajit/src/jit/p.lua
index d894bb7d2c..97d4ccdf87 100644
--- a/lib/luajit/src/jit/p.lua
+++ b/lib/luajit/src/jit/p.lua
@@ -36,7 +36,6 @@
-- G Produce raw output suitable for graphical tools (e.g. flame graphs).
-- m Minimum sample percentage to be shown. Default: 3.
-- i Sampling interval in milliseconds. Default: 10.
--- S[] Events source if performace events are enabled
--
----------------------------------------------------------------------------
@@ -45,8 +44,6 @@ local jit = require("jit")
assert(jit.version_num == 20100, "LuaJIT core/library version mismatch")
local profile = require("jit.profile")
local vmdef = require("jit.vmdef")
-local jutil = require("jit.util")
-local dump = require("jit.dump")
local math = math
local pairs, ipairs, tonumber, floor = pairs, ipairs, tonumber, math.floor
local sort, format = table.sort, string.format
@@ -77,38 +74,7 @@ local function prof_cb(th, samples, vmmode)
-- Collect keys for sample.
if prof_states then
if prof_states == "v" then
- if map_vmmode[vmmode] then
- key_state = map_vmmode[vmmode]
- else
- -- Sampling a trace: make an understandable one-line description.
- local tr = tonumber(vmmode)
- local info = jutil.traceinfo(tr)
- local extra = dump.info[tr]
- -- Show the parent of this trace (if this is a side trace)
- local parent = ""
- if extra and extra.otr and extra.oex then
- parent = "("..extra.otr.."/"..extra.oex..")"
- end
- -- Show what the end of the trace links to (e.g. loop or other trace)
- local lnk = ""
- local link, ltype = info.link, info.linktype
- if link == tr or link == 0 then lnk = "->"..ltype
- elseif ltype == "root" then lnk = "->"..link
- else lnk = "->"..link.." "..ltype end
- -- Show the current zone (if zone profiling is enabled)
- local z = ""
- if zone and zone:get() then
- z = (" %-16s"):format(zone:get())
- end
- -- Show the source location where the trace starts
- local loc = ""
- if extra and extra.func then
- local fi = jutil.funcinfo(extra.func, extra.pc)
- if fi.loc then loc = fi.loc end
- end
- local s = ("TRACE %3d %-8s %-10s%s %s"):format(vmmode, parent, lnk, z, loc)
- key_state = map_vmmode[vmmode] or s
- end
+ key_state = map_vmmode[vmmode] or vmmode
else
key_state = zone:get() or "(none)"
end
@@ -277,18 +243,15 @@ end
-- Start profiling.
local function prof_start(mode)
local interval = ""
- mode = mode:gsub("i%d+", function(s) interval = s; return "" end)
+ mode = mode:gsub("i%d*", function(s) interval = s; return "" end)
prof_min = 3
mode = mode:gsub("m(%d+)", function(s) prof_min = tonumber(s); return "" end)
prof_depth = 1
mode = mode:gsub("%-?%d+", function(s) prof_depth = tonumber(s); return "" end)
- local flavour = "S[vanilla]"
- mode = mode:gsub("S%[.+%]", function(s) flavour = s; return "" end)
-
local m = {}
for c in mode:gmatch(".") do m[c] = c end
- prof_states = m.v or m.z
- if m.z == "z" then zone = require("jit.zone") end
+ prof_states = m.z or m.v
+ if prof_states == "z" then zone = require("jit.zone") end
local scope = m.l or m.f or m.F or (prof_states and "" or "f")
local flags = (m.p or "")
prof_raw = m.r
@@ -322,7 +285,7 @@ local function prof_start(mode)
prof_count1 = {}
prof_count2 = {}
prof_samples = 0
- profile.start(scope:lower()..interval..flavour, prof_cb)
+ profile.start(scope:lower()..interval, prof_cb)
prof_ud = newproxy(true)
getmetatable(prof_ud).__gc = prof_finish
end
diff --git a/lib/luajit/src/lib_base.c b/lib/luajit/src/lib_base.c
index 887fea7a58..ca268b1d07 100644
--- a/lib/luajit/src/lib_base.c
+++ b/lib/luajit/src/lib_base.c
@@ -435,13 +435,13 @@ LJLIB_CF(gcinfo)
LJLIB_CF(collectgarbage)
{
int opt = lj_lib_checkopt(L, 1, LUA_GCCOLLECT, /* ORDER LUA_GC* */
- "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul");
+ "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul\1\377\11isrunning");
int32_t data = lj_lib_optint(L, 2, 0);
if (opt == LUA_GCCOUNT) {
setnumV(L->top, (lua_Number)G(L)->gc.total/1024.0);
} else {
int res = lua_gc(L, opt, data);
- if (opt == LUA_GCSTEP)
+ if (opt == LUA_GCSTEP || opt == LUA_GCISRUNNING)
setboolV(L->top, res);
else
setintV(L->top, res);
diff --git a/lib/luajit/src/lib_ffi.c b/lib/luajit/src/lib_ffi.c
index b2b2d37ff7..7be624b42d 100644
--- a/lib/luajit/src/lib_ffi.c
+++ b/lib/luajit/src/lib_ffi.c
@@ -505,10 +505,7 @@ LJLIB_CF(ffi_new) LJLIB_REC(.)
}
if (sz == CTSIZE_INVALID)
lj_err_arg(L, 1, LJ_ERR_FFI_INVSIZE);
- if (!(info & CTF_VLA) && ctype_align(info) <= CT_MEMALIGN)
- cd = lj_cdata_new(cts, id, sz);
- else
- cd = lj_cdata_newv(L, id, sz, ctype_align(info));
+ cd = lj_cdata_newx(cts, id, sz, info);
setcdataV(L, o-1, cd); /* Anchor the uninitialized cdata. */
lj_cconv_ct_init(cts, ct, sz, cdataptr(cd),
o, (MSize)(L->top - o)); /* Initialize cdata. */
diff --git a/lib/luajit/src/lib_jit.c b/lib/luajit/src/lib_jit.c
index 2227d198c5..178ef249df 100644
--- a/lib/luajit/src/lib_jit.c
+++ b/lib/luajit/src/lib_jit.c
@@ -299,9 +299,6 @@ LJLIB_CF(jit_util_traceinfo)
setintfield(L, t, "nk", REF_BIAS - (int32_t)T->nk);
setintfield(L, t, "link", T->link);
setintfield(L, t, "nexit", T->nsnap);
- setintfield(L, t, "szmcode", T->szmcode);
- setintfield(L, t, "mcode", (int32_t)(intptr_t)T->mcode);
- setintfield(L, t, "mcloop", T->mcloop);
setstrV(L, L->top++, lj_str_newz(L, jit_trlinkname[T->linktype]));
lua_setfield(L, -2, "linktype");
/* There are many more fields. Add them only when needed. */
@@ -558,10 +555,7 @@ static void jit_profile_callback(lua_State *L2, lua_State *L, int samples,
setfuncV(L2, L2->top++, funcV(tv));
setthreadV(L2, L2->top++, L);
setintV(L2->top++, samples);
- if (vmstate >= 256)
- setintV(L2->top++, vmstate-256);
- else
- setstrV(L2, L2->top++, lj_str_new(L2, &vmst, 1));
+ setstrV(L2, L2->top++, lj_str_new(L2, &vmst, 1));
status = lua_pcall(L2, 3, 0, 0); /* callback(thread, samples, vmstate) */
if (status) {
if (G(L2)->panic) G(L2)->panic(L2);
diff --git a/lib/luajit/src/lib_os.c b/lib/luajit/src/lib_os.c
index 7b5873a518..37d7d5be61 100644
--- a/lib/luajit/src/lib_os.c
+++ b/lib/luajit/src/lib_os.c
@@ -39,7 +39,7 @@
LJLIB_CF(os_execute)
{
-#if LJ_TARGET_CONSOLE
+#if LJ_NO_SYSTEM
#if LJ_52
errno = ENOSYS;
return luaL_fileresult(L, 0, NULL);
diff --git a/lib/luajit/src/lj.supp b/lib/luajit/src/lj.supp
index 411f261700..acb9e789d0 100644
--- a/lib/luajit/src/lj.supp
+++ b/lib/luajit/src/lj.supp
@@ -24,3 +24,18 @@
Memcheck:Cond
fun:lj_str_new
}
+{
+ Optimized string compare
+ Memcheck:Addr4
+ fun:lj_str_fastcmp
+}
+{
+ Optimized string compare
+ Memcheck:Addr1
+ fun:lj_str_fastcmp
+}
+{
+ Optimized string compare
+ Memcheck:Cond
+ fun:lj_str_fastcmp
+}
diff --git a/lib/luajit/src/lj_alloc.c b/lib/luajit/src/lj_alloc.c
index 0aad826d36..ddd50cae4f 100644
--- a/lib/luajit/src/lj_alloc.c
+++ b/lib/luajit/src/lj_alloc.c
@@ -196,7 +196,7 @@ static LJ_AINLINE void *CALL_MMAP(size_t size)
return ptr;
}
-#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__)
+#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__) || defined(__CYGWIN__)
/* OSX and FreeBSD mmap() use a naive first-fit linear search.
** That's perfect for us. Except that -pagezero_size must be set for OSX,
diff --git a/lib/luajit/src/lj_api.c b/lib/luajit/src/lj_api.c
index 1f09284f99..042b0d9c8d 100644
--- a/lib/luajit/src/lj_api.c
+++ b/lib/luajit/src/lj_api.c
@@ -1188,6 +1188,9 @@ LUA_API int lua_gc(lua_State *L, int what, int data)
res = (int)(g->gc.stepmul);
g->gc.stepmul = (MSize)data;
break;
+ case LUA_GCISRUNNING:
+ res = (g->gc.threshold != LJ_MAX_MEM);
+ break;
default:
res = -1; /* Invalid option. */
}
diff --git a/lib/luajit/src/lj_arch.h b/lib/luajit/src/lj_arch.h
index f1e7d7f45c..a114bdda53 100644
--- a/lib/luajit/src/lj_arch.h
+++ b/lib/luajit/src/lj_arch.h
@@ -155,7 +155,11 @@
#define LJ_ARCH_NAME "x64"
#define LJ_ARCH_BITS 64
#define LJ_ARCH_ENDIAN LUAJIT_LE
-#define LJ_ABI_WIN LJ_TARGET_WINDOWS
+#if LJ_TARGET_WINDOWS || __CYGWIN__
+#define LJ_ABI_WIN 1
+#else
+#define LJ_ABI_WIN 0
+#endif
#define LJ_TARGET_X64 1
#define LJ_TARGET_X86ORX64 1
#define LJ_TARGET_EHRETREG 0
@@ -300,6 +304,13 @@
#define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */
#define LJ_ARCH_NUMMODE LJ_NUMMODE_SINGLE
+#if !defined(LJ_ARCH_HASFPU) && defined(__mips_soft_float)
+#define LJ_ARCH_HASFPU 0
+#endif
+#if !defined(LJ_ABI_SOFTFP) && defined(__mips_soft_float)
+#define LJ_ABI_SOFTFP 1
+#endif
+
#if _MIPS_ARCH_MIPS32R2
#define LJ_ARCH_VERSION 20
#else
@@ -382,9 +393,6 @@
#error "No support for PPC/e500 anymore (use LuaJIT 2.0)"
#endif
#elif LJ_TARGET_MIPS
-#if defined(__mips_soft_float)
-#error "No support for MIPS CPUs without FPU"
-#endif
#if defined(_LP64)
#error "No support for MIPS64"
#endif
@@ -494,6 +502,9 @@
#if defined(__symbian__) || LJ_TARGET_WINDOWS
#define LUAJIT_NO_EXP2
#endif
+#if LJ_TARGET_CONSOLE || (LJ_TARGET_IOS && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0)
+#define LJ_NO_SYSTEM 1
+#endif
#if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4
#define LJ_NO_UNWIND 1
diff --git a/lib/luajit/src/lj_ccall.c b/lib/luajit/src/lj_ccall.c
index 5ab5b60daa..2dda540510 100644
--- a/lib/luajit/src/lj_ccall.c
+++ b/lib/luajit/src/lj_ccall.c
@@ -418,6 +418,18 @@
/* Complex values are returned in 1 or 2 FPRs. */ \
cc->retref = 0;
+#if LJ_ABI_SOFTFP
+#define CCALL_HANDLE_COMPLEXRET2 \
+ if (ctr->size == 2*sizeof(float)) { /* Copy complex float from GPRs. */ \
+ ((intptr_t *)dp)[0] = cc->gpr[0]; \
+ ((intptr_t *)dp)[1] = cc->gpr[1]; \
+ } else { /* Copy complex double from GPRs. */ \
+ ((intptr_t *)dp)[0] = cc->gpr[0]; \
+ ((intptr_t *)dp)[1] = cc->gpr[1]; \
+ ((intptr_t *)dp)[2] = cc->gpr[2]; \
+ ((intptr_t *)dp)[3] = cc->gpr[3]; \
+ }
+#else
#define CCALL_HANDLE_COMPLEXRET2 \
if (ctr->size == 2*sizeof(float)) { /* Copy complex float from FPRs. */ \
((float *)dp)[0] = cc->fpr[0].f; \
@@ -426,6 +438,7 @@
((double *)dp)[0] = cc->fpr[0].d; \
((double *)dp)[1] = cc->fpr[1].d; \
}
+#endif
#define CCALL_HANDLE_STRUCTARG \
/* Pass all structs by value in registers and/or on the stack. */
@@ -433,6 +446,22 @@
#define CCALL_HANDLE_COMPLEXARG \
/* Pass complex by value in 2 or 4 GPRs. */
+#define CCALL_HANDLE_GPR \
+ if ((d->info & CTF_ALIGN) > CTALIGN_PTR) \
+ ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \
+ if (ngpr < maxgpr) { \
+ dp = &cc->gpr[ngpr]; \
+ if (ngpr + n > maxgpr) { \
+ nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \
+ if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \
+ ngpr = maxgpr; \
+ } else { \
+ ngpr += n; \
+ } \
+ goto done; \
+ }
+
+#if !LJ_ABI_SOFTFP /* MIPS32 hard-float */
#define CCALL_HANDLE_REGARG \
if (isfp && nfpr < CCALL_NARG_FPR && !(ct->info & CTF_VARARG)) { \
/* Try to pass argument in FPRs. */ \
@@ -441,24 +470,18 @@
goto done; \
} else { /* Try to pass argument in GPRs. */ \
nfpr = CCALL_NARG_FPR; \
- if ((d->info & CTF_ALIGN) > CTALIGN_PTR) \
- ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \
- if (ngpr < maxgpr) { \
- dp = &cc->gpr[ngpr]; \
- if (ngpr + n > maxgpr) { \
- nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \
- if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \
- ngpr = maxgpr; \
- } else { \
- ngpr += n; \
- } \
- goto done; \
- } \
+ CCALL_HANDLE_GPR \
}
+#else /* MIPS32 soft-float */
+#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR
+#endif
+#if !LJ_ABI_SOFTFP
+/* On MIPS64 soft-float, position of float return values is endian-dependent. */
#define CCALL_HANDLE_RET \
if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
sp = (uint8_t *)&cc->fpr[0].f;
+#endif
#else
#error "Missing calling convention definitions for this architecture"
diff --git a/lib/luajit/src/lj_ccall.h b/lib/luajit/src/lj_ccall.h
index 91983feebd..8b0e796bfc 100644
--- a/lib/luajit/src/lj_ccall.h
+++ b/lib/luajit/src/lj_ccall.h
@@ -98,9 +98,9 @@ typedef double FPRArg;
#elif LJ_TARGET_MIPS
#define CCALL_NARG_GPR 4
-#define CCALL_NARG_FPR 2
+#define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 0 : 2)
#define CCALL_NRET_GPR 2
-#define CCALL_NRET_FPR 2
+#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 2)
#define CCALL_SPS_EXTRA 7
#define CCALL_SPS_FREE 1
diff --git a/lib/luajit/src/lj_ccallback.c b/lib/luajit/src/lj_ccallback.c
index 065c329fa7..539c9e3da4 100644
--- a/lib/luajit/src/lj_ccallback.c
+++ b/lib/luajit/src/lj_ccallback.c
@@ -427,6 +427,15 @@ void lj_ccallback_mcode_free(CTState *cts)
#elif LJ_TARGET_MIPS
+#define CALLBACK_HANDLE_GPR \
+ if (n > 1) ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \
+ if (ngpr + n <= maxgpr) { \
+ sp = &cts->cb.gpr[ngpr]; \
+ ngpr += n; \
+ goto done; \
+ }
+
+#if !LJ_ABI_SOFTFP /* MIPS32 hard-float */
#define CALLBACK_HANDLE_REGARG \
if (isfp && nfpr < CCALL_NARG_FPR) { /* Try to pass argument in FPRs. */ \
sp = (void *)((uint8_t *)&cts->cb.fpr[nfpr] + ((LJ_BE && n==1) ? 4 : 0)); \
@@ -434,13 +443,13 @@ void lj_ccallback_mcode_free(CTState *cts)
goto done; \
} else { /* Try to pass argument in GPRs. */ \
nfpr = CCALL_NARG_FPR; \
- if (n > 1) ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \
- if (ngpr + n <= maxgpr) { \
- sp = &cts->cb.gpr[ngpr]; \
- ngpr += n; \
- goto done; \
- } \
+ CALLBACK_HANDLE_GPR \
}
+#else /* MIPS32 soft-float */
+#define CALLBACK_HANDLE_REGARG \
+ CALLBACK_HANDLE_GPR \
+ UNUSED(isfp);
+#endif
#define CALLBACK_HANDLE_RET \
if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \
diff --git a/lib/luajit/src/lj_cdata.c b/lib/luajit/src/lj_cdata.c
index 5cd2c1140e..30d788e4c9 100644
--- a/lib/luajit/src/lj_cdata.c
+++ b/lib/luajit/src/lj_cdata.c
@@ -49,6 +49,15 @@ GCcdata *lj_cdata_newv(lua_State *L, CTypeID id, CTSize sz, CTSize align)
return cd;
}
+/* Allocate arbitrary C data object, picking the allocator from the type info. */
+GCcdata *lj_cdata_newx(CTState *cts, CTypeID id, CTSize sz, CTInfo info)
+{
+ if (!(info & CTF_VLA) && ctype_align(info) <= CT_MEMALIGN)
+ return lj_cdata_new(cts, id, sz); /* Not a VLA and alignment <= CT_MEMALIGN. */
+ else
+ return lj_cdata_newv(cts->L, id, sz, ctype_align(info)); /* VLA or larger alignment. */
+}
+
/* Free a C data object. */
void LJ_FASTCALL lj_cdata_free(global_State *g, GCcdata *cd)
{
diff --git a/lib/luajit/src/lj_cdata.h b/lib/luajit/src/lj_cdata.h
index c8975be1c9..0891c33c80 100644
--- a/lib/luajit/src/lj_cdata.h
+++ b/lib/luajit/src/lj_cdata.h
@@ -60,6 +60,8 @@ static LJ_AINLINE GCcdata *lj_cdata_new_(lua_State *L, CTypeID id, CTSize sz)
LJ_FUNC GCcdata *lj_cdata_newref(CTState *cts, const void *pp, CTypeID id);
LJ_FUNC GCcdata *lj_cdata_newv(lua_State *L, CTypeID id, CTSize sz,
CTSize align);
+LJ_FUNC GCcdata *lj_cdata_newx(CTState *cts, CTypeID id, CTSize sz,
+ CTInfo info);
LJ_FUNC void LJ_FASTCALL lj_cdata_free(global_State *g, GCcdata *cd);
LJ_FUNC void lj_cdata_setfin(lua_State *L, GCcdata *cd, GCobj *obj,
diff --git a/lib/luajit/src/lj_ctype.c b/lib/luajit/src/lj_ctype.c
index 2e23c994bb..eda070ce1e 100644
--- a/lib/luajit/src/lj_ctype.c
+++ b/lib/luajit/src/lj_ctype.c
@@ -38,6 +38,8 @@
_("uint64_t", UINT64) \
_("intptr_t", INT_PSZ) \
_("uintptr_t", UINT_PSZ) \
+ /* From POSIX. */ \
+ _("ssize_t", INT_PSZ) \
/* End of typedef list. */
/* Keywords (only the ones we actually care for). */
diff --git a/lib/luajit/src/lj_dispatch.h b/lib/luajit/src/lj_dispatch.h
index 1e247e3828..73d00ec00c 100644
--- a/lib/luajit/src/lj_dispatch.h
+++ b/lib/luajit/src/lj_dispatch.h
@@ -14,6 +14,21 @@
#if LJ_TARGET_MIPS
/* Need our own global offset table for the dreaded MIPS calling conventions. */
+#if LJ_SOFTFP
+extern double __adddf3(double a, double b);
+extern double __subdf3(double a, double b);
+extern double __muldf3(double a, double b);
+extern double __divdf3(double a, double b);
+extern void __ledf2(double a, double b);
+extern double __floatsidf(int32_t a);
+extern int32_t __fixdfsi(double a);
+
+#define SFGOTDEF(_) \
+ _(lj_num2bit) _(sqrt) _(__adddf3) _(__subdf3) _(__muldf3) _(__divdf3) _(__ledf2) \
+ _(__floatsidf) _(__fixdfsi)
+#else
+#define SFGOTDEF(_)
+#endif
#if LJ_HASJIT
#define JITGOTDEF(_) _(lj_trace_exit) _(lj_trace_hot)
#else
@@ -39,7 +54,8 @@
_(lj_str_new) _(lj_tab_dup) _(lj_tab_get) _(lj_tab_getinth) _(lj_tab_len) \
_(lj_tab_new) _(lj_tab_newkey) _(lj_tab_next) _(lj_tab_reasize) \
_(lj_tab_setinth) _(lj_buf_putstr_reverse) _(lj_buf_putstr_lower) \
- _(lj_buf_putstr_upper) _(lj_buf_tostr) JITGOTDEF(_) FFIGOTDEF(_)
+ _(lj_buf_putstr_upper) _(lj_buf_tostr) \
+ JITGOTDEF(_) FFIGOTDEF(_) SFGOTDEF(_)
enum {
#define GOTENUM(name) LJ_GOT_##name,
diff --git a/lib/luajit/src/lj_err.c b/lib/luajit/src/lj_err.c
index 2e20c2c0f8..d641735e9f 100644
--- a/lib/luajit/src/lj_err.c
+++ b/lib/luajit/src/lj_err.c
@@ -183,20 +183,13 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode)
/* -- External frame unwinding -------------------------------------------- */
-#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_TARGET_WINDOWS
+#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_ABI_WIN
/*
** We have to use our own definitions instead of the mandatory (!) unwind.h,
** since various OS, distros and compilers mess up the header installation.
*/
-typedef struct _Unwind_Exception
-{
- uint64_t exclass;
- void (*excleanup)(int, struct _Unwind_Exception *);
- uintptr_t p1, p2;
-} __attribute__((__aligned__)) _Unwind_Exception;
-
typedef struct _Unwind_Context _Unwind_Context;
#define _URC_OK 0
@@ -206,8 +199,20 @@ typedef struct _Unwind_Context _Unwind_Context;
#define _URC_CONTINUE_UNWIND 8
#define _URC_FAILURE 9
+#define LJ_UEXCLASS 0x4c55414a49543200ULL /* LUAJIT2\0 */
+#define LJ_UEXCLASS_MAKE(c) (LJ_UEXCLASS | (uint64_t)(c))
+#define LJ_UEXCLASS_CHECK(cl) (((cl) ^ LJ_UEXCLASS) <= 0xff)
+#define LJ_UEXCLASS_ERRCODE(cl) ((int)((cl) & 0xff))
+
#if !LJ_TARGET_ARM
+typedef struct _Unwind_Exception
+{
+ uint64_t exclass;
+ void (*excleanup)(int, struct _Unwind_Exception *);
+ uintptr_t p1, p2;
+} __attribute__((__aligned__)) _Unwind_Exception;
+
extern uintptr_t _Unwind_GetCFA(_Unwind_Context *);
extern void _Unwind_SetGR(_Unwind_Context *, int, uintptr_t);
extern void _Unwind_SetIP(_Unwind_Context *, uintptr_t);
@@ -219,11 +224,6 @@ extern int _Unwind_RaiseException(_Unwind_Exception *);
#define _UA_HANDLER_FRAME 4
#define _UA_FORCE_UNWIND 8
-#define LJ_UEXCLASS 0x4c55414a49543200ULL /* LUAJIT2\0 */
-#define LJ_UEXCLASS_MAKE(c) (LJ_UEXCLASS | (uint64_t)(c))
-#define LJ_UEXCLASS_CHECK(cl) (((cl) ^ LJ_UEXCLASS) <= 0xff)
-#define LJ_UEXCLASS_ERRCODE(cl) ((int)((cl) & 0xff))
-
/* DWARF2 personality handler referenced from interpreter .eh_frame. */
LJ_FUNCA int lj_err_unwind_dwarf(int version, int actions,
uint64_t uexclass, _Unwind_Exception *uex, _Unwind_Context *ctx)
@@ -302,10 +302,23 @@ static void err_raise_ext(int errcode)
}
#endif
-#else
+#else /* LJ_TARGET_ARM */
+
+#define _US_VIRTUAL_UNWIND_FRAME 0
+#define _US_UNWIND_FRAME_STARTING 1
+#define _US_ACTION_MASK 3
+#define _US_FORCE_UNWIND 8
+
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+typedef struct _Unwind_Context _Unwind_Context;
-extern void _Unwind_DeleteException(void *);
-extern int __gnu_unwind_frame (void *, _Unwind_Context *);
+struct _Unwind_Control_Block {
+ uint64_t exclass;
+ uint32_t misc[20];
+};
+
+extern int _Unwind_RaiseException(_Unwind_Control_Block *);
+extern int __gnu_unwind_frame(_Unwind_Control_Block *, _Unwind_Context *);
extern int _Unwind_VRS_Set(_Unwind_Context *, int, uint32_t, int, void *);
extern int _Unwind_VRS_Get(_Unwind_Context *, int, uint32_t, int, void *);
@@ -321,35 +334,58 @@ static inline void _Unwind_SetGR(_Unwind_Context *ctx, int r, uint32_t v)
_Unwind_VRS_Set(ctx, 0, r, 0, &v);
}
-#define _US_VIRTUAL_UNWIND_FRAME 0
-#define _US_UNWIND_FRAME_STARTING 1
-#define _US_ACTION_MASK 3
-#define _US_FORCE_UNWIND 8
+extern void lj_vm_unwind_ext(void);
/* ARM unwinder personality handler referenced from interpreter .ARM.extab. */
-LJ_FUNCA int lj_err_unwind_arm(int state, void *ucb, _Unwind_Context *ctx)
+LJ_FUNCA int lj_err_unwind_arm(int state, _Unwind_Control_Block *ucb,
+ _Unwind_Context *ctx)
{
void *cf = (void *)_Unwind_GetGR(ctx, 13);
lua_State *L = cframe_L(cf);
- if ((state & _US_ACTION_MASK) == _US_VIRTUAL_UNWIND_FRAME) {
- setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
+ int errcode;
+
+ switch ((state & _US_ACTION_MASK)) {
+ case _US_VIRTUAL_UNWIND_FRAME:
+ if ((state & _US_FORCE_UNWIND)) break;
return _URC_HANDLER_FOUND;
- }
- if ((state&(_US_ACTION_MASK|_US_FORCE_UNWIND)) == _US_UNWIND_FRAME_STARTING) {
- _Unwind_DeleteException(ucb);
- _Unwind_SetGR(ctx, 15, (uint32_t)(void *)lj_err_throw);
- _Unwind_SetGR(ctx, 0, (uint32_t)L);
- _Unwind_SetGR(ctx, 1, (uint32_t)LUA_ERRRUN);
+ case _US_UNWIND_FRAME_STARTING:
+ if (LJ_UEXCLASS_CHECK(ucb->exclass)) {
+ errcode = LJ_UEXCLASS_ERRCODE(ucb->exclass);
+ } else {
+ errcode = LUA_ERRRUN;
+ setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
+ }
+ cf = err_unwind(L, cf, errcode);
+ if ((state & _US_FORCE_UNWIND) || cf == NULL) break;
+ _Unwind_SetGR(ctx, 15, (uint32_t)lj_vm_unwind_ext);
+ _Unwind_SetGR(ctx, 0, (uint32_t)ucb);
+ _Unwind_SetGR(ctx, 1, (uint32_t)errcode);
+ _Unwind_SetGR(ctx, 2, cframe_unwind_ff(cf) ?
+ (uint32_t)lj_vm_unwind_ff_eh :
+ (uint32_t)lj_vm_unwind_c_eh);
return _URC_INSTALL_CONTEXT;
+ default:
+ return _URC_FAILURE;
}
if (__gnu_unwind_frame(ucb, ctx) != _URC_OK)
return _URC_FAILURE;
return _URC_CONTINUE_UNWIND;
}
+#if LJ_UNWIND_EXT
+static __thread _Unwind_Control_Block static_uex;
+
+static void err_raise_ext(int errcode) /* Raise an unwinder exception tagged with errcode. */
+{
+ memset(&static_uex, 0, sizeof(static_uex)); /* Reset the per-thread control block. */
+ static_uex.exclass = LJ_UEXCLASS_MAKE(errcode); /* LJ_UEXCLASS with errcode OR-ed in. */
+ _Unwind_RaiseException(&static_uex);
+}
#endif
-#elif LJ_TARGET_X64 && LJ_TARGET_WINDOWS
+#endif /* LJ_TARGET_ARM */
+
+#elif LJ_TARGET_X64 && LJ_ABI_WIN
/*
** Someone in Redmond owes me several days of my life. A lot of this is
@@ -414,7 +450,9 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec,
if (cf2) { /* We catch it, so start unwinding the upper frames. */
if (rec->ExceptionCode == LJ_MSVC_EXCODE ||
rec->ExceptionCode == LJ_GCC_EXCODE) {
+#if LJ_TARGET_WINDOWS
__DestructExceptionObject(rec, 1);
+#endif
setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP));
} else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) {
/* Don't catch access violations etc. */
diff --git a/lib/luajit/src/lj_ffrecord.c b/lib/luajit/src/lj_ffrecord.c
index 6cc05a24f7..281f017856 100644
--- a/lib/luajit/src/lj_ffrecord.c
+++ b/lib/luajit/src/lj_ffrecord.c
@@ -435,11 +435,12 @@ static void LJ_FASTCALL recff_ipairs_aux(jit_State *J, RecordFFData *rd)
static void LJ_FASTCALL recff_xpairs(jit_State *J, RecordFFData *rd)
{
- if (!(LJ_52 && recff_metacall(J, rd, MM_ipairs))) {
- TRef tab = J->base[0];
- if (tref_istab(tab)) {
+ TRef tr = J->base[0];
+ if (!((LJ_52 || (LJ_HASFFI && tref_iscdata(tr))) &&
+ recff_metacall(J, rd, MM_pairs + rd->data))) {
+ if (tref_istab(tr)) {
J->base[0] = lj_ir_kfunc(J, funcV(&J->fn->c.upvalue[0]));
- J->base[1] = tab;
+ J->base[1] = tr;
J->base[2] = rd->data ? lj_ir_kint(J, 0) : TREF_NIL;
rd->nres = 3;
} /* else: Interpreter will throw. */
diff --git a/lib/luajit/src/lj_frame.h b/lib/luajit/src/lj_frame.h
index a86c36be7e..aa3ab20bbf 100644
--- a/lib/luajit/src/lj_frame.h
+++ b/lib/luajit/src/lj_frame.h
@@ -218,6 +218,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */
#define CFRAME_SHIFT_MULTRES 3
#endif
#elif LJ_TARGET_MIPS
+#if LJ_ARCH_HASFPU
#define CFRAME_OFS_ERRF 124
#define CFRAME_OFS_NRES 120
#define CFRAME_OFS_PREV 116
@@ -227,6 +228,16 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */
#define CFRAME_SIZE 112
#define CFRAME_SHIFT_MULTRES 3
#else
+#define CFRAME_OFS_ERRF 100
+#define CFRAME_OFS_NRES 96
+#define CFRAME_OFS_PREV 92
+#define CFRAME_OFS_L 88
+#define CFRAME_OFS_PC 44
+#define CFRAME_OFS_MULTRES 16
+#define CFRAME_SIZE 88
+#define CFRAME_SHIFT_MULTRES 3
+#endif
+#else
#error "Missing CFRAME_* definitions for this architecture"
#endif
diff --git a/lib/luajit/src/lj_ircall.h b/lib/luajit/src/lj_ircall.h
index 84e41ecfcc..1f44b03d67 100644
--- a/lib/luajit/src/lj_ircall.h
+++ b/lib/luajit/src/lj_ircall.h
@@ -270,6 +270,22 @@ LJ_DATA const CCallInfo lj_ir_callinfo[IRCALL__MAX+1];
#define fp64_f2l __aeabi_f2lz
#define fp64_f2ul __aeabi_f2ulz
#endif
+#elif LJ_TARGET_MIPS
+#define softfp_add __adddf3
+#define softfp_sub __subdf3
+#define softfp_mul __muldf3
+#define softfp_div __divdf3
+#define softfp_cmp __ledf2
+#define softfp_i2d __floatsidf
+#define softfp_d2i __fixdfsi
+#define softfp_ui2d __floatunsidf
+#define softfp_f2d __extendsfdf2
+#define softfp_d2ui __fixunsdfsi
+#define softfp_d2f __truncdfsf2
+#define softfp_i2f __floatsisf
+#define softfp_ui2f __floatunsisf
+#define softfp_f2i __fixsfsi
+#define softfp_f2ui __fixunssfsi
#else
#error "Missing soft-float definitions for target architecture"
#endif
diff --git a/lib/luajit/src/lj_opt_split.c b/lib/luajit/src/lj_opt_split.c
index 81ded6c0a0..4652c73786 100644
--- a/lib/luajit/src/lj_opt_split.c
+++ b/lib/luajit/src/lj_opt_split.c
@@ -596,7 +596,8 @@ static void split_ir(jit_State *J)
}
#endif
else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */
- /* Drop cast, since assembler doesn't care. */
+ /* Drop cast, since assembler doesn't care. But fwd both parts. */
+ hi = hiref;
goto fwdlo;
} else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. */
IRRef k31 = lj_ir_kint(J, 31);
diff --git a/lib/luajit/src/lj_profile.c b/lib/luajit/src/lj_profile.c
index f4d6fe18de..c7e53963b5 100644
--- a/lib/luajit/src/lj_profile.c
+++ b/lib/luajit/src/lj_profile.c
@@ -5,7 +5,6 @@
#define lj_profile_c
#define LUA_CORE
-#define _GNU_SOURCE 1
#include "lj_obj.h"
@@ -30,17 +29,6 @@
#define profile_lock(ps) UNUSED(ps)
#define profile_unlock(ps) UNUSED(ps)
-#if 1
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#endif
-
-
#elif LJ_PROFILE_PTHREAD
#include
@@ -74,8 +62,6 @@ typedef struct ProfileState {
SBuf sb; /* String buffer for stack dumps. */
int interval; /* Sample interval in milliseconds. */
int samples; /* Number of samples for next callback. */
- char *flavour; /* What generates profiling events. */
- int perf_event_fd; /* Performace event file descriptor */
int vmstate; /* VM state when profile timer triggered. */
#if LJ_PROFILE_SIGPROF
struct sigaction oldsa; /* Previous SIGPROF state. */
@@ -169,7 +155,7 @@ static void profile_trigger(ProfileState *ps)
mask = g->hookmask;
if (!(mask & (HOOK_PROFILE|HOOK_VMEVENT))) { /* Set profile hook. */
int st = g->vmstate;
- ps->vmstate = st >= 0 ? 256+st :
+ ps->vmstate = st >= 0 ? 'N' :
st == ~LJ_VMST_INTERP ? 'I' :
st == ~LJ_VMST_C ? 'C' :
st == ~LJ_VMST_GC ? 'G' : 'J';
@@ -190,178 +176,29 @@ static void profile_signal(int sig)
profile_trigger(&profile_state);
}
-
-static int perf_event_open(struct perf_event_attr *attr,
- pid_t pid, int cpu, int group_fd,
- unsigned long flags)
-{
- return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
-}
-
-
-static void register_prof_events(ProfileState *ps)
-{
- struct flavour_t {
- char *name; uint32_t type; uint64_t config;
- };
-
- static struct flavour_t flavours[] =
- {
- { "sw-cpu-clock",
- PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK },
-
- { "sw-context-switches",
- PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES },
-
- { "sw-page-faults",
- PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS },
-
- { "sw-minor-page-faults",
- PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN },
-
- { "sw-major-page-faults",
- PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ },
-
- { "branch-instructions",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
-
- { "cpu-cycles",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES },
-
- { "instructions",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS },
-
- { "cache-references",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES },
-
- { "cache-misses",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES },
-
- { "branch-instructions",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
-
- { "branch-misses",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES },
-
- { "bus-cycles",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES },
-
- { "stalled-cycles-frontend",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
-
- { "stalled-cycles-backend",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
-
- { "cpu-cycles",
- PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES },
-
- { 0, 0, 0 }
- };
-
-
- struct perf_event_attr attr = { };
-
- memset(&attr, 0, sizeof(struct perf_event_attr));
-
- const struct flavour_t *f;
- for (f = flavours; f->name != 0; f++)
- {
- if (strcmp (ps->flavour, f->name) == 0)
- {
- attr.type = f->type;
- attr.config = f->config;
- break;
- }
- }
-
- if (strcmp (ps->flavour, "?") == 0)
- {
- const struct flavour_t *f;
- fprintf (stderr, "I know: ");
- for (f = flavours; f->name != 0; f++)
- fprintf (stderr, "%s ", f->name);
- fprintf(stderr, "\n");
- }
- else if (! f->name)
- {
- fprintf (stderr, "unknown profiling flavour `%s', S[?] to list\n", ps->flavour);
- }
-
- attr.size = sizeof(struct perf_event_attr);
- attr.sample_type = PERF_SAMPLE_IP;
- /* attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; */
- attr.disabled=1;
- attr.pinned=1;
- attr.exclude_kernel=1;
- attr.exclude_hv=1;
-
- attr.sample_period = ps->interval;
- /* attr.watermark=0; */
- /* attr.wakeup_events=1; */
-
- int fd = perf_event_open(&attr, 0, -1, -1, 0);
- if (fd == -1)
- {
- printf ("! perf_event_open %m\n");
- }
-
- ps->perf_event_fd = fd;
-
- fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC);
- fcntl(fd, F_SETSIG, SIGPROF);
- fcntl(fd, F_SETOWN, getpid());
-
- ioctl(fd, PERF_EVENT_IOC_RESET, 0);
-
- int err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
- if (err != 0)
- printf ("! perf_events enable\n");
-}
-
-
-
/* Start profiling timer. */
static void profile_timer_start(ProfileState *ps)
{
- struct sigaction sa = {
- .sa_flags = SA_RESTART,
- .sa_handler = profile_signal
- };
-
+ int interval = ps->interval; /* Sample interval in milliseconds. */
+ struct itimerval tm;
+ struct sigaction sa;
+ tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000;
+ tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000;
+ sa.sa_flags = SA_RESTART;
+ sa.sa_handler = profile_signal;
sigemptyset(&sa.sa_mask);
sigaction(SIGPROF, &sa, &ps->oldsa);
+ setitimer(ITIMER_PROF, &tm, NULL); /* Arm the timer only after the SIGPROF handler is installed. */
-
- if (strcmp(ps->flavour, "vanilla") == 0)
- {
- int interval = ps->interval;
- struct itimerval tm;
- tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000;
- tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000;
- setitimer(ITIMER_PROF, &tm, NULL);
- }
- else
- {
- register_prof_events(ps);
- }
}
-
-
/* Stop profiling timer. */
static void profile_timer_stop(ProfileState *ps)
{
- if (ps->perf_event_fd)
- {
- ioctl(ps->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0);
- }
- else
- {
- struct itimerval tm;
- tm.it_value.tv_sec = tm.it_interval.tv_sec = 0;
- tm.it_value.tv_usec = tm.it_interval.tv_usec = 0;
- setitimer(ITIMER_PROF, &tm, NULL);
- sigaction(SIGPROF, &ps->oldsa, NULL);
- }
+ struct itimerval tm;
+ tm.it_value.tv_sec = tm.it_interval.tv_sec = 0;
+ tm.it_value.tv_usec = tm.it_interval.tv_usec = 0;
+ setitimer(ITIMER_PROF, &tm, NULL);
+ sigaction(SIGPROF, &ps->oldsa, NULL);
}
#elif LJ_PROFILE_PTHREAD
@@ -463,8 +300,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
{
ProfileState *ps = &profile_state;
int interval = LJ_PROFILE_INTERVAL_DEFAULT;
- char *flavour;
-
while (*mode) {
int m = *mode++;
switch (m) {
@@ -480,13 +315,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
lj_trace_flushall(L);
break;
#endif
- case 'S':
- {
- int k;
- if (sscanf (mode, "[%m[^]]]%n", &flavour, &k) > 0)
- mode += k;
- }
-
default: /* Ignore unknown mode chars. */
break;
}
@@ -500,7 +328,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode,
ps->cb = cb;
ps->data = data;
ps->samples = 0;
- ps->flavour = flavour;
lj_buf_init(L, &ps->sb);
profile_timer_start(ps);
}
diff --git a/lib/luajit/src/lj_snap.c b/lib/luajit/src/lj_snap.c
index fa9abb7475..62515ed0f6 100644
--- a/lib/luajit/src/lj_snap.c
+++ b/lib/luajit/src/lj_snap.c
@@ -26,9 +26,6 @@
#include "lj_cdata.h"
#endif
-/* Some local macros to save typing. Undef'd at the end. */
-#define IR(ref) (&J->cur.ir[(ref)])
-
/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
@@ -73,7 +70,7 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots)
IRRef ref = tref_ref(tr);
if (ref) {
SnapEntry sn = SNAP_TR(s, tr);
- IRIns *ir = IR(ref);
+ IRIns *ir = &J->cur.ir[ref];
if (!(sn & (SNAP_CONT|SNAP_FRAME)) &&
ir->o == IR_SLOAD && ir->op1 == s && ref > retf) {
/* No need to snapshot unmodified non-inherited slots. */
@@ -407,24 +404,24 @@ static TRef snap_pref(jit_State *J, GCtrace *T, SnapEntry *map, MSize nmax,
}
/* Check whether a sunk store corresponds to an allocation. Slow path. */
-static int snap_sunk_store2(jit_State *J, IRIns *ira, IRIns *irs)
+static int snap_sunk_store2(GCtrace *T, IRIns *ira, IRIns *irs)
{
if (irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
irs->o == IR_FSTORE || irs->o == IR_XSTORE) {
- IRIns *irk = IR(irs->op1);
+ IRIns *irk = &T->ir[irs->op1]; /* First operand of the store, looked up in the IR of trace T. */
if (irk->o == IR_AREF || irk->o == IR_HREFK)
- irk = IR(irk->op1);
- return (IR(irk->op1) == ira);
+ irk = &T->ir[irk->op1]; /* Step from the AREF/HREFK to its own first operand. */
+ return (&T->ir[irk->op1] == ira); /* Match only if the chain ends at the allocation ira. */
}
return 0;
}
/* Check whether a sunk store corresponds to an allocation. Fast path. */
-static LJ_AINLINE int snap_sunk_store(jit_State *J, IRIns *ira, IRIns *irs)
+static LJ_AINLINE int snap_sunk_store(GCtrace *T, IRIns *ira, IRIns *irs)
{
if (irs->s != 255)
return (ira + irs->s == irs); /* Fast check. */
- return snap_sunk_store2(J, ira, irs);
+ return snap_sunk_store2(T, ira, irs);
}
/* Replay snapshot state to setup side trace. */
@@ -487,7 +484,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
} else {
IRIns *irs;
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
if (snap_pref(J, T, map, nent, seen, irs->op2) == 0)
snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1);
else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) &&
@@ -521,13 +518,13 @@ void lj_snap_replay(jit_State *J, GCtrace *T)
op2 = emitir_raw(IRT(IR_HIOP, IRT_I64), op2,
snap_pref(J, T, map, nent, seen, (ir+1)->op2));
}
- J->slot[snap_slot(sn)] = emitir(ir->ot, op1, op2);
+ J->slot[snap_slot(sn)] = emitir(ir->ot & ~(IRT_MARK|IRT_ISPHI), op1, op2);
} else {
IRIns *irs;
TRef tr = emitir(ir->ot, op1, op2);
J->slot[snap_slot(sn)] = tr;
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
IRIns *irr = &T->ir[irs->op1];
TRef val, key = irr->op2, tmp = tr;
if (irr->o != IR_FREF) {
@@ -714,8 +711,9 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
if (ir->o == IR_CNEW || ir->o == IR_CNEWI) {
CTState *cts = ctype_cts(J->L);
CTypeID id = (CTypeID)T->ir[ir->op1].i;
- CTSize sz = lj_ctype_size(cts, id);
- GCcdata *cd = lj_cdata_new(cts, id, sz);
+ CTSize sz;
+ CTInfo info = lj_ctype_info(cts, id, &sz);
+ GCcdata *cd = lj_cdata_newx(cts, id, sz, info);
setcdataV(J->L, o, cd);
if (ir->o == IR_CNEWI) {
uint8_t *p = (uint8_t *)cdataptr(cd);
@@ -729,7 +727,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
} else {
IRIns *irs, *irlast = &T->ir[T->snap[snapno].ref];
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
IRIns *iro = &T->ir[T->ir[irs->op1].op2];
uint8_t *p = (uint8_t *)cd;
CTSize szs;
@@ -762,7 +760,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex,
settabV(J->L, o, t);
irlast = &T->ir[T->snap[snapno].ref];
for (irs = ir+1; irs < irlast; irs++)
- if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) {
+ if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) {
IRIns *irk = &T->ir[irs->op1];
TValue tmp, *val;
lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE ||
@@ -863,7 +861,6 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
return pc;
}
-#undef IR
#undef emitir_raw
#undef emitir
diff --git a/lib/luajit/src/lj_vm.h b/lib/luajit/src/lj_vm.h
index b31e22f70f..cb76d7a700 100644
--- a/lib/luajit/src/lj_vm.h
+++ b/lib/luajit/src/lj_vm.h
@@ -50,7 +50,7 @@ LJ_ASMF void lj_vm_exit_handler(void);
LJ_ASMF void lj_vm_exit_interp(void);
/* Internal math helper functions. */
-#if LJ_TARGET_PPC || LJ_TARGET_ARM64
+#if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
#define lj_vm_floor floor
#define lj_vm_ceil ceil
#else
diff --git a/lib/luajit/src/lua.h b/lib/luajit/src/lua.h
index c83fd3bbe7..352d29f3cd 100644
--- a/lib/luajit/src/lua.h
+++ b/lib/luajit/src/lua.h
@@ -226,6 +226,7 @@ LUA_API int (lua_status) (lua_State *L);
#define LUA_GCSTEP 5
#define LUA_GCSETPAUSE 6
#define LUA_GCSETSTEPMUL 7
+#define LUA_GCISRUNNING 9
LUA_API int (lua_gc) (lua_State *L, int what, int data);
diff --git a/lib/luajit/src/vm_arm.dasc b/lib/luajit/src/vm_arm.dasc
index af722f9eac..acc0853bb7 100644
--- a/lib/luajit/src/vm_arm.dasc
+++ b/lib/luajit/src/vm_arm.dasc
@@ -372,6 +372,17 @@ static void build_subroutines(BuildCtx *ctx)
| str CARG1, [BASE, #-4] // Prepend false to error message.
| st_vmstate CARG2
| b ->vm_returnc
+ |
+ |->vm_unwind_ext: // Complete external unwind.
+#if !LJ_NO_UNWIND
+ | push {r0, r1, r2, lr}
+ | bl extern _Unwind_Complete
+ | ldr r0, [sp]
+ | bl extern _Unwind_DeleteException
+ | pop {r0, r1, r2, lr}
+ | mov r0, r1
+ | bx r2
+#endif
|
|//-----------------------------------------------------------------------
|//-- Grow stack for calls -----------------------------------------------
diff --git a/lib/luajit/src/vm_mips.dasc b/lib/luajit/src/vm_mips.dasc
index 134ed569e8..0dba129316 100644
--- a/lib/luajit/src/vm_mips.dasc
+++ b/lib/luajit/src/vm_mips.dasc
@@ -1,6 +1,9 @@
|// Low-level VM code for MIPS CPUs.
|// Bytecode interpreter, fast functions and helper functions.
|// Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h
+|//
+|// MIPS soft-float support contributed by Djordje Kovacevic and
+|// Stefan Pejic from RT-RK.com, sponsored by Cisco Systems, Inc.
|
|.arch mips
|.section code_op, code_sub
@@ -18,6 +21,12 @@
|// Fixed register assignments for the interpreter.
|// Don't use: r0 = 0, r26/r27 = reserved, r28 = gp, r29 = sp, r31 = ra
|
+|.macro .FPU, a, b // Emits "a, b" only when FPU is defined. Takes exactly two comma-separated parts.
+|.if FPU
+| a, b
+|.endif
+|.endmacro
+|
|// The following must be C callee-save (but BASE is often refetched).
|.define BASE, r16 // Base of current Lua stack frame.
|.define KBASE, r17 // Constants of current Lua function.
@@ -31,7 +40,9 @@
|
|// Constants for type-comparisons, stores and conversions. C callee-save.
|.define TISNIL, r30
+|.if FPU
|.define TOBIT, f30 // 2^52 + 2^51.
+|.endif
|
|// The following temporaries are not saved across C calls, except for RA.
|.define RA, r23 // Callee-save.
@@ -46,6 +57,13 @@
|.define TMP2, r14
|.define TMP3, r15
|
+|.if not FPU
+|.define SFT1, r2
+|.define SFT2, r3
+|.define SFT3, r4
+|.define SFT4, r5
+|.endif
+|
|// Calling conventions.
|.define CFUNCADDR, r25
|.define CARG1, r4
@@ -56,13 +74,16 @@
|.define CRET1, r2
|.define CRET2, r3
|
+|.if FPU
|.define FARG1, f12
|.define FARG2, f14
|
|.define FRET1, f0
|.define FRET2, f2
+|.endif
|
|// Stack layout while in interpreter. Must match with lj_frame.h.
+|.if FPU // MIPS32 hard-float.
|.define CFRAME_SPACE, 112 // Delta for sp.
|
|.define SAVE_ERRF, 124(sp) // 32 bit C frame info.
@@ -83,43 +104,76 @@
|.define ARG5_OFS, 16
|.define SAVE_MULTRES, ARG5
|
+|//-----------------------------------------------------------------------
+|.else // MIPS32 soft-float.
+|
+|.define CFRAME_SPACE, 88 // Delta for sp.
+|
+|.define SAVE_ERRF, 100(sp) // 32 bit C frame info.
+|.define SAVE_NRES, 96(sp)
+|.define SAVE_CFRAME, 92(sp)
+|.define SAVE_L, 88(sp)
+|//----- 8 byte aligned, ^^^^ 16 byte register save area, owned by interpreter.
+|.define SAVE_GPR_, 48 // .. 48+10*4: 32 bit GPR saves.
+|.define SAVE_PC, 44(sp)
+|.define TEMP_SAVE_6, 40(sp)
+|.define TEMP_SAVE_5, 36(sp)
+|.define TEMP_SAVE_4, 32(sp)
+|.define TEMP_SAVE_3, 28(sp)
+|.define TEMP_SAVE_2, 24(sp)
+|.define TEMP_SAVE_1, 20(sp)
+|//----- 8 byte aligned, ^^^^ 24 byte register save area, owned by caller.
+|.define ARG5, 16(sp)
+|.define CSAVE_4, 12(sp)
+|.define CSAVE_3, 8(sp)
+|.define CSAVE_2, 4(sp)
+|.define CSAVE_1, 0(sp)
+|//----- 8 byte aligned, ^^^^ 16 byte register save area, owned by callee.
+|
+|.define ARG5_OFS, 16
+|.define SAVE_MULTRES, ARG5
+|
+|.endif
+|
+|//-----------------------------------------------------------------------
+|
|.macro saveregs
| addiu sp, sp, -CFRAME_SPACE
| sw ra, SAVE_GPR_+9*4(sp)
| sw r30, SAVE_GPR_+8*4(sp)
-| sdc1 f30, SAVE_FPR_+5*8(sp)
+| .FPU sdc1 f30, SAVE_FPR_+5*8(sp)
| sw r23, SAVE_GPR_+7*4(sp)
| sw r22, SAVE_GPR_+6*4(sp)
-| sdc1 f28, SAVE_FPR_+4*8(sp)
+| .FPU sdc1 f28, SAVE_FPR_+4*8(sp)
| sw r21, SAVE_GPR_+5*4(sp)
| sw r20, SAVE_GPR_+4*4(sp)
-| sdc1 f26, SAVE_FPR_+3*8(sp)
+| .FPU sdc1 f26, SAVE_FPR_+3*8(sp)
| sw r19, SAVE_GPR_+3*4(sp)
| sw r18, SAVE_GPR_+2*4(sp)
-| sdc1 f24, SAVE_FPR_+2*8(sp)
+| .FPU sdc1 f24, SAVE_FPR_+2*8(sp)
| sw r17, SAVE_GPR_+1*4(sp)
| sw r16, SAVE_GPR_+0*4(sp)
-| sdc1 f22, SAVE_FPR_+1*8(sp)
-| sdc1 f20, SAVE_FPR_+0*8(sp)
+| .FPU sdc1 f22, SAVE_FPR_+1*8(sp)
+| .FPU sdc1 f20, SAVE_FPR_+0*8(sp)
|.endmacro
|
|.macro restoreregs_ret
| lw ra, SAVE_GPR_+9*4(sp)
| lw r30, SAVE_GPR_+8*4(sp)
-| ldc1 f30, SAVE_FPR_+5*8(sp)
+| .FPU ldc1 f30, SAVE_FPR_+5*8(sp)
| lw r23, SAVE_GPR_+7*4(sp)
| lw r22, SAVE_GPR_+6*4(sp)
-| ldc1 f28, SAVE_FPR_+4*8(sp)
+| .FPU ldc1 f28, SAVE_FPR_+4*8(sp)
| lw r21, SAVE_GPR_+5*4(sp)
| lw r20, SAVE_GPR_+4*4(sp)
-| ldc1 f26, SAVE_FPR_+3*8(sp)
+| .FPU ldc1 f26, SAVE_FPR_+3*8(sp)
| lw r19, SAVE_GPR_+3*4(sp)
| lw r18, SAVE_GPR_+2*4(sp)
-| ldc1 f24, SAVE_FPR_+2*8(sp)
+| .FPU ldc1 f24, SAVE_FPR_+2*8(sp)
| lw r17, SAVE_GPR_+1*4(sp)
| lw r16, SAVE_GPR_+0*4(sp)
-| ldc1 f22, SAVE_FPR_+1*8(sp)
-| ldc1 f20, SAVE_FPR_+0*8(sp)
+| .FPU ldc1 f22, SAVE_FPR_+1*8(sp)
+| .FPU ldc1 f20, SAVE_FPR_+0*8(sp)
| jr ra
| addiu sp, sp, CFRAME_SPACE
|.endmacro
@@ -270,6 +324,61 @@
|.macro call_extern; jalr CFUNCADDR; .endmacro
|.macro jmp_extern; jr CFUNCADDR; .endmacro
|
+|// Converts int from given reg to double, result in CRET1 and CRET2 regs.
+|.if not FPU
+|.macro cvti2d, arg
+| load_got __floatsidf
+| call_extern
+|. move CARG1, arg
+|.endmacro
+|.endif
+|
+|// Loads a double-word floating-point value from src: into fpr (FPU) or into the gpr1/gpr2 pair (no FPU).
+|.macro load_double, fpr, gpr1, gpr2, src
+|.if FPU
+| ldc1 fpr, src
+|.else
+| lw gpr1, src // Caveat: gpr1 must not be the base register of src, which is read again below.
+| lw gpr2, 4+src
+|.endif
+|.endmacro
+|
+|// Stores a double-word floating-point value.
+|.macro store_double, fpr, gpr1, gpr2, dst
+|.if FPU
+| sdc1 fpr, dst
+|.else
+| sw gpr1, dst
+| sw gpr2, 4+dst
+|.endif
+|.endmacro
+|
+|// Loads the first double-word floating-point argument.
+|.macro load_farg1, src
+| load_double FARG1, CARG1, CARG2, src
+|.endmacro
+|
+|// Loads the second double-word floating-point argument.
+|.macro load_farg2, src
+| load_double FARG2, CARG3, CARG4, src
+|.endmacro
+|
+|.macro load_double1, src
+| load_double f0, SFT1, SFT2, src
+|.endmacro
+|
+|.macro store_double1, dst
+| store_double f0, SFT1, SFT2, dst
+|.endmacro
+|
+|.macro load_double2, src
+| load_double f2, SFT3, SFT4, src
+|.endmacro
+|
+|.macro store_double2, dst
+| store_double f2, SFT3, SFT4, dst
+|.endmacro
+|
|.macro hotcheck, delta, target
| srl TMP1, PC, 1
| andi TMP1, TMP1, 126
@@ -354,9 +463,9 @@ static void build_subroutines(BuildCtx *ctx)
|. sll TMP2, TMP2, 3
|1:
| addiu TMP1, TMP1, -8
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
| addiu RA, RA, 8
- | sdc1 f0, 0(BASE)
+ | store_double1 0(BASE)
| bnez TMP1, <1
|. addiu BASE, BASE, 8
|
@@ -425,15 +534,15 @@ static void build_subroutines(BuildCtx *ctx)
| and sp, CARG1, AT
|->vm_unwind_ff_eh: // Landing pad for external unwinder.
| lw L, SAVE_L
- | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
| li TISNIL, LJ_TNIL
| lw BASE, L->base
| lw DISPATCH, L->glref // Setup pointer to dispatch table.
- | mtc1 TMP3, TOBIT
+ | .FPU mtc1 TMP3, TOBIT
| li TMP1, LJ_TFALSE
| li_vmstate INTERP
| lw PC, FRAME_PC(BASE) // Fetch PC of previous frame.
- | cvt.d.s TOBIT, TOBIT
+ | .FPU cvt.d.s TOBIT, TOBIT
| addiu RA, BASE, -8 // Results start at BASE-8.
| addiu DISPATCH, DISPATCH, GG_G2DISP
| sw TMP1, HI(RA) // Prepend false to error message.
@@ -498,11 +607,11 @@ static void build_subroutines(BuildCtx *ctx)
| lw BASE, L->base
| lw TMP1, L->top
| lw PC, FRAME_PC(BASE)
- | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
| subu RD, TMP1, BASE
- | mtc1 TMP3, TOBIT
+ | .FPU mtc1 TMP3, TOBIT
| sb r0, L->status
- | cvt.d.s TOBIT, TOBIT
+ | .FPU cvt.d.s TOBIT, TOBIT
| li_vmstate INTERP
| addiu RD, RD, 8
| st_vmstate
@@ -540,13 +649,13 @@ static void build_subroutines(BuildCtx *ctx)
|3: // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype).
| sw L, DISPATCH_GL(cur_L)(DISPATCH)
| lw TMP2, L->base // TMP2 = old base (used in vmeta_call).
- | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
| lw TMP1, L->top
- | mtc1 TMP3, TOBIT
+ | .FPU mtc1 TMP3, TOBIT
| addu PC, PC, BASE
| subu NARGS8:RC, TMP1, BASE
| subu PC, PC, TMP2 // PC = frame delta + frame type
- | cvt.d.s TOBIT, TOBIT
+ | .FPU cvt.d.s TOBIT, TOBIT
| li_vmstate INTERP
| li TISNIL, LJ_TNIL
| st_vmstate
@@ -628,7 +737,7 @@ static void build_subroutines(BuildCtx *ctx)
|->cont_cat: // RA = resultptr, RB = meta base
| lw INS, -4(PC)
| addiu CARG2, RB, -16
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
| decode_RB8a MULTRES, INS
| decode_RA8a RA, INS
| decode_RB8b MULTRES
@@ -636,11 +745,21 @@ static void build_subroutines(BuildCtx *ctx)
| addu TMP1, BASE, MULTRES
| sw BASE, L->base
| subu CARG3, CARG2, TMP1
+ |.if FPU
| bne TMP1, CARG2, ->BC_CAT_Z
|. sdc1 f0, 0(CARG2)
| addu RA, BASE, RA
| b ->cont_nop
|. sdc1 f0, 0(RA)
+ |.else
+ | sw SFT1, 0(CARG2)
+ | bne TMP1, CARG2, ->BC_CAT_Z
+ |. sw SFT2, 4(CARG2)
+ | addu RA, BASE, RA
+ | sw SFT1, 0(RA)
+ | b ->cont_nop
+ |. sw SFT2, 4(RA)
+ |.endif
|
|//-- Table indexing metamethods -----------------------------------------
|
@@ -663,10 +782,19 @@ static void build_subroutines(BuildCtx *ctx)
|. sw TMP1, HI(CARG3)
|
|->vmeta_tgetb: // TMP0 = index
+ |.if FPU
| mtc1 TMP0, f0
| cvt.d.w f0, f0
| addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
| sdc1 f0, 0(CARG3)
+ |.else
+ | sw CARG2, TEMP_SAVE_1 // Save CARG2 across the conversion call, it is used later by lj_meta_tget.
+ | cvti2d TMP0 // Convert the integer index to a double in CRET1/CRET2.
+ | addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
+ | sw CRET1, 0(CARG3) // Store the converted key in tmptv.
+ | sw CRET2, 4(CARG3)
+ | lw CARG2, TEMP_SAVE_1 // Restore CARG2.
+ |.endif
|
|->vmeta_tgetv:
|1:
@@ -678,9 +806,9 @@ static void build_subroutines(BuildCtx *ctx)
| // Returns TValue * (finished) or NULL (metamethod).
| beqz CRET1, >3
|. addiu TMP1, BASE, -FRAME_CONT
- | ldc1 f0, 0(CRET1)
+ | load_double2 0(CRET1)
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double2 0(RA)
| ins_next2
|
|3: // Call __index metamethod.
@@ -699,8 +827,14 @@ static void build_subroutines(BuildCtx *ctx)
| // Returns cTValue * or NULL.
| beqz CRET1, >1
|. nop
+ |.if FPU
| b ->BC_TGETR_Z
|. ldc1 f0, 0(CRET1)
+ |.else
+ | lw SFT1, 0(CRET1)
+ | b ->BC_TGETR_Z
+ |. lw SFT2, 4(CRET1)
+ |.endif
|
|//-----------------------------------------------------------------------
|
@@ -723,10 +857,19 @@ static void build_subroutines(BuildCtx *ctx)
|. sw TMP1, HI(CARG3)
|
|->vmeta_tsetb: // TMP0 = index
+ |.if FPU
| mtc1 TMP0, f0
| cvt.d.w f0, f0
| addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
| sdc1 f0, 0(CARG3)
+ |.else
+ | sw CARG2, TEMP_SAVE_1 // Save CARG2 across the conversion call.
+ | cvti2d TMP0 // Convert the integer index to a double in CRET1/CRET2.
+ | addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
+ | sw CRET1, 0(CARG3) // Store the converted key in tmptv.
+ | sw CRET2, 4(CARG3)
+ | lw CARG2, TEMP_SAVE_1 // Restore CARG2.
+ |.endif
|
|->vmeta_tsetv:
|1:
@@ -736,11 +879,17 @@ static void build_subroutines(BuildCtx *ctx)
| call_intern lj_meta_tset // (lua_State *L, TValue *o, TValue *k)
|. move CARG1, L
| // Returns TValue * (finished) or NULL (metamethod).
+ |.if FPU
| beqz CRET1, >3
- |. ldc1 f0, 0(RA)
+ |. ldc1 f2, 0(RA)
+ |.else
+ | lw SFT3, 0(RA)
+ | beqz CRET1, >3
+ |. lw SFT4, 4(RA)
+ |.endif
| // NOBARRIER: lj_meta_tset ensures the table is not black.
| ins_next1
- | sdc1 f0, 0(CRET1)
+ | store_double2 0(CRET1)
| ins_next2
|
|3: // Call __newindex metamethod.
@@ -750,7 +899,7 @@ static void build_subroutines(BuildCtx *ctx)
| sw PC, -16+HI(BASE) // [cont|PC]
| subu PC, BASE, TMP1
| lw LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here.
- | sdc1 f0, 16(BASE) // Copy value to third argument.
+ | store_double2 16(BASE) // Copy value to third argument.
| b ->vm_call_dispatch_f
|. li NARGS8:RC, 24 // 3 args for func(t, k, v)
|
@@ -793,11 +942,17 @@ static void build_subroutines(BuildCtx *ctx)
|
|->cont_ra: // RA = resultptr
| lbu TMP1, -4+OFS_RA(PC)
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
| sll TMP1, TMP1, 3
| addu TMP1, BASE, TMP1
+ |.if FPU
| b ->cont_nop
|. sdc1 f0, 0(TMP1)
+ |.else
+ | sw SFT1, 0(TMP1)
+ | b ->cont_nop
+ |. sw SFT2, 4(TMP1)
+ |.endif
|
|->cont_condt: // RA = resultptr
| lw TMP0, HI(RA)
@@ -852,7 +1007,22 @@ static void build_subroutines(BuildCtx *ctx)
|//-- Arithmetic metamethods ---------------------------------------------
|
|->vmeta_unm:
- | move CARG4, CARG3
+ | b ->vmeta_arith
+ |. move CARG4, CARG3
+ |
+ |->vmeta_arith_vn:
+ | addu CARG3, BASE, RB
+ | b ->vmeta_arith
+ |. addu CARG4, KBASE, RC
+ |
+ |->vmeta_arith_nv:
+ | addu CARG4, BASE, RB
+ | b ->vmeta_arith
+ |. addu CARG3, KBASE, RC
+ |
+ |->vmeta_arith_vv:
+ | addu CARG3, BASE, RB
+ | addu CARG4, BASE, RC
|
|->vmeta_arith:
| load_got lj_meta_arith
@@ -985,9 +1155,9 @@ static void build_subroutines(BuildCtx *ctx)
|.macro .ffunc_n, name // Caveat: has delay slot!
|->ff_ .. name:
| lw CARG3, HI(BASE)
+ | load_farg1 0(BASE)
| beqz NARGS8:RC, ->fff_fallback
- |. ldc1 FARG1, 0(BASE)
- | sltiu AT, CARG3, LJ_TISNUM
+ |. sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
|.endmacro
|
@@ -997,10 +1167,10 @@ static void build_subroutines(BuildCtx *ctx)
| lw CARG3, HI(BASE)
| bnez AT, ->fff_fallback
|. lw CARG4, 8+HI(BASE)
- | ldc1 FARG1, 0(BASE)
- | ldc1 FARG2, 8(BASE)
| sltiu TMP0, CARG3, LJ_TISNUM
| sltiu TMP1, CARG4, LJ_TISNUM
+ | load_farg1 0(BASE)
+ | load_farg2 8(BASE)
| and TMP0, TMP0, TMP1
| beqz TMP0, ->fff_fallback
|.endmacro
@@ -1027,8 +1197,8 @@ static void build_subroutines(BuildCtx *ctx)
| beq BASE, TMP2, ->fff_res // Done if exactly 1 argument.
|. sw CARG1, LO(RA)
|1:
- | ldc1 f0, 0(TMP1)
- | sdc1 f0, -8(TMP1)
+ | load_double1 0(TMP1)
+ | store_double1 -8(TMP1)
| bne TMP1, TMP2, <1
|. addiu TMP1, TMP1, 8
| b ->fff_res
@@ -1043,8 +1213,14 @@ static void build_subroutines(BuildCtx *ctx)
| not TMP1, TMP1
| sll TMP1, TMP1, 3
| addu TMP1, CFUNC:RB, TMP1
+ |.if HFABI
| b ->fff_resn
|. ldc1 FRET1, CFUNC:TMP1->upvalue
+ |.else
+ | lw CRET1, CFUNC:TMP1->upvalue[0].u32.hi // NOTE(review): fff_resn stores CRET1 at offset 0 and CRET2 at offset 4, which presumably assumes a big-endian layout (hi word first) -- confirm.
+ | b ->fff_resn
+ |. lw CRET2, CFUNC:TMP1->upvalue[0].u32.lo
+ |.endif
|
|//-- Base library: getters and setters ---------------------------------
|
@@ -1125,8 +1301,14 @@ static void build_subroutines(BuildCtx *ctx)
| call_intern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key)
|. move CARG1, L
| // Returns cTValue *.
+ |.if HFABI
| b ->fff_resn
|. ldc1 FRET1, 0(CRET1)
+ |.else
+ | lw CRET2, 4(CRET1)
+ | b ->fff_resn
+ |. lw CRET1, 0(CRET1)
+ |.endif
|
|//-- Base library: conversions ------------------------------------------
|
@@ -1136,8 +1318,14 @@ static void build_subroutines(BuildCtx *ctx)
| xori AT, NARGS8:RC, 8
| sltiu CARG1, CARG1, LJ_TISNUM
| movn CARG1, r0, AT
+ |.if HFABI
| beqz CARG1, ->fff_fallback // Exactly one number argument.
|. ldc1 FRET1, 0(BASE)
+ |.else
+ | lw CRET1, 0(BASE)
+ | beqz CARG1, ->fff_fallback // Exactly one number argument.
+ |. lw CRET2, 4(BASE)
+ |.endif
| b ->fff_resn
|. nop
|
@@ -1185,13 +1373,13 @@ static void build_subroutines(BuildCtx *ctx)
| // Returns 0 at end of traversal.
| beqz CRET1, ->fff_restv // End of traversal: return nil.
|. li CARG3, LJ_TNIL
- | ldc1 f0, 8(BASE) // Copy key and value to results.
+ | load_double1 8(BASE)
| addiu RA, BASE, -8
- | ldc1 f2, 16(BASE)
- | li RD, (2+1)*8
- | sdc1 f0, 0(RA)
+ | load_double2 16(BASE)
+ | store_double1 0(RA)
+ | store_double2 8(RA)
| b ->fff_res
- |. sdc1 f2, 8(RA)
+ |. li RD, (2+1)*8
|
|.ffunc_1 pairs
| li AT, LJ_TTAB
@@ -1199,16 +1387,32 @@ static void build_subroutines(BuildCtx *ctx)
|. lw PC, FRAME_PC(BASE)
#if LJ_52
| lw TAB:TMP2, TAB:CARG1->metatable
+ |.if FPU
| ldc1 f0, CFUNC:RB->upvalue[0]
+ |.else
+ | lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+ | lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+ |.endif
| bnez TAB:TMP2, ->fff_fallback
#else
+ |.if FPU
| ldc1 f0, CFUNC:RB->upvalue[0]
+ |.else
+ | lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+ | lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+ |.endif
#endif
|. addiu RA, BASE, -8
| sw TISNIL, 8+HI(BASE)
| li RD, (3+1)*8
+ |.if FPU
| b ->fff_res
|. sdc1 f0, 0(RA)
+ |.else // No FPU: SFT1/SFT2 hold the u32.hi/u32.lo words of the upvalue.
+ | sw SFT1, HI(RA) // Store by HI/LO so the words are not swapped on little-endian.
+ | b ->fff_res
+ |. sw SFT2, LO(RA)
+ |.endif
|
|.ffunc ipairs_aux
| sltiu AT, NARGS8:RC, 16
@@ -1216,35 +1420,55 @@ static void build_subroutines(BuildCtx *ctx)
| lw TAB:CARG1, LO(BASE)
| lw CARG4, 8+HI(BASE)
| bnez AT, ->fff_fallback
- |. ldc1 FARG2, 8(BASE)
- | addiu CARG3, CARG3, -LJ_TTAB
+ |. addiu CARG3, CARG3, -LJ_TTAB
| sltiu AT, CARG4, LJ_TISNUM
| li TMP0, 1
| movn AT, r0, CARG3
- | mtc1 TMP0, FARG1
| beqz AT, ->fff_fallback
|. lw PC, FRAME_PC(BASE)
+ |.if FPU
+ | ldc1 FARG2, 8(BASE)
+ | mtc1 TMP0, FARG1
| trunc.w.d FRET1, FARG2
| cvt.d.w FARG1, FARG1
- | lw TMP0, TAB:CARG1->asize
- | lw TMP1, TAB:CARG1->array
| mfc1 TMP2, FRET1
- | addiu RA, BASE, -8
| add.d FARG2, FARG2, FARG1
+ |.else
+ | sw CARG1, TEMP_SAVE_1 // Keep the table pointer across the helper calls below.
+ | cvti2d TMP0 // TMP0 is 1 here, converted to a double.
+ | sw CRET1, TEMP_SAVE_2 // Store result CRET1/CRET2=1 (double).
+ | sw CRET2, TEMP_SAVE_3
+ | lw CARG2, 8+4(BASE)
+ | load_got __fixdfsi
+ | call_extern // CRET1 = key at 8(BASE) converted to an integer.
+ |. lw CARG1, 8(BASE)
+ | sw CRET1, TEMP_SAVE_4
+ | load_got __adddf3
+ | lw CARG2, TEMP_SAVE_3
+ | lw CARG3, 8(BASE)
+ | lw CARG4, 8+4(BASE)
+ | call_extern // CRET1/CRET2 = 1.0 + key (the next key as a double).
+ |. lw CARG1, TEMP_SAVE_2
+ | lw TMP2, TEMP_SAVE_4 // TMP2 = integer key.
+ | lw CARG1, TEMP_SAVE_1 // CARG1 = table pointer again.
+ |.endif
+ | lw TMP0, TAB:CARG1->asize
+ | lw TMP1, TAB:CARG1->array
| addiu TMP2, TMP2, 1
| sltu AT, TMP2, TMP0
+ | addiu RA, BASE, -8
+ | store_double FARG2, CRET1, CRET2, 0(RA) // Store the next key before branching. The hash path at 2 returns it from 0(RA), too.
+ | beqz AT, >2 // Not in array part? (The following sll fills the delay slot.)
| sll TMP3, TMP2, 3
| addu TMP3, TMP1, TMP3
- | beqz AT, >2 // Not in array part?
- |. sdc1 FARG2, 0(RA)
| lw TMP2, HI(TMP3)
- | ldc1 f0, 0(TMP3)
+ | load_double1 0(TMP3)
|1:
| beq TMP2, TISNIL, ->fff_res // End of iteration, return 0 results.
|. li RD, (0+1)*8
- | li RD, (2+1)*8
+ | store_double1 8(RA)
| b ->fff_res
- |. sdc1 f0, 8(RA)
+ |. li RD, (2+1)*8
|2: // Check for empty hash part first. Otherwise call C function.
| lw TMP0, TAB:CARG1->hmask
| load_got lj_tab_getinth
@@ -1256,8 +1480,14 @@ static void build_subroutines(BuildCtx *ctx)
| beqz CRET1, ->fff_res
|. li RD, (0+1)*8
| lw TMP2, HI(CRET1)
+ |.if FPU
| b <1
|. ldc1 f0, 0(CRET1)
+ |.else
+ | lw SFT2, 4(CRET1)
+ | b <1
+ |. lw SFT1, 0(CRET1)
+ |.endif
|
|.ffunc_1 ipairs
| li AT, LJ_TTAB
@@ -1265,17 +1495,33 @@ static void build_subroutines(BuildCtx *ctx)
|. lw PC, FRAME_PC(BASE)
#if LJ_52
| lw TAB:TMP2, TAB:CARG1->metatable
+ |.if FPU
| ldc1 f0, CFUNC:RB->upvalue[0]
+ |.else
+ | lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+ | lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+ |.endif
| bnez TAB:TMP2, ->fff_fallback
#else
+ |.if FPU
| ldc1 f0, CFUNC:RB->upvalue[0]
+ |.else
+ | lw SFT1, CFUNC:RB->upvalue[0].u32.hi
+ | lw SFT2, CFUNC:RB->upvalue[0].u32.lo
+ |.endif
#endif
|. addiu RA, BASE, -8
| sw r0, 8+HI(BASE)
| sw r0, 8+LO(BASE)
| li RD, (3+1)*8
+ |.if FPU
| b ->fff_res
|. sdc1 f0, 0(RA)
+ |.else // No FPU: SFT1/SFT2 hold the u32.hi/u32.lo words of the upvalue.
+ | sw SFT1, HI(RA) // Store by HI/LO so the words are not swapped on little-endian.
+ | b ->fff_res
+ |. sw SFT2, LO(RA)
+ |.endif
|
|//-- Base library: catch errors ----------------------------------------
|
@@ -1295,8 +1541,12 @@ static void build_subroutines(BuildCtx *ctx)
| sltiu AT, NARGS8:RC, 16
| lw CARG4, 8+HI(BASE)
| bnez AT, ->fff_fallback
+ |.if FPU
|. ldc1 FARG2, 8(BASE)
- | ldc1 FARG1, 0(BASE)
+ |.else
+ |. lw CARG3, 8+LO(BASE)
+ |.endif
+ | load_double FARG1, CARG1, CARG2, 0(BASE)
| lbu TMP1, DISPATCH_GL(hookmask)(DISPATCH)
| li AT, LJ_TFUNC
| move TMP2, BASE
@@ -1304,9 +1554,14 @@ static void build_subroutines(BuildCtx *ctx)
| addiu BASE, BASE, 16
| // Remember active hook before pcall.
| srl TMP3, TMP3, HOOK_ACTIVE_SHIFT
+ |.if FPU
| sdc1 FARG2, 0(TMP2) // Swap function and traceback.
+ |.else
+ | sw CARG3, LO(TMP2)
+ | sw CARG4, HI(TMP2)
+ |.endif
| andi TMP3, TMP3, 1
- | sdc1 FARG1, 8(TMP2)
+ | store_double FARG1, CARG1, CARG2, 8(TMP2)
| addiu PC, TMP3, 16+FRAME_PCALL
| b ->vm_call_dispatch
|. addiu NARGS8:RC, NARGS8:RC, -16
@@ -1350,11 +1605,11 @@ static void build_subroutines(BuildCtx *ctx)
| move CARG3, CARG2
| sw BASE, L->top
|2: // Move args to coroutine.
- | ldc1 f0, 0(BASE)
+ | load_double1 0(BASE)
| sltu AT, BASE, TMP1
| beqz AT, >3
|. addiu BASE, BASE, 8
- | sdc1 f0, 0(CARG3)
+ | store_double1 0(CARG3)
| b <2
|. addiu CARG3, CARG3, 8
|3:
@@ -1380,10 +1635,10 @@ static void build_subroutines(BuildCtx *ctx)
| sw TMP2, L:RA->top // Clear coroutine stack.
| move TMP1, BASE
|5: // Move results from coroutine.
- | ldc1 f0, 0(TMP2)
+ | load_double1 0(TMP2)
| addiu TMP2, TMP2, 8
| sltu AT, TMP2, TMP3
- | sdc1 f0, 0(TMP1)
+ | store_double1 0(TMP1)
| bnez AT, <5
|. addiu TMP1, TMP1, 8
|6:
@@ -1408,12 +1663,12 @@ static void build_subroutines(BuildCtx *ctx)
|.if resume
| addiu TMP3, TMP3, -8
| li TMP1, LJ_TFALSE
- | ldc1 f0, 0(TMP3)
+ | load_double1 0(TMP3)
| sw TMP3, L:RA->top // Remove error from coroutine stack.
| li RD, (2+1)*8
| sw TMP1, -8+HI(BASE) // Prepend false to results.
| addiu RA, BASE, -8
- | sdc1 f0, 0(BASE) // Copy error message.
+ | store_double1 0(BASE) // Copy error message.
| b <7
|. andi TMP0, PC, FRAME_TYPE
|.else
@@ -1449,13 +1704,33 @@ static void build_subroutines(BuildCtx *ctx)
|
|//-- Math library -------------------------------------------------------
|
- |.ffunc_n math_abs
+ |.ffunc_1 math_abs
+ | load_farg1 0(BASE)
+ | sltiu AT, CARG3, LJ_TISNUM
+ | beqz AT, ->fff_fallback
+ |. nop
+ |.if FPU
|. abs.d FRET1, FARG1
+ |.else
+ |. lui TMP1, 0x8000 // Sign bit mask.
+ | and AT, CARG1, TMP1 // NOTE(review): tests the sign bit in CARG1 (word at 0(BASE)), which presumably assumes a big-endian layout -- confirm.
+ | move CRET2, CARG2
+ | beqz AT, ->fff_resn // Sign bit clear: return the argument unchanged.
+ |. move CRET1, CARG1
+ | xor CRET1, CARG1, TMP1 // Sign bit set: clear it, then fall through to fff_resn.
+ |.endif
+ |
|->fff_resn:
| lw PC, FRAME_PC(BASE)
| addiu RA, BASE, -8
+ |.if HFABI
| b ->fff_res1
|. sdc1 FRET1, -8(BASE)
+ |.else
+ | sw CRET1, -8(BASE)
+ | b ->fff_res1
+ |. sw CRET2, -8+4(BASE)
+ |.endif
|
|->fff_restv:
| // CARG3/CARG1 = TValue result.
@@ -1498,8 +1773,14 @@ static void build_subroutines(BuildCtx *ctx)
| sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
|. nop
+ |.if HFABI
| call_extern
|. ldc1 FARG1, 0(BASE)
+ |.else
+ | lw CARG1, 0(BASE)
+ | call_extern
+ |. lw CARG2, 4(BASE)
+ |.endif
| b ->fff_resn
|. nop
|.endmacro
@@ -1526,15 +1807,20 @@ static void build_subroutines(BuildCtx *ctx)
| math_round ceil
|
|.ffunc math_log
- | lw CARG3, HI(BASE)
| li AT, 8
| bne NARGS8:RC, AT, ->fff_fallback // Exactly 1 argument.
- |. load_got log
+ |. lw CARG3, HI(BASE)
| sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
- |. nop
+ |. load_got log
+ |.if HFABI
| call_extern
|. ldc1 FARG1, 0(BASE)
+ |.else
+ | lw CARG1, 0(BASE)
+ | call_extern
+ |. lw CARG2, 4(BASE)
+ |.endif
| b ->fff_resn
|. nop
|
@@ -1553,17 +1839,40 @@ static void build_subroutines(BuildCtx *ctx)
| math_extern2 atan2
| math_extern2 fmod
|
+ |.if FPU
|.ffunc_n math_sqrt
|. sqrt.d FRET1, FARG1
| b ->fff_resn
|. nop
+ |.else
+ | math_extern sqrt
+ |.endif
|
- |.ffunc_nn math_ldexp
+ |.ffunc_2 math_ldexp
+ | sltiu TMP0, CARG3, LJ_TISNUM
+ | sltiu TMP1, CARG4, LJ_TISNUM
+ | load_farg1 0(BASE)
+ | load_farg2 8(BASE)
+ | and TMP0, TMP0, TMP1
+ | beqz TMP0, ->fff_fallback
+ |.if FPU
+ | load_got ldexp
| trunc.w.d FARG2, FARG2
+ | call_extern
+ |. mfc1 CARG3, FARG2
+ |.else
+ | sw CARG1, TEMP_SAVE_1
+ | sw CARG2, TEMP_SAVE_2
+ | load_got __fixdfsi
+ | move CARG1, CARG3
+ | call_extern
+ |. move CARG2, CARG4
+ | lw CARG1, TEMP_SAVE_1
| load_got ldexp
- | mfc1 CARG3, FARG2
+ | lw CARG2, TEMP_SAVE_2
| call_extern
- |. nop
+ |. move CARG3, CRET1
+ |.endif
| b ->fff_resn
|. nop
|
@@ -1574,10 +1883,14 @@ static void build_subroutines(BuildCtx *ctx)
|. addiu CARG3, DISPATCH, DISPATCH_GL(tmptv)
| lw TMP1, DISPATCH_GL(tmptv)(DISPATCH)
| addiu RA, BASE, -8
+ | store_double FRET1, CRET1, CRET2, 0(RA)
+ |.if FPU
| mtc1 TMP1, FARG2
- | sdc1 FRET1, 0(RA)
| cvt.d.w FARG2, FARG2
- | sdc1 FARG2, 8(RA)
+ |.else
+ | cvti2d TMP1
+ |.endif
+ | store_double FARG2, CRET1, CRET2, 8(RA)
| b ->fff_res
|. li RD, (2+1)*8
|
@@ -1587,7 +1900,12 @@ static void build_subroutines(BuildCtx *ctx)
| call_extern
|. addiu CARG3, BASE, -8
| addiu RA, BASE, -8
+ |.if HFABI
| sdc1 FRET1, 0(BASE)
+ |.else
+ | sw CRET1, 0(BASE)
+ | sw CRET2, 4(BASE)
+ |.endif
| b ->fff_res
|. li RD, (2+1)*8
|
@@ -1595,25 +1913,73 @@ static void build_subroutines(BuildCtx *ctx)
|->ff_ .. name:
| lw CARG3, HI(BASE)
| beqz NARGS8:RC, ->fff_fallback
- |. ldc1 FRET1, 0(BASE)
- | sltiu AT, CARG3, LJ_TISNUM
+ |. sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
|. addu TMP2, BASE, NARGS8:RC
| addiu TMP1, BASE, 8
+ |.if HFABI
+ | ldc1 FRET1, 0(BASE)
| beq TMP1, TMP2, ->fff_resn
+ |.else
+ | lw CRET1, 0(BASE)
+ | lw CRET2, 4(BASE)
+ | beq TMP1, TMP2, ->fff_resn
+ |.endif
|1:
|. lw CARG3, HI(TMP1)
+ |.if HFABI
| ldc1 FARG1, 0(TMP1)
- | addiu TMP1, TMP1, 8
+ |.else
+ | lw CARG1, 0(TMP1)
+ | lw CARG2, 4(TMP1)
+ |.endif
| sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
+ |. addiu TMP1, TMP1, 8
+ |.if FPU
|.if ismax
- |. c.olt.d FARG1, FRET1
+ | c.olt.d FARG1, FRET1
|.else
- |. c.olt.d FRET1, FARG1
+ | c.olt.d FRET1, FARG1
|.endif
| bne TMP1, TMP2, <1
|. movf.d FRET1, FARG1
+ |.else
+ | load_got __ledf2
+ | sw TMP1, TEMP_SAVE_1
+ | sw TMP2, TEMP_SAVE_2
+ | sw CARG1, TEMP_SAVE_3
+ | sw CARG2, TEMP_SAVE_4
+ | sw CRET1, TEMP_SAVE_5
+ | sw CRET2, TEMP_SAVE_6
+ | move CARG3, CRET1
+ | call_extern
+ |. move CARG4, CRET2
+ | lw CARG4, TEMP_SAVE_6
+ | lw CARG3, TEMP_SAVE_5
+ | lw CARG2, TEMP_SAVE_4
+ | lw CARG1, TEMP_SAVE_3
+ | lw TMP2, TEMP_SAVE_2
+ | lw TMP1, TEMP_SAVE_1
+ |.if ismax
+ | beqz CRET1, >2 // farg1==fret1
+ |. li TMP3, 1
+ | beq CRET1, TMP3, >2 // farg1>fret1
+ |. nop
+ |.else
+ | blez CRET1, >2
+ |. nop
+ |.endif
+ | move CRET1, CARG3 // Keep the value.
+ | b >3
+ |. move CRET2, CARG4
+ |2:
+ | move CRET1, CARG1 // Set new value.
+ | move CRET2, CARG2
+ |3:
+ | bne TMP1, TMP2, <1
+ |. nop
+ |.endif
| b ->fff_resn
|. nop
|.endmacro
@@ -1632,32 +1998,52 @@ static void build_subroutines(BuildCtx *ctx)
| bnez AT, ->fff_fallback // Need exactly 1 string argument.
|. nop
| lw TMP0, STR:CARG1->len
- | lbu TMP1, STR:CARG1[1] // Access is always ok (NUL at end).
| addiu RA, BASE, -8
| sltu RD, r0, TMP0
- | mtc1 TMP1, f0
+ | lw PC, FRAME_PC(BASE)
| addiu RD, RD, 1
+ | lbu TMP1, STR:CARG1[1] // Access is always ok (NUL at end).
+ |.if FPU
+ | mtc1 TMP1, f0
| cvt.d.w f0, f0
- | lw PC, FRAME_PC(BASE)
- | sll RD, RD, 3 // RD = ((str->len != 0)+1)*8
+ | sdc1 f0, 0(RA)
+ |.else
+ | sw RD, TEMP_SAVE_1
+ | cvti2d TMP1
+ | sw CRET1, 0(RA)
+ | sw CRET2, 4(RA)
+ | lw RD, TEMP_SAVE_1
+ |.endif
| b ->fff_res
- |. sdc1 f0, 0(RA)
+ |. sll RD, RD, 3 // RD = ((str->len != 0)+1)*8
|
|.ffunc string_char // Only handle the 1-arg case here.
| ffgccheck
| lw CARG3, HI(BASE)
- | ldc1 FARG1, 0(BASE)
| li AT, 8
| bne NARGS8:RC, AT, ->fff_fallback // Exactly 1 argument.
|. sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
|. li CARG3, 1
- | trunc.w.d FARG1, FARG1
- | addiu CARG2, sp, ARG5_OFS
| sltiu AT, TMP0, 256
- | mfc1 TMP0, FARG1
| beqz AT, ->fff_fallback
- |. sw TMP0, ARG5
+ | load_farg1 0(BASE)
+ |.if FPU
+ | trunc.w.d FARG1, FARG1
+ | mfc1 TMP0, FARG1
+ |.else
+ | load_got __fixdfsi
+ | sw RB, TEMP_SAVE_1
+ | sw RC, TEMP_SAVE_2
+ | call_extern
+ |. sw CARG3, TEMP_SAVE_3
+ | lw CARG3, TEMP_SAVE_3
+ | lw RC, TEMP_SAVE_2
+ | lw RB, TEMP_SAVE_1
+ | move TMP0, CRET1
+ |.endif
+ | addiu CARG2, sp, ARG5_OFS
+ | sw TMP0, ARG5
|->fff_newstr:
| load_got lj_str_new
| sw BASE, L->base
@@ -1674,27 +2060,52 @@ static void build_subroutines(BuildCtx *ctx)
|.ffunc string_sub
| ffgccheck
| addiu AT, NARGS8:RC, -16
+ |.if FPU
+ | ldc1 f0, 16(BASE)
+ | trunc.w.d f0, f0
+ |.else
+ | lw CARG1, 16(BASE)
+ | load_got __fixdfsi
+ | sw AT, TEMP_SAVE_1
+ | call_extern
+ |. lw CARG2, 16+4(BASE)
+ | lw AT, TEMP_SAVE_1
+ |.endif
| lw CARG3, 16+HI(BASE)
- | ldc1 f0, 16(BASE)
| lw TMP0, HI(BASE)
| lw STR:CARG1, LO(BASE)
| bltz AT, ->fff_fallback
- | lw CARG2, 8+HI(BASE)
- | ldc1 f2, 8(BASE)
+ |. lw CARG2, 8+HI(BASE)
| beqz AT, >1
|. li CARG4, -1
- | trunc.w.d f0, f0
| sltiu AT, CARG3, LJ_TISNUM
| beqz AT, ->fff_fallback
+ |.if FPU
|. mfc1 CARG4, f0
+ |.else
+ |. move CARG4, CRET1
+ |.endif
|1:
| sltiu AT, CARG2, LJ_TISNUM
| beqz AT, ->fff_fallback
|. li AT, LJ_TSTR
- | trunc.w.d f2, f2
| bne TMP0, AT, ->fff_fallback
- |. lw CARG2, STR:CARG1->len
+ |.if FPU
+ |. ldc1 f2, 8(BASE)
+ | trunc.w.d f2, f2
| mfc1 CARG3, f2
+ |.else
+ |. sw CARG1, TEMP_SAVE_1
+ | sw CARG4, TEMP_SAVE_2
+ | lw CARG2, 8+4(BASE)
+ | load_got __fixdfsi
+ | call_extern
+ |. lw CARG1, 8(BASE)
+ | lw CARG1, TEMP_SAVE_1
+ | lw CARG4, TEMP_SAVE_2
+ | move CARG3, CRET1
+ |.endif
+ | lw CARG2, STR:CARG1->len
| // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end
| slt AT, CARG4, r0
| addiu TMP0, CARG2, 1
@@ -1749,10 +2160,58 @@ static void build_subroutines(BuildCtx *ctx)
|
|//-- Bit library --------------------------------------------------------
|
+ |.if not FPU
+ |// FP number to bit conversion for soft-float. In: CARG1/CARG2 (sign is taken from CARG1). Out: CRET1. Writes TMP0, TMP3, CARG2, CARG4.
+ |->vm_tobit:
+ | sll TMP0, CARG1, 1 // Drop the sign bit of CARG1.
+ | lui TMP3, 0x0020 // TMP3 = 0x00200000.
+ | addu TMP0, TMP0, TMP3 // Add 1 at bit 21 (presumably the exponent LSB after the shift; confirm).
+ | slt TMP3, TMP0, r0 // TMP3 = 1 iff bit 31 of the sum is set.
+ | movz CARG2, r0, TMP3 // Bit clear: the result is 0 (presumably |x| < 1).
+ | beqz TMP3, >2 // ... and return it.
+ |. li CARG4, 0x3e0 // Delay slot.
+ | not CARG4, CARG4 // CARG4 = ~0x3e0.
+ | sra TMP0, TMP0, 21 // Arithmetic shift: keep the top 11 bits, sign-extended.
+ | subu TMP0, CARG4, TMP0 // TMP0 = right-shift count used below.
+ | slt TMP3, TMP0, r0 // Negative shift count?
+ | bnez TMP3, >1 // Yes: handled at 1: (presumably |x| >= 2^32; confirm).
+ |. sll CARG4, CARG1, 11 // Delay slot: bits 20..0 of CARG1 move to bits 31..11.
+ | lui TMP3, 0x8000
+ | or CARG4, CARG4, TMP3 // Force bit 31 (presumably the implicit leading 1).
+ | srl TMP3, CARG2, 21 // Top 11 bits of CARG2 ...
+ | or CARG4, CARG4, TMP3 // ... fill bits 10..0.
+ | slt TMP3, CARG1, r0 // TMP3 = 1 iff the sign bit of CARG1 is set.
+ | beqz TMP3, >2 // Sign clear: return the shifted value as is.
+ |. srlv CARG2, CARG4, TMP0 // Delay slot (runs either way): CARG2 = CARG4 >> TMP0.
+ | subu CARG2, r0, CARG2 // Sign set: negate.
+ |2:
+ | jr ra
+ |. move CRET1, CARG2 // Delay slot: result goes to CRET1.
+ |1:
+ | addiu TMP0, TMP0, 21
+ | srlv CARG4, CARG2, TMP0 // CARG4 = CARG2 >> TMP0 (srlv only uses the low 5 bits of TMP0).
+ | li TMP3, 20
+ | subu TMP0, TMP3, TMP0 // TMP0 = 20 - TMP0.
+ | sll CARG2, CARG1, 12 // Bits 19..0 of CARG1 move to the top.
+ | sllv TMP3, CARG2, TMP0
+ | or CARG2, CARG4, TMP3 // Combine both parts.
+ | slt TMP3, CARG1, r0 // TMP3 = 1 iff the sign bit of CARG1 is set.
+ | beqz TMP3, <2 // Sign clear: return CARG2 through 2:.
+ |. nop
+ | jr ra
+ |. subu CRET1, r0, CARG2 // Delay slot: sign set, return -CARG2.
+ |.endif
+ |
|.macro .ffunc_bit, name
| .ffunc_n bit_..name
+ |.if FPU
|. add.d FARG1, FARG1, TOBIT
| mfc1 CRET1, FARG1
+ |.else
+ |. nop
+ | bal ->vm_tobit
+ |. nop
+ |.endif
|.endmacro
|
|.macro .ffunc_bit_op, name, ins
@@ -1760,14 +2219,27 @@ static void build_subroutines(BuildCtx *ctx)
| addiu TMP1, BASE, 8
| addu TMP2, BASE, NARGS8:RC
|1:
+ | move CRET2, CRET1
| lw CARG4, HI(TMP1)
+ |.if FPU
| beq TMP1, TMP2, ->fff_resi
|. ldc1 FARG1, 0(TMP1)
+ |.else
+ | lw CARG1, 0(TMP1)
+ | beq TMP1, TMP2, ->fff_resi
+ |. lw CARG2, 4(TMP1)
+ |.endif
| sltiu AT, CARG4, LJ_TISNUM
| beqz AT, ->fff_fallback
- | add.d FARG1, FARG1, TOBIT
- | mfc1 CARG2, FARG1
- | ins CRET1, CRET1, CARG2
+ |.if FPU
+ |. add.d FARG1, FARG1, TOBIT
+ | mfc1 CRET1, FARG1
+ |.else
+ |. nop
+ | bal ->vm_tobit
+ |. nop
+ |.endif
+ | ins CRET1, CRET2, CRET1
| b <1
|. addiu TMP1, TMP1, 8
|.endmacro
@@ -1794,10 +2266,22 @@ static void build_subroutines(BuildCtx *ctx)
|
|.macro .ffunc_bit_sh, name, ins, shmod
| .ffunc_nn bit_..name
+ |.if FPU
|. add.d FARG1, FARG1, TOBIT
| add.d FARG2, FARG2, TOBIT
| mfc1 CARG1, FARG1
| mfc1 CARG2, FARG2
+ |.else
+ |. sw CARG4, TEMP_SAVE_1
+ | bal ->vm_tobit
+ |. nop
+ | move CRET2, CRET1
+ | lw CARG2, TEMP_SAVE_1
+ | bal ->vm_tobit
+ |. move CARG1, CARG3
+ | move CARG2, CRET1
+ | move CARG1, CRET2
+ |.endif
|.if shmod == 1
| li AT, 32
| subu TMP0, AT, CARG2
@@ -1822,9 +2306,19 @@ static void build_subroutines(BuildCtx *ctx)
|
|.ffunc_bit tobit
|->fff_resi:
+ | lw PC, FRAME_PC(BASE)
+ | addiu RA, BASE, -8
+ |.if HFABI
| mtc1 CRET1, FRET1
- | b ->fff_resn
- |. cvt.d.w FRET1, FRET1
+ | cvt.d.w FRET1, FRET1
+ | b ->fff_res1
+ |. sdc1 FRET1, -8(BASE)
+ |.else // Result already in CRET1.
+ | cvti2d CRET1
+ | sw CRET1, -8(BASE)
+ | b ->fff_res1
+ |. sw CRET2, -8+4(BASE)
+ |.endif
|
|//-----------------------------------------------------------------------
|
@@ -2082,14 +2576,23 @@ static void build_subroutines(BuildCtx *ctx)
|//-----------------------------------------------------------------------
|
|.macro savex_, a, b
+ |.if FPU
| sdc1 f..a, 16+a*8(sp)
| sw r..a, 16+32*8+a*4(sp)
| sw r..b, 16+32*8+b*4(sp)
+ |.else
+ | sw r..a, 16+a*4(sp)
+ | sw r..b, 16+b*4(sp)
+ |.endif
|.endmacro
|
|->vm_exit_handler:
|.if JIT
+ |.if FPU
| addiu sp, sp, -(16+32*8+32*4)
+ |.else
+ | addiu sp, sp, -(16+32*4)
+ |.endif
| savex_ 0, 1
| savex_ 2, 3
| savex_ 4, 5
@@ -2104,17 +2607,25 @@ static void build_subroutines(BuildCtx *ctx)
| savex_ 22, 23
| savex_ 24, 25
| savex_ 26, 27
+ |.if FPU
| sdc1 f28, 16+28*8(sp)
- | sw r28, 16+32*8+28*4(sp)
| sdc1 f30, 16+30*8(sp)
+ | sw r28, 16+32*8+28*4(sp)
| sw r30, 16+32*8+30*4(sp)
| sw r0, 16+32*8+31*4(sp) // Clear RID_TMP.
+ | addiu TMP2, sp, 16+32*8+32*4 // Recompute original value of sp.
+ | sw TMP2, 16+32*8+29*4(sp) // Store sp in RID_SP
+ |.else
+ | sw r28, 16+28*4(sp)
+ | sw r30, 16+30*4(sp)
+ | sw r0, 16+31*4(sp) // Clear RID_TMP.
+ | addiu TMP2, sp, 16+32*4 // Recompute original value of sp.
+ | sw TMP2, 16+29*4(sp) // Store sp in RID_SP
+ |.endif
| li_vmstate EXIT
- | addiu TMP2, sp, 16+32*8+32*4 // Recompute original value of sp.
| addiu DISPATCH, JGL, -GG_DISP2G-32768
| lw TMP1, 0(TMP2) // Load exit number.
| st_vmstate
- | sw TMP2, 16+32*8+29*4(sp) // Store sp in RID_SP.
| lw L, DISPATCH_GL(cur_L)(DISPATCH)
| lw BASE, DISPATCH_GL(jit_base)(DISPATCH)
| load_got lj_trace_exit
@@ -2144,15 +2655,15 @@ static void build_subroutines(BuildCtx *ctx)
|1:
| bltz CRET1, >9 // Check for error from exit.
|. lw LFUNC:RB, FRAME_FUNC(BASE)
- | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
| sll MULTRES, CRET1, 3
| li TISNIL, LJ_TNIL
| sw MULTRES, SAVE_MULTRES
- | mtc1 TMP3, TOBIT
+ | .FPU mtc1 TMP3, TOBIT
| lw TMP1, LFUNC:RB->pc
| sw r0, DISPATCH_GL(jit_base)(DISPATCH)
| lw KBASE, PC2PROTO(k)(TMP1)
- | cvt.d.s TOBIT, TOBIT
+ | .FPU cvt.d.s TOBIT, TOBIT
| // Modified copy of ins_next which handles function header dispatch, too.
| lw INS, 0(PC)
| addiu PC, PC, 4
@@ -2160,7 +2671,7 @@ static void build_subroutines(BuildCtx *ctx)
| sw TISNIL, DISPATCH_GL(vmstate)(DISPATCH)
| decode_OP4a TMP1, INS
| decode_OP4b TMP1
- | sltiu TMP2, TMP1, BC_FUNCF*4 // Function header?
+ | sltiu TMP2, TMP1, BC_FUNCF*4
| addu TMP0, DISPATCH, TMP1
| decode_RD8a RD, INS
| lw AT, 0(TMP0)
@@ -2202,7 +2713,7 @@ static void build_subroutines(BuildCtx *ctx)
|//-----------------------------------------------------------------------
|
|// Modifies AT, TMP0, FRET1, FRET2, f4. Keeps all others incl. FARG1.
- |.macro vm_round, func
+ |.macro vm_round_hf, func
| lui TMP0, 0x4330 // Hiword of 2^52 (double).
| mtc1 r0, f4
| mtc1 TMP0, f5
@@ -2244,6 +2755,25 @@ static void build_subroutines(BuildCtx *ctx)
|. mov.d FRET1, FARG1
|.endmacro
|
+ |.macro vm_round_sf, func // Soft-float rounding: call the extern named by func, then return through ra.
+ | addiu sp, sp, -8 // Reserve 8 bytes of stack.
+ | load_got func
+ | sw ra, 0(sp) // Save ra; it is reloaded after the call.
+ | call_extern
+ |. nop
+ | lw ra, 0(sp)
+ | jr ra
+ |. addiu sp, sp, 8 // Delay slot: release the stack space.
+ |.endmacro
+ |
+ |.macro vm_round, func // Expands to the FPU or the soft-float rounding macro.
+ |.if FPU
+ | vm_round_hf, func // FPU build: inline rounding sequence.
+ |.else
+ | vm_round_sf, func // No FPU: call the extern routine instead.
+ |.endif
+ |.endmacro
+ |
|->vm_floor:
| vm_round floor
|->vm_ceil:
@@ -2272,10 +2802,10 @@ static void build_subroutines(BuildCtx *ctx)
| sw r1, CTSTATE->cb.slot
| sw CARG1, CTSTATE->cb.gpr[0]
| sw CARG2, CTSTATE->cb.gpr[1]
- | sdc1 FARG1, CTSTATE->cb.fpr[0]
+ | .FPU sdc1 FARG1, CTSTATE->cb.fpr[0]
| sw CARG3, CTSTATE->cb.gpr[2]
| sw CARG4, CTSTATE->cb.gpr[3]
- | sdc1 FARG2, CTSTATE->cb.fpr[1]
+ | .FPU sdc1 FARG2, CTSTATE->cb.fpr[1]
| addiu TMP0, sp, CFRAME_SPACE+16
| sw TMP0, CTSTATE->cb.stack
| sw r0, SAVE_PC // Any value outside of bytecode is ok.
@@ -2286,14 +2816,14 @@ static void build_subroutines(BuildCtx *ctx)
| lw BASE, L:CRET1->base
| lw RC, L:CRET1->top
| move L, CRET1
- | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
+ | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float).
| lw LFUNC:RB, FRAME_FUNC(BASE)
- | mtc1 TMP3, TOBIT
+ | .FPU mtc1 TMP3, TOBIT
| li_vmstate INTERP
| li TISNIL, LJ_TNIL
| subu RC, RC, BASE
| st_vmstate
- | cvt.d.s TOBIT, TOBIT
+ | .FPU cvt.d.s TOBIT, TOBIT
| ins_callt
|.endif
|
@@ -2307,11 +2837,11 @@ static void build_subroutines(BuildCtx *ctx)
| move CARG2, RA
| call_intern lj_ccallback_leave // (CTState *cts, TValue *o)
|. move CARG1, CTSTATE
+ | .FPU ldc1 FRET1, CTSTATE->cb.fpr[0]
| lw CRET1, CTSTATE->cb.gpr[0]
- | ldc1 FRET1, CTSTATE->cb.fpr[0]
- | lw CRET2, CTSTATE->cb.gpr[1]
+ | .FPU ldc1 FRET2, CTSTATE->cb.fpr[1]
| b ->vm_leave_unw
- |. ldc1 FRET2, CTSTATE->cb.fpr[1]
+ |. lw CRET2, CTSTATE->cb.gpr[1]
|.endif
|
|->vm_ffi_call: // Call C function via FFI.
@@ -2343,8 +2873,8 @@ static void build_subroutines(BuildCtx *ctx)
| lw CARG2, CCSTATE->gpr[1]
| lw CARG3, CCSTATE->gpr[2]
| lw CARG4, CCSTATE->gpr[3]
- | ldc1 FARG1, CCSTATE->fpr[0]
- | ldc1 FARG2, CCSTATE->fpr[1]
+ | .FPU ldc1 FARG1, CCSTATE->fpr[0]
+ | .FPU ldc1 FARG2, CCSTATE->fpr[1]
| jalr CFUNCADDR
|. lw CARG1, CCSTATE->gpr[0] // Do this last, since CCSTATE is CARG1.
| lw CCSTATE:TMP1, -12(r16)
@@ -2352,8 +2882,10 @@ static void build_subroutines(BuildCtx *ctx)
| lw ra, -4(r16)
| sw CRET1, CCSTATE:TMP1->gpr[0]
| sw CRET2, CCSTATE:TMP1->gpr[1]
- | sdc1 FRET1, CCSTATE:TMP1->fpr[0]
- | sdc1 FRET2, CCSTATE:TMP1->fpr[1]
+ | .FPU sdc1 FRET1, CCSTATE:TMP1->fpr[0]
+ | .FPU sdc1 FRET2, CCSTATE:TMP1->fpr[1]
+ | sw CARG1, CCSTATE:TMP1->gpr[2] // MIPS32 soft-float.
+ | sw CARG2, CCSTATE:TMP1->gpr[3] // Complex doubles are returned in v0, v1, a0, a1.
| move sp, r16
| jr ra
|. move r16, TMP2
@@ -2381,8 +2913,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu CARG3, BASE, RD
| lw TMP0, HI(CARG2)
| lw TMP1, HI(CARG3)
- | ldc1 f0, 0(CARG2)
- | ldc1 f2, 0(CARG3)
| sltiu TMP0, TMP0, LJ_TISNUM
| sltiu TMP1, TMP1, LJ_TISNUM
| lhu TMP2, OFS_RD(PC)
@@ -2390,8 +2920,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addiu PC, PC, 4
| beqz TMP0, ->vmeta_comp
|. lui TMP1, (-(BCBIAS_J*4 >> 16) & 65535)
+ | load_double f0, CARG1, CARG2, 0(CARG2)
+ |.if FPU
+ | ldc1 f2, 0(CARG3)
+ |.else
+ | lw CARG4, 4(CARG3)
+ | lw CARG3, 0(CARG3)
+ |.endif
| decode_RD4b TMP2
| addu TMP2, TMP2, TMP1
+ |.if FPU
if (op == BC_ISLT || op == BC_ISGE) {
| c.olt.d f0, f2
} else {
@@ -2402,8 +2940,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
} else {
| movt TMP2, r0
}
- | addu PC, PC, TMP2
+ |.else
+ | load_got __ledf2
+ | sw RD, TEMP_SAVE_1 // RD, TMP1 and TMP2 are saved around the call.
+ | sw TMP1, TEMP_SAVE_2
+ | call_extern // CRET1 = __ledf2(CARG1:CARG2, CARG3:CARG4); only its sign/zero is tested below.
+ |. sw TMP2, TEMP_SAVE_3
+ | lw TMP2, TEMP_SAVE_3
+ | lw TMP1, TEMP_SAVE_2
+ if (op == BC_ISLT) {
+ | bltz CRET1, >1 // NOTE(review): relies on __ledf2 giving a three-way result (<0, 0, >0); confirm for the target libgcc.
+ } else if (op == BC_ISLE) {
+ | blez CRET1, >1
+ } else if (op == BC_ISGT) {
+ | bgtz CRET1, >1
+ } else {
+ | bgez CRET1, >1
+ }
+ |. lw RD, TEMP_SAVE_1 // Delay slot: RD is restored on both paths.
+ | move TMP2, r0 // Condition false: clear the jump offset.
|1:
+ |.endif
+ | addu PC, PC, TMP2
| ins_next
break;
@@ -2413,24 +2971,43 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu RA, BASE, RA
| addiu PC, PC, 4
| lw TMP0, HI(RA)
- | ldc1 f0, 0(RA)
| addu RD, BASE, RD
| lhu TMP2, -4+OFS_RD(PC)
- | lw TMP1, HI(RD)
- | ldc1 f2, 0(RD)
| lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
+ | lw TMP1, HI(RD)
+ | decode_RD4b TMP2
| sltiu AT, TMP0, LJ_TISNUM
| sltiu CARG1, TMP1, LJ_TISNUM
- | decode_RD4b TMP2
+ | load_double f2, CARG3, CARG4, 0(RD)
+ | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
| and AT, AT, CARG1
+ | load_double f0, CARG1, CARG2, 0(RA)
| beqz AT, >5
|. addu TMP2, TMP2, TMP3
+ |.if FPU
| c.eq.d f0, f2
if (vk) {
| movf TMP2, r0
} else {
| movt TMP2, r0
}
+ |.else
+ | load_got __ledf2
+ | sw RD, TEMP_SAVE_1
+ | call_extern
+ |. sw TMP2, TEMP_SAVE_2
+ | lw RD, TEMP_SAVE_1
+ | lw TMP2, TEMP_SAVE_2
+ if (vk) {
+ | beqz CRET1, >4
+ |. nop
+ } else {
+ | bnez CRET1, >4
+ |. nop
+ }
+ | move TMP2, r0
+ |4:
+ |.endif
|1:
| addu PC, PC, TMP2
| ins_next
@@ -2507,10 +3084,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu RA, BASE, RA
| addiu PC, PC, 4
| lw TMP0, HI(RA)
- | ldc1 f0, 0(RA)
+ | load_double f0, CARG1, CARG2, 0(RA)
| addu RD, KBASE, RD
| lhu TMP2, -4+OFS_RD(PC)
- | ldc1 f2, 0(RD)
+ | load_double f2, CARG3, CARG4, 0(RD)
| lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
| sltiu AT, TMP0, LJ_TISNUM
| decode_RD4b TMP2
@@ -2520,6 +3097,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| beqz AT, >1
|.endif
|. addu TMP2, TMP2, TMP3
+ |.if FPU
| c.eq.d f0, f2
if (vk) {
| movf TMP2, r0
@@ -2530,6 +3108,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|1:
| addu PC, PC, TMP2
}
+ |.else
+ | load_got __ledf2
+ | sw RD, TEMP_SAVE_1
+ | call_extern
+ |. sw TMP2, TEMP_SAVE_2
+ | lw RD, TEMP_SAVE_1
+ | lw TMP2, TEMP_SAVE_2
+ if (vk) {
+ | beqz CRET1, >4
+ |. nop
+ | move TMP2, r0
+ |4:
+ | addu PC, PC, TMP2
+ |1:
+ } else {
+ | bnez CRET1, >1
+ |. nop
+ | move TMP2, r0
+ |1:
+ | addu PC, PC, TMP2
+ }
+ |.endif
| ins_next
|.if FFI
|5:
@@ -2588,7 +3188,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu PC, PC, TMP2
} else {
| sltiu TMP0, TMP0, LJ_TISTRUECOND
- | ldc1 f0, 0(RD)
+ | load_double1 0(RD)
if (op == BC_ISTC) {
| beqz TMP0, >1
} else {
@@ -2598,7 +3198,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| decode_RD4b TMP2
| lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
| addu TMP2, TMP2, TMP3
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| addu PC, PC, TMP2
|1:
}
@@ -2631,9 +3231,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| // RA = dst*8, RD = src*8
| addu RD, BASE, RD
| addu RA, BASE, RA
- | ldc1 f0, 0(RD)
+ | load_double1 0(RD)
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
break;
case BC_NOT:
@@ -2653,12 +3253,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu CARG3, BASE, RD
| addu RA, BASE, RA
| lw TMP0, HI(CARG3)
- | ldc1 f0, 0(CARG3)
| sltiu AT, TMP0, LJ_TISNUM
+ | load_double f0, CARG1, CARG2, 0(CARG3)
+ |.if FPU
| beqz AT, ->vmeta_unm
|. neg.d f0, f0
+ |.else
+ | lui TMP1, 0x8000
+ | xor CRET1, TMP1, CARG1
+ | beqz AT, ->vmeta_unm
+ |. move CRET2, CARG2
+ |.endif
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double f0, CRET1, CRET2, 0(RA)
| ins_next2
break;
case BC_LEN:
@@ -2672,10 +3279,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|. li AT, LJ_TTAB
| lw CRET1, STR:CARG1->len
|1:
+ |.if FPU
| mtc1 CRET1, f0
| cvt.d.w f0, f0
+ |.else
+ | cvti2d CRET1
+ |.endif
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double f0, CRET1, CRET2, 0(RA)
| ins_next2
|2:
| bne TMP0, AT, ->vmeta_len
@@ -2717,72 +3328,142 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu CARG3, BASE, RB
| addu CARG4, KBASE, RC
| lw TMP1, HI(CARG3)
- | ldc1 f20, 0(CARG3)
- | ldc1 f22, 0(CARG4)
- | sltiu AT, TMP1, LJ_TISNUM
+ | sltiu AT, TMP1, LJ_TISNUM
+ | load_double f20, CARG1, CARG2, 0(CARG3)
+ | load_double f22, CARG3, CARG4, 0(CARG4)
+ |.if FPU
+ | beqz AT, ->vmeta_arith
+ |.else
+ | beqz AT, ->vmeta_arith_vn
+ |.endif
+ |. addu RA, BASE, RA
|| break;
||case 1:
| addu CARG4, BASE, RB
| addu CARG3, KBASE, RC
| lw TMP1, HI(CARG4)
- | ldc1 f22, 0(CARG4)
- | ldc1 f20, 0(CARG3)
- | sltiu AT, TMP1, LJ_TISNUM
+ | sltiu AT, TMP1, LJ_TISNUM
+ | load_double f20, CARG1, CARG2, 0(CARG3)
+ | load_double f22, CARG3, CARG4, 0(CARG4)
+ |.if FPU
+ | beqz AT, ->vmeta_arith
+ |.else
+ | beqz AT, ->vmeta_arith_nv
+ |.endif
+ |. addu RA, BASE, RA
|| break;
||default:
| addu CARG3, BASE, RB
| addu CARG4, BASE, RC
| lw TMP1, HI(CARG3)
| lw TMP2, HI(CARG4)
- | ldc1 f20, 0(CARG3)
- | ldc1 f22, 0(CARG4)
- | sltiu AT, TMP1, LJ_TISNUM
- | sltiu TMP0, TMP2, LJ_TISNUM
- | and AT, AT, TMP0
+ | sltiu AT, TMP1, LJ_TISNUM
+ | sltiu TMP0, TMP2, LJ_TISNUM
+ | and AT, AT, TMP0
+ | load_double f20, CARG1, CARG2, 0(CARG3)
+ | load_double f22, CARG3, CARG4, 0(CARG4)
+ |.if FPU
+ | beqz AT, ->vmeta_arith
+ |.else
+ | beqz AT, ->vmeta_arith_vv
+ |.endif
+ |. addu RA, BASE, RA
|| break;
||}
- | beqz AT, ->vmeta_arith
- |. addu RA, BASE, RA
|.endmacro
|
+ |.macro ins_arithfallback // Unconditional branch to the vmeta_arith_* target selected by vk.
+ ||switch (vk) {
+ ||case 0:
+ | b ->vmeta_arith_vn // vk == 0.
+ |. nop
+ || break;
+ ||case 1:
+ | b ->vmeta_arith_nv // vk == 1.
+ |. nop
+ || break;
+ ||default:
+ | b ->vmeta_arith_vv // Any other vk.
+ |. nop
+ || break;
+ ||}
+ |.endmacro
+ |
+ |.if FPU
|.macro fpmod, a, b, c
|->BC_MODVN_Z:
- | bal ->vm_floor // floor(b/c)
+ | bal ->vm_floor // floor(b/c)
|. div.d FARG1, b, c
| mul.d a, FRET1, c
- | sub.d a, b, a // b - floor(b/c)*c
+ | sub.d a, b, a // b - floor(b/c)*c
|.endmacro
+ |.else
|
- |.macro ins_arith, ins
+ |.macro sfpmod // Soft-float modulo: first - floor(first/second)*second, built from extern calls.
+ |->BC_MODVN_Z:
+ | load_got __divdf3
+ | sw CARG1, TEMP_SAVE_1 // Keep the first operand (CARG1/CARG2) ...
+ | sw CARG2, TEMP_SAVE_2
+ | sw CARG3, TEMP_SAVE_3 // ... and the second operand (CARG3/CARG4) for the later calls.
+ | call_extern // __divdf3 on CARG1:CARG2 and CARG3:CARG4.
+ |. sw CARG4, TEMP_SAVE_4
+ | move CARG1, CRET1
+ | bal ->vm_floor // Round the quotient with vm_floor.
+ |. move CARG2, CRET2
+ | load_got __muldf3
+ | move CARG1, CRET1
+ | move CARG2, CRET2
+ | lw CARG3, TEMP_SAVE_3
+ | call_extern // __muldf3 on the floored quotient and the saved second operand.
+ |. lw CARG4, TEMP_SAVE_4
+ | load_got __subdf3
+ | lw CARG1, TEMP_SAVE_1
+ | lw CARG2, TEMP_SAVE_2
+ | move CARG3, CRET1
+ | call_extern // __subdf3 on the saved first operand and the product.
+ |. move CARG4, CRET2
+ |.endmacro
+ |.endif
+ |
+ |.macro ins_arith, intins, fpins, fpcall // intins is not referenced anywhere in this macro body.
| ins_arithpre
- |.if "ins" == "fpmod_"
- | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
+ |.if "fpins" == "fpmod_"
+ | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
|. nop
|.else
- | ins f0, f20, f22
+ |.if FPU
+ | fpins f0, f20, f22 // FPU build: operands in f20/f22, result in f0.
+ |.else
+ |.if "fpcall" == "sfpmod"
+ | sfpmod // Soft-float modulo expands the multi-call sfpmod macro.
+ |.else
+ | load_got fpcall // Soft-float: call the extern routine named by fpcall.
+ | call_extern
+ |. nop
+ |.endif
+ |.endif
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
|.endif
|.endmacro
case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
- | ins_arith add.d
+ | ins_arith addu, add.d, __adddf3
break;
case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
- | ins_arith sub.d
+ | ins_arith subu, sub.d, __subdf3
break;
case BC_MULVN: case BC_MULNV: case BC_MULVV:
- | ins_arith mul.d
+ | ins_arith mult, mul.d, __muldf3
break;
case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
- | ins_arith div.d
+ | ins_arith div, div.d, __divdf3
break;
case BC_MODVN:
- | ins_arith fpmod
- break;
+ | ins_arith modi, fpmod, sfpmod // NOTE(review): this patch removes the 'break;' that followed, so BC_MODVN now falls through into BC_MODNV/BC_MODVV; confirm this is intended.
case BC_MODNV: case BC_MODVV:
- | ins_arith fpmod_
+ | ins_arith modi, fpmod_, sfpmod
break;
case BC_POW:
| decode_RB8a RB, INS
@@ -2792,18 +3473,23 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu CARG4, BASE, RC
| lw TMP1, HI(CARG3)
| lw TMP2, HI(CARG4)
- | ldc1 FARG1, 0(CARG3)
- | ldc1 FARG2, 0(CARG4)
| sltiu AT, TMP1, LJ_TISNUM
| sltiu TMP0, TMP2, LJ_TISNUM
| and AT, AT, TMP0
| load_got pow
| beqz AT, ->vmeta_arith
|. addu RA, BASE, RA
+ | load_farg1 0(CARG3)
+ | load_farg2 0(CARG4)
| call_extern
|. nop
| ins_next1
+ |.if HFABI
| sdc1 FRET1, 0(RA)
+ |.else
+ | sw CRET1, 0(RA)
+ | sw CRET2, 4(RA)
+ |.endif
| ins_next2
break;
@@ -2826,10 +3512,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| bnez CRET1, ->vmeta_binop
|. lw BASE, L->base
| addu RB, BASE, MULTRES
- | ldc1 f0, 0(RB)
+ | load_double1 0(RB)
| addu RA, BASE, RA
| ins_next1
- | sdc1 f0, 0(RA) // Copy result from RB to RA.
+ | store_double1 0(RA)
| ins_next2
break;
@@ -2864,20 +3550,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
case BC_KSHORT:
| // RA = dst*8, RD = int16_literal*8
| sra RD, INS, 16
- | mtc1 RD, f0
| addu RA, BASE, RA
+ |.if FPU
+ | mtc1 RD, f0
| cvt.d.w f0, f0
+ |.else
+ | cvti2d RD
+ |.endif
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double f0, CRET1, CRET2, 0(RA)
| ins_next2
break;
case BC_KNUM:
| // RA = dst*8, RD = num_const*8
| addu RD, KBASE, RD
| addu RA, BASE, RA
- | ldc1 f0, 0(RD)
+ | load_double1 0(RD)
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
break;
case BC_KPRI:
@@ -2913,9 +3603,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lw UPVAL:RB, LFUNC:RD->uvptr
| ins_next1
| lw TMP1, UPVAL:RB->v
- | ldc1 f0, 0(TMP1)
+ | load_double1 0(TMP1)
| addu RA, BASE, RA
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
break;
case BC_USETV:
@@ -2924,14 +3614,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| srl RA, RA, 1
| addu RD, BASE, RD
| addu RA, RA, LFUNC:RB
- | ldc1 f0, 0(RD)
+ | load_double1 0(RD)
| lw UPVAL:RB, LFUNC:RA->uvptr
| lbu TMP3, UPVAL:RB->marked
| lw CARG2, UPVAL:RB->v
| andi TMP3, TMP3, LJ_GC_BLACK // isblack(uv)
| lbu TMP0, UPVAL:RB->closed
| lw TMP2, HI(RD)
- | sdc1 f0, 0(CARG2)
+ | store_double1 0(CARG2)
| li AT, LJ_GC_BLACK|1
| or TMP3, TMP3, TMP0
| beq TMP3, AT, >2 // Upvalue is closed and black?
@@ -2991,11 +3681,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| srl RA, RA, 1
| addu RD, KBASE, RD
| addu RA, RA, LFUNC:RB
- | ldc1 f0, 0(RD)
+ | load_double1 0(RD)
| lw UPVAL:RB, LFUNC:RA->uvptr
| ins_next1
| lw TMP1, UPVAL:RB->v
- | sdc1 f0, 0(TMP1)
+ | store_double1 0(TMP1)
| ins_next2
break;
case BC_USETP:
@@ -3126,13 +3816,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lw TMP2, HI(CARG3)
| lw TAB:RB, LO(CARG2)
| li AT, LJ_TTAB
- | ldc1 f0, 0(CARG3)
| bne TMP1, AT, ->vmeta_tgetv
|. addu RA, BASE, RA
| sltiu AT, TMP2, LJ_TISNUM
| beqz AT, >5
|. li AT, LJ_TSTR
- |
+ |.if FPU
+ | ldc1 f0, 0(CARG3)
| // Convert number key to integer, check for integerness and range.
| cvt.w.d f2, f0
| lw TMP0, TAB:RB->asize
@@ -3148,9 +3838,51 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lw TMP0, HI(TMP2)
| beq TMP0, TISNIL, >2
|. ldc1 f0, 0(TMP2)
+ |.else
+ | sw RB, TEMP_SAVE_1
+ | sw CARG2, TEMP_SAVE_3
+ | load_got __fixdfsi
+ | lw CARG1, 0(CARG3)
+ | lw CARG2, 4(CARG3)
+ | call_extern // cvt.w.d f2, f0
+ |. sw RC, TEMP_SAVE_2
+ | sw CRET1, TEMP_SAVE_4
+ | cvti2d CRET1 // cvt.d.w f4, f2
+ | load_got __ledf2
+ | lw RC, TEMP_SAVE_2
+ | addu CARG3, BASE, RC
+ | lw CARG1, 0(CARG3)
+ | lw CARG2, 4(CARG3)
+ | move CARG3, CRET1
+ | move CARG4, CRET2
+ | call_extern // c.eq.d f0, f4
+ |. nop
+ | lw CARG3, TEMP_SAVE_3
+ | lw RC, TEMP_SAVE_2
+ | lw RB, TEMP_SAVE_1
+ | lw TMP0, TAB:RB->asize
+ | lw TMP1, TAB:RB->array
+ | lw TMP2, TEMP_SAVE_4
+ | lw CARG2, TEMP_SAVE_3 // Restore old CARG2 and CARG3.
+ | addu CARG3, BASE, RC
+ | bnez CRET1, >3
+ |. sltu AT, TMP2, TMP0
+ | b >4
+ |. nop
+ |3:
+ | move AT, r0
+ |4:
+ | sll TMP2, TMP2, 3
+ | beqz AT, ->vmeta_tgetv // Integer key and in array part?
+ |. addu TMP2, TMP1, TMP2
+ | lw TMP0, HI(TMP2)
+ | lw SFT2, 4(TMP2)
+ | beq TMP0, TISNIL, >2
+ |. lw SFT1, 0(TMP2)
+ |.endif
|1:
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
|
|2: // Check for __index if table value is nil.
@@ -3246,10 +3978,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|. addu RC, TMP2, RC
| lw TMP1, HI(RC)
| beq TMP1, TISNIL, >5
- |. ldc1 f0, 0(RC)
+ |. nop
|1:
+ | load_double1 0(RC)
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
|
|5: // Check for __index if table value is nil.
@@ -3271,20 +4004,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu CARG2, BASE, RB
| addu CARG3, BASE, RC
| lw TAB:CARG1, LO(CARG2)
+ | lw TMP0, TAB:CARG1->asize
+ | lw TMP1, TAB:CARG1->array
+ |.if FPU
| ldc1 f0, 0(CARG3)
| trunc.w.d f2, f0
- | lw TMP0, TAB:CARG1->asize
| mfc1 CARG2, f2
- | lw TMP1, TAB:CARG1->array
+ |.else
+ | load_got __fixdfsi
+ | lw CARG1, 0(CARG3)
+ | call_extern
+ |. lw CARG2, 4(CARG3)
+ | move CARG2, CRET1
+ |.endif
| sltu AT, CARG2, TMP0
| sll TMP2, CARG2, 3
| beqz AT, ->vmeta_tgetr // In array part?
|. addu TMP2, TMP1, TMP2
- | ldc1 f0, 0(TMP2)
+ | load_double1 0(TMP2)
|->BC_TGETR_Z:
| addu RA, BASE, RA
| ins_next1
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| ins_next2
break;
@@ -3299,13 +4040,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lw TMP2, HI(CARG3)
| lw TAB:RB, LO(CARG2)
| li AT, LJ_TTAB
- | ldc1 f0, 0(CARG3)
| bne TMP1, AT, ->vmeta_tsetv
|. addu RA, BASE, RA
| sltiu AT, TMP2, LJ_TISNUM
| beqz AT, >5
|. li AT, LJ_TSTR
- |
+ |.if FPU
+ | ldc1 f0, 0(CARG3)
| // Convert number key to integer, check for integerness and range.
| cvt.w.d f2, f0
| lw TMP0, TAB:RB->asize
@@ -3326,6 +4067,52 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| andi AT, TMP3, LJ_GC_BLACK // isblack(table)
| bnez AT, >7
|. sdc1 f0, 0(TMP1)
+ |.else
+ | sw RB, TEMP_SAVE_1
+ | sw RC, TEMP_SAVE_2
+ | sw CARG2, TEMP_SAVE_3
+ | load_got __fixdfsi
+ | lw CARG1, 0(CARG3)
+ | call_extern // cvt.w.d f2, f0
+ |. lw CARG2, 4(CARG3)
+ | sw CRET1, TEMP_SAVE_4
+ | cvti2d CRET1 // cvt.d.w f4, f2
+ | load_got __ledf2
+ | lw RC, TEMP_SAVE_2
+ | addu CARG3, BASE, RC
+ | lw CARG1, 0(CARG3)
+ | lw CARG2, 4(CARG3)
+ | move CARG3, CRET1
+ | call_extern // c.eq.d f0, f4
+ |. move CARG4, CRET2
+ | lw RC, TEMP_SAVE_2
+ | lw RB, TEMP_SAVE_1
+ | lw TMP0, TAB:RB->asize
+ | lw TMP1, TAB:RB->array
+ | lw TMP2, TEMP_SAVE_4
+ | lw CARG2, TEMP_SAVE_3 // Restore old CARG2 and CARG3.
+ | addu CARG3, BASE, RC
+ | bnez CRET1, >4 // NaN?
+ |. sltu AT, TMP2, TMP0
+ | b >6
+ |. nop
+ |4:
+ | move AT, r0
+ |6:
+ | sll TMP2, TMP2, 3
+ | beqz AT, ->vmeta_tsetv // Integer key and in array part?
+ |. addu TMP1, TMP1, TMP2
+ | lbu TMP3, TAB:RB->marked
+ | lw TMP0, HI(TMP1)
+ | lw SFT1, 0(RA)
+ | beq TMP0, TISNIL, >3
+ |. lw SFT2, 4(RA)
+ |1:
+ | andi AT, TMP3, LJ_GC_BLACK // isblack(table)
+ | sw SFT1, 0(TMP1)
+ | bnez AT, >7
+ |. sw SFT2, 4(TMP1)
+ |.endif
|2:
| ins_next
|
@@ -3374,7 +4161,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| sll TMP1, TMP1, 3
| subu TMP1, TMP0, TMP1
| addu NODE:TMP2, NODE:TMP2, TMP1 // node = tab->node + (idx*32-idx*8)
- | ldc1 f20, 0(RA)
+ | load_double f20, SFT1, SFT2, 0(RA)
|1:
| lw CARG1, offsetof(Node, key)+HI(NODE:TMP2)
| lw TMP0, offsetof(Node, key)+LO(NODE:TMP2)
@@ -3388,8 +4175,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|. lw TAB:TMP0, TAB:RB->metatable
|2:
| andi AT, TMP3, LJ_GC_BLACK // isblack(table)
+ |.if FPU
| bnez AT, >7
|. sdc1 f20, NODE:TMP2->val
+ |.else
+ | sw SFT1, NODE:TMP2->val.u32.hi
+ | bnez AT, >7
+ |. sw SFT2, NODE:TMP2->val.u32.lo
+ |.endif
|3:
| ins_next
|
@@ -3417,6 +4210,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| beqz TMP0, ->vmeta_tsets // 'no __newindex' flag NOT set: check.
|. li AT, LJ_TSTR
|6:
+ |.if not FPU
+ | sw SFT1, TEMP_SAVE_1
+ | sw SFT2, TEMP_SAVE_2
+ |.endif
| load_got lj_tab_newkey
| sw STR:RC, LO(CARG3)
| sw AT, HI(CARG3)
@@ -3427,8 +4224,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|. move CARG1, L
| // Returns TValue *.
| lw BASE, L->base
+ |.if FPU
| b <3 // No 2nd write barrier needed.
|. sdc1 f20, 0(CRET1)
+ |.else
+ | lw SFT2, TEMP_SAVE_1
+ | lw SFT3, TEMP_SAVE_2
+ | sw SFT2, 0(CRET1)
+ | b <3
+ |. sw SFT3, 4(CRET1)
+ |.endif
|
|7: // Possible table write barrier for the value. Skip valiswhite check.
| barrierback TAB:RB, TMP3, TMP0, <3
@@ -3453,11 +4258,17 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lw TMP1, HI(RC)
| lbu TMP3, TAB:RB->marked
| beq TMP1, TISNIL, >5
- |. ldc1 f0, 0(RA)
|1:
- | andi AT, TMP3, LJ_GC_BLACK // isblack(table)
+ |. andi AT, TMP3, LJ_GC_BLACK // isblack(table)
+ | load_double1 0(RA)
+ |.if FPU
| bnez AT, >7
|. sdc1 f0, 0(RC)
+ |.else
+ | sw SFT1, 0(RC)
+ | bnez AT, >7
+ |. sw SFT2, 4(RC)
+ |.endif
|2:
| ins_next
|
@@ -3482,12 +4293,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| decode_RDtoRC8 RC, RD
| addu CARG1, BASE, RB
| addu CARG3, BASE, RC
- | lw TAB:CARG2, LO(CARG1)
+ |.if FPU
| ldc1 f0, 0(CARG3)
| trunc.w.d f2, f0
+ | mfc1 CARG3, f2
+ |.else
+ | load_got __fixdfsi
+ | sw CARG1, TEMP_SAVE_1
+ | lw CARG1, 0(CARG3)
+ | call_extern
+ |. lw CARG2, 4(CARG3)
+ | lw CARG1, TEMP_SAVE_1
+ | move CARG3, CRET1
+ |.endif
+ | lw TAB:CARG2, LO(CARG1)
| lbu TMP3, TAB:CARG2->marked
| lw TMP0, TAB:CARG2->asize
- | mfc1 CARG3, f2
| lw TMP1, TAB:CARG2->array
| andi AT, TMP3, LJ_GC_BLACK // isblack(table)
| bnez AT, >7
@@ -3495,12 +4316,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|2:
| sltu AT, CARG3, TMP0
| sll TMP2, CARG3, 3
+ |.if FPU
| beqz AT, ->vmeta_tsetr // In array part?
|. ldc1 f20, 0(RA)
| addu CRET1, TMP1, TMP2
|->BC_TSETR_Z:
+ |.else
+ | lw TMP0, 0(RA)
+ | lw TMP3, 4(RA)
+ | sw TMP0, TEMP_SAVE_1
+ | beqz AT, ->vmeta_tsetr // In array part?
+ |. sw TMP3, TEMP_SAVE_2
+ | addu CRET1, TMP1, TMP2
+ |->BC_TSETR_Z:
+ | lw TMP0, TEMP_SAVE_1
+ | lw TMP3, TEMP_SAVE_2
+ |.endif
| ins_next1
- | sdc1 f20, 0(CRET1)
+ | store_double f20, TMP0, TMP3, 0(CRET1)
| ins_next2
|
|7: // Possible table write barrier for the value. Skip valiswhite check.
@@ -3529,10 +4362,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| addu TMP1, TMP1, CARG1
| andi TMP0, TMP3, LJ_GC_BLACK // isblack(table)
|3: // Copy result slots to table.
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
| addiu RA, RA, 8
| sltu AT, RA, TMP2
- | sdc1 f0, 0(TMP1)
+ | store_double1 0(TMP1)
| bnez AT, <3
|. addiu TMP1, TMP1, 8
| bnez TMP0, >7
@@ -3607,10 +4440,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| beqz NARGS8:RC, >3
|. move TMP3, NARGS8:RC
|2:
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
| addiu RA, RA, 8
| addiu TMP3, TMP3, -8
- | sdc1 f0, 0(TMP2)
+ | store_double1 0(TMP2)
| bnez TMP3, <2
|. addiu TMP2, TMP2, 8
|3:
@@ -3647,12 +4480,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| li AT, LJ_TFUNC
| lw TMP1, -24+HI(BASE)
| lw LFUNC:RB, -24+LO(BASE)
- | ldc1 f2, -8(BASE)
- | ldc1 f0, -16(BASE)
+ | load_double1 -8(BASE)
+ | load_double2 -16(BASE)
| sw TMP1, HI(BASE) // Copy callable.
| sw LFUNC:RB, LO(BASE)
- | sdc1 f2, 16(BASE) // Copy control var.
- | sdc1 f0, 8(BASE) // Copy state.
+ | store_double1 16(BASE) // Copy control var.
+ | store_double2 8(BASE) // Copy state.
| addiu BASE, BASE, 8
| bne TMP1, AT, ->vmeta_call
|. li NARGS8:RC, 16 // Iterators get 2 arguments.
@@ -3676,19 +4509,29 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
|. sll TMP3, RC, 3
| addu TMP3, TMP1, TMP3
| lw TMP2, HI(TMP3)
- | ldc1 f0, 0(TMP3)
+ | load_double1 0(TMP3)
+ |.if FPU
| mtc1 RC, f2
+ |.else
+ | move CARG1, RC
+ |.endif
| lhu RD, -4+OFS_RD(PC)
| beq TMP2, TISNIL, <1 // Skip holes in array part.
|. addiu RC, RC, 1
+ | store_double1 8(RA)
+ |.if FPU
| cvt.d.w f2, f2
+ |.else
+ | load_got __floatsidf
+ | call_extern
+ |. nop
+ |.endif
| lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
- | sdc1 f0, 8(RA)
+ | store_double f2, CRET1, CRET2, 0(RA)
| decode_RD4b RD
| addu RD, RD, TMP3
| sw RC, -8+LO(RA) // Update control var.
| addu PC, PC, RD
- | sdc1 f2, 0(RA)
|3:
| ins_next
|
@@ -3704,17 +4547,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| subu TMP3, TMP3, RB
| addu NODE:TMP3, TMP3, TMP2
| lw RB, HI(NODE:TMP3)
- | ldc1 f0, 0(NODE:TMP3)
+ | load_double1 0(NODE:TMP3)
| lhu RD, -4+OFS_RD(PC)
| beq RB, TISNIL, <6 // Skip holes in hash part.
|. addiu RC, RC, 1
+ |.if FPU
| ldc1 f2, NODE:TMP3->key
+ |.else
+ | lw SFT3, NODE:TMP3->key.u32.hi
+ | lw SFT4, NODE:TMP3->key.u32.lo
+ |.endif
| lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535)
- | sdc1 f0, 8(RA)
+ | store_double1 8(RA)
| addu RC, RC, TMP0
| decode_RD4b RD
| addu RD, RD, TMP3
- | sdc1 f2, 0(RA)
+ | store_double2 0(RA)
| addu PC, PC, RD
| b <3
|. sw RC, -8+LO(RA) // Update control var.
@@ -3794,9 +4642,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| bnez AT, >7
|. addiu MULTRES, TMP1, 8
|6:
- | ldc1 f0, 0(RC)
+ | load_double1 0(RC)
| addiu RC, RC, 8
- | sdc1 f0, 0(RA)
+ | store_double1 0(RA)
| sltu AT, RC, TMP3
| bnez AT, <6 // More vararg slots?
|. addiu RA, RA, 8
@@ -3852,10 +4700,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| beqz RC, >3
|. subu BASE, TMP2, TMP0
|2:
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
| addiu RA, RA, 8
| addiu RC, RC, -8
- | sdc1 f0, 0(TMP2)
+ | store_double1 0(TMP2)
| bnez RC, <2
|. addiu TMP2, TMP2, 8
|3:
@@ -3896,14 +4744,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| lw INS, -4(PC)
| addiu TMP2, BASE, -8
if (op == BC_RET1) {
- | ldc1 f0, 0(RA)
+ | load_double1 0(RA)
}
| decode_RB8a RB, INS
| decode_RA8a RA, INS
| decode_RB8b RB
| decode_RA8b RA
if (op == BC_RET1) {
- | sdc1 f0, 0(TMP2)
+ | store_double1 0(TMP2)
}
| subu BASE, TMP2, RA
|5:
@@ -3928,6 +4776,45 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
/* -- Loops and branches ------------------------------------------------ */
+ |.macro cmp_res, gt
+ |.if gt == 1
+ |.if FPU
+ | movf TMP1, r0, 0 // f0>f2: TMP1=0
+ | movf TMP2, r0, 1 // f2>f0: TMP2=0
+ |.else
+ | li SFT2, 1
+ | bne CRET1, SFT2, >1
+ |. nop
+ | b >2
+ |. move TMP1, r0
+ |1:
+ | li SFT2, -1
+ | bne CRET1, SFT2, >2
+ |. nop
+ | move TMP2, r0
+ |2:
+ |.endif
+ |.else
+ |.if FPU
+ | movt TMP1, r0, 0 // f0<=f2: TMP1=0
+ | movt TMP2, r0, 1 // f2<=f0: TMP2=0
+ |.else
+ | bltz CRET1, >3 // f02 // f0==f2: TMP1=TMP2=0
+ |. li SFT2, 1
+ | bne SFT2, CRET1, >4 // f0>f2: TMP2=0
+ |. nop
+ | b >4
+ |2:
+ |. move TMP2, r0
+ |3:
+ | move TMP1, r0
+ |4:
+ |.endif
+ |.endif
+ |.endmacro
+
case BC_FORL:
|.if JIT
| hotloop
@@ -3946,12 +4833,26 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
vk = (op == BC_IFORL || op == BC_JFORL);
| addu RA, BASE, RA
if (vk) {
+ |.if FPU
| ldc1 f0, FORL_IDX*8(RA)
| ldc1 f4, FORL_STEP*8(RA)
| ldc1 f2, FORL_STOP*8(RA)
| lw TMP3, FORL_STEP*8+HI(RA)
| add.d f0, f0, f4
| sdc1 f0, FORL_IDX*8(RA)
+ |.else
+ | load_got __adddf3
+ | load_farg1 FORL_IDX*8(RA)
+ | load_farg2 FORL_STEP*8(RA)
+ | call_extern
+ |. sw RD, TEMP_SAVE_1 //save RD
+ | sw CRET1, FORL_IDX*8(RA)
+ | sw CRET2, FORL_IDX*8+4(RA)
+ | load_farg1 FORL_IDX*8(RA)
+ | load_farg2 FORL_STOP*8(RA) // f0 and f2
+ | lw TMP3, FORL_STEP*8+HI(RA)
+ | lw RD, TEMP_SAVE_1
+ |.endif
} else {
| lw TMP1, FORL_IDX*8+HI(RA)
| lw TMP3, FORL_STEP*8+HI(RA)
@@ -3961,25 +4862,41 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| sltiu TMP2, TMP2, LJ_TISNUM
| and TMP1, TMP1, TMP0
| and TMP1, TMP1, TMP2
+ |.if FPU
| ldc1 f0, FORL_IDX*8(RA)
| beqz TMP1, ->vmeta_for
|. ldc1 f2, FORL_STOP*8(RA)
+ |.else
+ | beqz TMP1, ->vmeta_for
+ | load_farg1 FORL_IDX*8(RA)
+ | load_farg2 FORL_STOP*8(RA)
+ |.endif
}
if (op != BC_JFORL) {
| srl RD, RD, 1
| lui TMP0, (-(BCBIAS_J*4 >> 16) & 65535)
}
+ | store_double f0, CARG1, CARG2, FORL_EXT*8(RA)
+ |.if FPU
| c.le.d 0, f0, f2
| c.le.d 1, f2, f0
- | sdc1 f0, FORL_EXT*8(RA)
+ |.else
+ | sw RD, TEMP_SAVE_1
+ | load_got __ledf2 // f0<=f2
+ | call_extern
+ |. sw TMP0, TEMP_SAVE_2
+ | lw TMP0, TEMP_SAVE_2
+ | lw RD, TEMP_SAVE_1
+ | lw TMP3, FORL_STEP*8+HI(RA) // Restored step.
+ |.endif
+ |
if (op == BC_JFORI) {
| li TMP1, 1
| li TMP2, 1
| addu TMP0, RD, TMP0
| slt TMP3, TMP3, r0
- | movf TMP1, r0, 0
+ | cmp_res 1
| addu PC, PC, TMP0
- | movf TMP2, r0, 1
| lhu RD, -4+OFS_RD(PC)
| movn TMP1, TMP2, TMP3
| bnez TMP1, =>BC_JLOOP
@@ -3988,8 +4905,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| li TMP1, 1
| li TMP2, 1
| slt TMP3, TMP3, r0
- | movf TMP1, r0, 0
- | movf TMP2, r0, 1
+ | cmp_res 1
| movn TMP1, TMP2, TMP3
| bnez TMP1, =>BC_JLOOP
|. nop
@@ -3998,11 +4914,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
| slt TMP3, TMP3, r0
| move TMP2, TMP1
if (op == BC_FORI) {
- | movt TMP1, r0, 0
- | movt TMP2, r0, 1
+ | cmp_res 0
} else {
- | movf TMP1, r0, 0
- | movf TMP2, r0, 1
+ | cmp_res 1
}
| movn TMP1, TMP2, TMP3
| addu PC, PC, TMP1
@@ -4256,8 +5170,10 @@ static void emit_asm_debug(BuildCtx *ctx)
fcofs, CFRAME_SIZE);
for (i = 23; i >= 16; i--)
fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 26-i);
+#if !LJ_SOFTFP
for (i = 30; i >= 20; i -= 2)
fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 42-i);
+#endif
fprintf(ctx->fp,
"\t.align 2\n"
".LEFDE0:\n\n");
@@ -4275,6 +5191,7 @@ static void emit_asm_debug(BuildCtx *ctx)
"\t.align 2\n"
".LEFDE1:\n\n", (int)ctx->codesz - fcofs);
#endif
+#if !LJ_NO_UNWIND
fprintf(ctx->fp, "\t.section .eh_frame,\"aw\",@progbits\n");
fprintf(ctx->fp,
"\t.globl lj_err_unwind_dwarf\n"
@@ -4342,6 +5259,7 @@ static void emit_asm_debug(BuildCtx *ctx)
"\t.byte 0xd\n\t.uleb128 0x10\n"
"\t.align 2\n"
".LEFDE3:\n\n", (int)ctx->codesz - fcofs);
+#endif
#endif
break;
default:
diff --git a/lib/luajit/src/vm_x64.dasc b/lib/luajit/src/vm_x64.dasc
index e7e990ae27..bba89aaf1b 100644
--- a/lib/luajit/src/vm_x64.dasc
+++ b/lib/luajit/src/vm_x64.dasc
@@ -531,7 +531,7 @@ static void build_subroutines(BuildCtx *ctx)
| jmp >2
|
|->vm_growstack_v: // Grow stack for vararg Lua function.
- | sub RD, 8
+ | sub RD, 16 // LJ_FR2
| jmp >1
|
|->vm_growstack_f: // Grow stack for fixarg Lua function.
diff --git a/src/core/lib.c b/src/core/lib.c
index e1703f71ee..84951da3bf 100644
--- a/src/core/lib.c
+++ b/src/core/lib.c
@@ -77,3 +77,15 @@ void nop()
{
}
+/* Byte-swap a uint64_t: reverse the order of its eight bytes. */
+uint64_t bswap64 (uint64_t b)
+{
+ return ((((uint64_t) b & (uint64_t) 0x00000000000000ff) << 56) |
+ (((uint64_t) b & (uint64_t) 0x000000000000ff00) << 40) |
+ (((uint64_t) b & (uint64_t) 0x0000000000ff0000) << 24) |
+ (((uint64_t) b & (uint64_t) 0x00000000ff000000) << 8) |
+ (((uint64_t) b & (uint64_t) 0x000000ff00000000) >> 8) |
+ (((uint64_t) b & (uint64_t) 0x0000ff0000000000) >> 24) |
+ (((uint64_t) b & (uint64_t) 0x00ff000000000000) >> 40) |
+ (((uint64_t) b & (uint64_t) 0xff00000000000000) >> 56));
+}
diff --git a/src/core/lib.h b/src/core/lib.h
index 013bdcfecc..616d12a484 100644
--- a/src/core/lib.h
+++ b/src/core/lib.h
@@ -6,3 +6,4 @@ void full_memory_barrier();
void prefetch_for_read(const void *address);
void prefetch_for_write(const void *address);
unsigned int stat_mtime(const char *path);
+uint64_t bswap64 (uint64_t b);
diff --git a/src/core/lib.lua b/src/core/lib.lua
index eccd097dbc..8ecf6f9df8 100644
--- a/src/core/lib.lua
+++ b/src/core/lib.lua
@@ -351,15 +351,19 @@ end
-- avoid C function call overhead while using C.xxxx counterparts
if ffi.abi("be") then
-- big-endian host: values are already in network byte order, nothing to do
+ function htonll(b) return b end
function htonl(b) return b end
function htons(b) return b end
else
+ function htonll(b) return C.bswap64(b) end -- 64-bit swap is delegated to the C helper bswap64
function htonl(b) return bswap(b) end
function htons(b) return rshift(bswap(b), 16) end
end
+ntohll = htonll
ntohl = htonl
ntohs = htons
+
-- Manipulation of bit fields in uint{8,16,32}_t stored in network
-- byte order. Using bit fields in C structs is compiler-dependent
-- and a little awkward for handling endianness and fields that cross
diff --git a/src/dasm.lua b/src/dasm.lua
index 448eab9cf9..acf8587e5f 100644
--- a/src/dasm.lua
+++ b/src/dasm.lua
@@ -1,5 +1,5 @@
---binding to the DynASM encoding engine.
+--Binding to the DynASM encoding engine.
--Written by Cosmin Apreutesei. Public Domain.
local ffi = require'ffi'
diff --git a/src/dasm_proto.h b/src/dasm_proto.h
index 93ca06533c..d3ba39ab1e 100644
--- a/src/dasm_proto.h
+++ b/src/dasm_proto.h
@@ -13,6 +13,8 @@
#define DASM_IDENT "DynASM 1.4.0"
#define DASM_VERSION 10400 /* 1.4.0 */
+#undef DASM_CHECKS
+
#ifndef Dst_DECL
#define Dst_DECL dasm_State **Dst
#endif
@@ -76,7 +78,8 @@ DASM_FDEF int dasm_getpclabel(Dst_DECL, unsigned int pc);
/* Optional sanity checker to call between isolated encoding steps. */
DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch);
#else
-#define dasm_checkstep(a, b) 0
+/*#define dasm_checkstep(a, b) 0*/
+DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch) {return 0;}
#endif
diff --git a/src/dasm_x64.lua b/src/dasm_x64.lua
index 24efbae866..c22ddcfda8 100644
--- a/src/dasm_x64.lua
+++ b/src/dasm_x64.lua
@@ -9,10 +9,11 @@
------------------------------------------------------------------------------
--unload dasm_x86 if it's already loaded.
+if not package then package = {loaded = {}} end --for compat. with minilua
local dasm_x86 = package.loaded.dasm_x86
package.loaded.dasm_x86 = nil
-rawset(_G, 'x64', true) -- Using a global is an ugly, but effective solution.
+x64 = true -- Using a global is an ugly, but effective solution.
local dasm_x64 = require("dasm_x86")
package.loaded.dasm_x86 = dasm_x86 --put it back
diff --git a/src/dasm_x86.c b/src/dasm_x86.c
index 85376ca7ca..59c3bb63c0 100644
--- a/src/dasm_x86.c
+++ b/src/dasm_x86.c
@@ -1,4 +1,10 @@
-#define DASM_CHECKS
+/*
+ Encoding engine to use with dasm.lua.
+
+ Compile with:
+
+ gcc dasm_x86.c -DDASM_CHECKS -shared -s -o dasm_x86.so
+*/
#include "dasm_extern.h"
#include "dasm_proto.h"
diff --git a/src/dasm_x86.h b/src/dasm_x86.h
index 175febe0ca..be9c289f02 100644
--- a/src/dasm_x86.h
+++ b/src/dasm_x86.h
@@ -170,7 +170,7 @@ void dasm_put(Dst_DECL, int start, ...)
dasm_State *D = Dst_REF;
dasm_ActList p = D->actionlist + start;
dasm_Section *sec = D->section;
- int pos = sec->pos, ofs = sec->ofs, mrm = 4;
+ int pos = sec->pos, ofs = sec->ofs, mrm = -1;
int *b;
if (pos >= sec->epos) {
@@ -193,7 +193,7 @@ void dasm_put(Dst_DECL, int start, ...)
b[pos++] = n;
switch (action) {
case DASM_DISP:
- if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; }
+ if (n == 0) { if (mrm < 0) mrm = p[-2]; if ((mrm&7) != 5) break; }
case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob;
case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */
case DASM_IMM_D: ofs += 4; break;
@@ -203,10 +203,17 @@ void dasm_put(Dst_DECL, int start, ...)
case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break;
case DASM_SPACE: p++; ofs += n; break;
case DASM_SETLABEL: b[pos-2] = -0x40000000; break; /* Neg. label ofs. */
- case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG);
- if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue;
+ case DASM_VREG: CK((n&-16) == 0 && (n != 4 || (*p>>5) != 2), RANGE_VREG);
+ if (*p < 0x40 && p[1] == DASM_DISP) mrm = n;
+ if (*p < 0x20 && (n&7) == 4) ofs++;
+ switch ((*p++ >> 3) & 3) {
+ case 3: n |= b[pos-3];
+ case 2: n |= b[pos-2];
+ case 1: if (n <= 7) { b[pos-1] |= 0x10; ofs--; }
+ }
+ continue;
}
- mrm = 4;
+ mrm = -1;
} else {
int *pl, n;
switch (action) {
@@ -393,7 +400,22 @@ int dasm_encode(Dst_DECL, void *buffer)
case DASM_IMM_W: dasmw(n); break;
case DASM_VREG: {
int t = *p++;
- if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3;
+ unsigned char *ex = cp - (t&7);
+ if ((n & 8) && t < 0xa0) {
+ if (*ex & 0x80) ex[1] ^= 0x20 << (t>>6); else *ex ^= 1 << (t>>6);
+ n &= 7;
+ } else if (n & 0x10) {
+ if (*ex & 0x80) {
+ *ex = 0xc5; ex[1] = (ex[1] & 0x80) | ex[2]; ex += 2;
+ }
+ while (++ex < cp) ex[-1] = *ex;
+ if (mark) mark--;
+ cp--;
+ n &= 7;
+ }
+ if (t >= 0xc0) n <<= 4;
+ else if (t >= 0x40) n <<= 3;
+ else if (n == 4 && t < 0x20) { cp[-1] ^= n; *cp++ = 0x20; }
cp[-1] ^= n;
break;
}
diff --git a/src/dasm_x86.lua b/src/dasm_x86.lua
index e7563d477f..0c11f020ec 100644
--- a/src/dasm_x86.lua
+++ b/src/dasm_x86.lua
@@ -44,7 +44,7 @@ local action_names = {
-- int arg, 1 buffer pos:
"DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB",
-- action arg (1 byte), int arg, 1 buffer pos (reg/num):
- "VREG", "SPACE", -- !x64: VREG support NYI.
+ "VREG", "SPACE",
-- ptrdiff_t arg, 1 buffer pos (address): !x64
"SETLABEL", "REL_A",
-- action arg (1 byte) or int arg, 2 buffer pos (link, offset):
@@ -92,6 +92,21 @@ local function init_actionlist()
secpos = 1
end
+-- VREG kind encodings, pre-shifted by 5 bits.
+local map_vreg = {
+ ["modrm.rm.m"] = 0x00,
+ ["modrm.rm.r"] = 0x20,
+ ["opcode"] = 0x20,
+ ["sib.base"] = 0x20,
+ ["sib.index"] = 0x40,
+ ["modrm.reg"] = 0x80,
+ ["vex.v"] = 0xa0,
+ ["imm.hi"] = 0xc0,
+}
+
+-- Current number of VREG actions contributing to REX/VEX shrinkage.
+local vreg_shrink_count = 0
+
------------------------------------------------------------------------------
-- Compute action numbers for action names.
@@ -151,6 +166,21 @@ local function waction(action, a, num)
if a or num then secpos = secpos + (num or 1) end
end
+-- Optionally add a VREG action followed by its kind byte.
+-- kind: key into map_vreg selecting the pre-shifted kind code.
+-- vreg: argument handed to the VREG action; nil/false makes this a no-op.
+-- psz: value added to the emitted kind byte (defaults to 0).
+-- sk: kinds whose code is below this value bump vreg_shrink_count (defaults to 0).
+-- defer: when truthy, the accumulated count is kept for a later wvreg call
+-- instead of being folded (times 8) into this kind byte and reset.
+local function wvreg(kind, vreg, psz, sk, defer)
+ if not vreg then return end
+ waction("VREG", vreg)
+ -- Report the offending kind (not the vreg expression) on a bad lookup.
+ local b = assert(map_vreg[kind], "bad vreg kind `"..tostring(kind).."'")
+ if b < (sk or 0) then
+ vreg_shrink_count = vreg_shrink_count + 1
+ end
+ if not defer then
+ b = b + vreg_shrink_count * 8
+ vreg_shrink_count = 0
+ end
+ wputxb(b + (psz or 0))
+end
+
-- Add call to embedded DynASM C code.
local function wcall(func, args)
if luamode then
@@ -390,6 +420,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"})
mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"})
map_reg_valid_index[map_archdef.esp] = false
if x64 then map_reg_valid_index[map_archdef.rsp] = false end
+if x64 then map_reg_needrex[map_archdef.Rb] = true end
map_archdef["Ra"] = "@"..addrsize
-- FP registers (internally tword sized, but use "f" as operand size).
@@ -527,16 +558,24 @@ local function wputszarg(sz, n)
end
-- Put multi-byte opcode with operand-size dependent modifications.
-local function wputop(sz, op, rex, vex)
+local function wputop(sz, op, rex, vex, vregr, vregxb)
+ local psz, sk = 0, nil
if vex then
local tail
if vex.m == 1 and band(rex, 11) == 0 then
- wputb(0xc5)
+ if x64 and vregxb then
+ sk = map_vreg["modrm.reg"]
+ else
+ wputb(0xc5)
tail = shl(bxor(band(rex, 4), 4), 5)
- else
+ psz = 3
+ end
+ end
+ if not tail then
wputb(0xc4)
wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m)
tail = shl(band(rex, 8), 4)
+ psz = 4
end
local reg, vreg = 0, nil
if vex.v then
@@ -546,12 +585,18 @@ local function wputop(sz, op, rex, vex)
end
if sz == "y" or vex.l then tail = tail + 4 end
wputb(tail + shl(bxor(reg, 15), 3) + vex.p)
- if vreg then waction("VREG", vreg); wputxb(4) end
+ wvreg("vex.v", vreg)
rex = 0
if op >= 256 then werror("bad vex opcode") end
+ else
+ if rex ~= 0 then
+ if not x64 then werror("bad operand size") end
+ elseif (vregr or vregxb) and x64 then
+ rex = 0x10
+ sk = map_vreg["vex.v"]
+ end
end
local r
- if rex ~= 0 and not x64 then werror("bad operand size") end
if sz == "w" then wputb(102) end
-- Needs >32 bit numbers, but only for crc32 eax, word [ebx]
if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end
@@ -560,20 +605,20 @@ local function wputop(sz, op, rex, vex)
if rex ~= 0 then
local opc3 = band(op, 0xffff00)
if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then
- wputb(64 + band(rex, 15)); rex = 0
+ wputb(64 + band(rex, 15)); rex = 0; psz = 2
end
end
- wputb(shr(op, 16)); op = band(op, 0xffff)
+ wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1
end
if op >= 256 then
local b = shr(op, 8)
- if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end
- wputb(b)
- op = band(op, 255)
+ if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end
+ wputb(b); op = band(op, 255); psz = psz + 1
end
- if rex ~= 0 then wputb(64 + band(rex, 15)) end
+ if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end
if sz == "b" then op = op - 1 end
wputb(op)
+ return psz, sk
end
-- Put ModRM or SIB formatted byte.
@@ -583,7 +628,7 @@ local function wputmodrm(m, s, rm, vs, vrm)
end
-- Put ModRM/SIB plus optional displacement.
-local function wputmrmsib(t, imark, s, vsreg)
+local function wputmrmsib(t, imark, s, vsreg, psz, sk)
local vreg, vxreg
local reg, xreg = t.reg, t.xreg
if reg and reg < 0 then reg = 0; vreg = t.vreg end
@@ -593,8 +638,8 @@ local function wputmrmsib(t, imark, s, vsreg)
-- Register mode.
if sub(t.mode, 1, 1) == "r" then
wputmodrm(3, s, reg)
- if vsreg then waction("VREG", vsreg); wputxb(2) end
- if vreg then waction("VREG", vreg); wputxb(0) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+ wvreg("modrm.rm.r", vreg, psz+1, sk)
return
end
@@ -608,21 +653,22 @@ local function wputmrmsib(t, imark, s, vsreg)
-- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp)
wputmodrm(0, s, 4)
if imark == "I" then waction("MARK") end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vxreg)
wputmodrm(t.xsc, xreg, 5)
- if vxreg then waction("VREG", vxreg); wputxb(3) end
+ wvreg("sib.index", vxreg, psz+2, sk)
else
-- Pure 32 bit displacement.
if x64 and tdisp ~= "table" then
wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp)
+ wvreg("modrm.reg", vsreg, psz+1, sk)
if imark == "I" then waction("MARK") end
wputmodrm(0, 4, 5)
else
riprel = x64
wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp)
+ wvreg("modrm.reg", vsreg, psz+1, sk)
if imark == "I" then waction("MARK") end
end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
end
if riprel then -- Emit rip-relative displacement.
if match("UWSiI", imark) then
@@ -650,16 +696,16 @@ local function wputmrmsib(t, imark, s, vsreg)
if xreg or band(reg, 7) == 4 then
wputmodrm(m or 2, s, 4) -- ModRM.
if m == nil or imark == "I" then waction("MARK") end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg)
wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB.
- if vxreg then waction("VREG", vxreg); wputxb(3) end
- if vreg then waction("VREG", vreg); wputxb(1) end
+ wvreg("sib.index", vxreg, psz+2, sk, vreg)
+ wvreg("sib.base", vreg, psz+2, sk)
else
wputmodrm(m or 2, s, reg) -- ModRM.
if (imark == "I" and (m == 1 or m == 2)) or
(m == nil and (vsreg or vreg)) then waction("MARK") end
- if vsreg then waction("VREG", vsreg); wputxb(2) end
- if vreg then waction("VREG", vreg); wputxb(1) end
+ wvreg("modrm.reg", vsreg, psz+1, sk, vreg)
+ wvreg("modrm.rm.m", vreg, psz+1, sk)
end
-- Put displacement.
@@ -1184,7 +1230,7 @@ local map_op = {
shrd_3 = "mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:",
rdtsc_0 = "0F31", -- P1+
- rdpmc_0 = "0F33",
+ rdpmc_0 = "0F33", -- P6+
cpuid_0 = "0FA2", -- P1+
-- floating point ops
@@ -1327,46 +1373,14 @@ local map_op = {
movups_2 = "rmo:0F10rM|mro:0F11Rm",
orpd_2 = "rmo:660F56rM",
orps_2 = "rmo:0F56rM",
- packssdw_2 = "rmo:660F6BrM",
- packsswb_2 = "rmo:660F63rM",
- packuswb_2 = "rmo:660F67rM",
- paddb_2 = "rmo:660FFCrM",
- paddd_2 = "rmo:660FFErM",
- paddq_2 = "rmo:660FD4rM",
- paddsb_2 = "rmo:660FECrM",
- paddsw_2 = "rmo:660FEDrM",
- paddusb_2 = "rmo:660FDCrM",
- paddusw_2 = "rmo:660FDDrM",
- paddw_2 = "rmo:660FFDrM",
- pand_2 = "rmo:660FDBrM",
- pandn_2 = "rmo:660FDFrM",
pause_0 = "F390",
- pavgb_2 = "rmo:660FE0rM",
- pavgw_2 = "rmo:660FE3rM",
- pcmpeqb_2 = "rmo:660F74rM",
- pcmpeqd_2 = "rmo:660F76rM",
- pcmpeqw_2 = "rmo:660F75rM",
- pcmpgtb_2 = "rmo:660F64rM",
- pcmpgtd_2 = "rmo:660F66rM",
- pcmpgtw_2 = "rmo:660F65rM",
pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only.
pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:",
- pmaddwd_2 = "rmo:660FF5rM",
- pmaxsw_2 = "rmo:660FEErM",
- pmaxub_2 = "rmo:660FDErM",
- pminsw_2 = "rmo:660FEArM",
- pminub_2 = "rmo:660FDArM",
pmovmskb_2 = "rr/do:660FD7rM",
- pmulhuw_2 = "rmo:660FE4rM",
- pmulhw_2 = "rmo:660FE5rM",
- pmullw_2 = "rmo:660FD5rM",
- pmuludq_2 = "rmo:660FF4rM",
- por_2 = "rmo:660FEBrM",
prefetchnta_1 = "xb:n0F180m",
prefetcht0_1 = "xb:n0F181m",
prefetcht1_1 = "xb:n0F182m",
prefetcht2_1 = "xb:n0F183m",
- psadbw_2 = "rmo:660FF6rM",
pshufd_3 = "rmio:660F70rMU",
pshufhw_3 = "rmio:F30F70rMU",
pshuflw_3 = "rmio:F20F70rMU",
@@ -1380,23 +1394,6 @@ local map_op = {
psrldq_2 = "rio:660F733mU",
psrlq_2 = "rmo:660FD3rM|rio:660F732mU",
psrlw_2 = "rmo:660FD1rM|rio:660F712mU",
- psubb_2 = "rmo:660FF8rM",
- psubd_2 = "rmo:660FFArM",
- psubq_2 = "rmo:660FFBrM",
- psubsb_2 = "rmo:660FE8rM",
- psubsw_2 = "rmo:660FE9rM",
- psubusb_2 = "rmo:660FD8rM",
- psubusw_2 = "rmo:660FD9rM",
- psubw_2 = "rmo:660FF9rM",
- punpckhbw_2 = "rmo:660F68rM",
- punpckhdq_2 = "rmo:660F6ArM",
- punpckhqdq_2 = "rmo:660F6DrM",
- punpckhwd_2 = "rmo:660F69rM",
- punpcklbw_2 = "rmo:660F60rM",
- punpckldq_2 = "rmo:660F62rM",
- punpcklqdq_2 = "rmo:660F6CrM",
- punpcklwd_2 = "rmo:660F61rM",
- pxor_2 = "rmo:660FEFrM",
rcpps_2 = "rmo:0F53rM",
rcpss_2 = "rro:F30F53rM|rx/od:",
rsqrtps_2 = "rmo:0F52rM",
@@ -1640,6 +1637,12 @@ local map_op = {
-- AVX, AVX2 integer ops
-- In general, xmm requires AVX, ymm requires AVX2.
+ vaesdec_3 = "rrmo:660F38VDErM",
+ vaesdeclast_3 = "rrmo:660F38VDFrM",
+ vaesenc_3 = "rrmo:660F38VDCrM",
+ vaesenclast_3 = "rrmo:660F38VDDrM",
+ vaesimc_2 = "rmo:660F38uDBrM",
+ vaeskeygenassist_3 = "rmio:660F3AuDFrMU",
vlddqu_2 = "rxoy:F20FuF0rM",
vmaskmovdqu_2 = "rro:660FuF7rM",
vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm",
@@ -1880,10 +1883,11 @@ local function dopattern(pat, args, sz, op, needrex)
if t.xreg and t.xreg > 7 then rex = rex + 2 end
if s > 7 then rex = rex + 4 end
if needrex then rex = rex + 16 end
- wputop(szov, opcode, rex, vex); opcode = nil
+ local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg)
+ opcode = nil
local imark = sub(pat, -1) -- Force a mark (ugly).
-- Put ModRM/SIB with regno/last digit as spare.
- wputmrmsib(t, imark, s, addin and addin.vreg)
+ wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk)
addin = nil
elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix
local b = band(opcode, 255); opcode = shr(opcode, 8)
@@ -1910,8 +1914,8 @@ local function dopattern(pat, args, sz, op, needrex)
if szov == "q" and rex == 0 then rex = rex + 8 end
if needrex then rex = rex + 16 end
if addin and addin.reg == -1 then
- wputop(szov, opcode - 7, rex, vex)
- waction("VREG", addin.vreg); wputxb(0)
+ local psz, sk = wputop(szov, opcode - 7, rex, vex, true)
+ wvreg("opcode", addin.vreg, psz, sk)
else
if addin and addin.reg > 7 then rex = rex + 1 end
wputop(szov, opcode, rex, vex)
@@ -1955,7 +1959,7 @@ local function dopattern(pat, args, sz, op, needrex)
local reg = a.reg
if reg < 0 then
wputb(0)
- waction("VREG", a.vreg); wputxb(5)
+ wvreg("imm.hi", a.vreg)
else
wputb(shl(reg, 4))
end
@@ -2107,8 +2111,8 @@ if x64 then
rex = a.reg > 7 and 9 or 8
end
end
- wputop(sz, opcode, rex)
- if vreg then waction("VREG", vreg); wputxb(0) end
+ local psz, sk = wputop(sz, opcode, rex, nil, vreg)
+ wvreg("opcode", vreg, psz, sk)
if luamode then
waction("IMM_D", format("ffi.cast(\"uintptr_t\", %s) %% 2^32", op64))
waction("IMM_D", format("ffi.cast(\"uintptr_t\", %s) / 2^32", op64))
diff --git a/src/dynasm.lua b/src/dynasm.lua
index 10d93c0f8f..586e2a13dd 100644
--- a/src/dynasm.lua
+++ b/src/dynasm.lua
@@ -1141,14 +1141,13 @@ local function setlang(infile)
g_opt.comment = "--|"
g_opt.endcomment = ""
end
+ -- Set initial defines only available in Lua mode.
+ local ffi = require("ffi")
+ map_def.ARCH = ffi.arch --for `.arch ARCH`
+ map_def[upper(ffi.arch)] = 1 --for `.if X86 ...`
+ map_def.OS = ffi.os --for `.if OS == 'Windows'`
+ map_def[upper(ffi.os)] = 1 --for `.if WINDOWS ...`
end
-
- -- Set initial defines only available in Lua mode.
- local ffi = require'ffi'
- map_def.ARCH = ffi.arch --for `.arch ARCH`
- map_def[upper(ffi.arch)] = 1 --for `.if X86 ...`
- map_def.OS = ffi.os --for `.if OS == 'Windows'`
- map_def[upper(ffi.os)] = 1 --for `.if WINDOWS ...`
end
-- Parse arguments.
diff --git a/src/lib/ipsec/.images/esp.png b/src/lib/ipsec/.images/esp.png
new file mode 100644
index 0000000000..09c165442b
Binary files /dev/null and b/src/lib/ipsec/.images/esp.png differ
diff --git a/src/lib/ipsec/README.md b/src/lib/ipsec/README.md
new file mode 100644
index 0000000000..a4a3f6eac0
--- /dev/null
+++ b/src/lib/ipsec/README.md
@@ -0,0 +1,45 @@
+### IPsec/ESP (lib.ipsec.esp)
+
+The `lib.ipsec.esp` module contains two classes `esp_v6_encrypt` and
+`esp_v6_decrypt` which implement packet encryption and
+decryption with IPsec ESP using the AES-GCM-128 cipher in IPv6 transport
+mode. Packets are encrypted with the key and salt provided to the classes'
+constructors. These classes do not implement any key exchange protocol.
+
+The encrypt class accepts IPv6 packets and inserts a new [ESP
+header](https://en.wikipedia.org/wiki/IPsec#Encapsulating_Security_Payload)
+between the outer IPv6 header and the inner protocol header (e.g. TCP,
+UDP, L2TPv3) and also encrypts the contents of the inner protocol
+header. The decrypt class does the reverse: it decrypts the inner
+protocol header and removes the ESP protocol header.
+
+References:
+
+- [IPsec Wikipedia page](https://en.wikipedia.org/wiki/IPsec).
+- [RFC 4106](https://tools.ietf.org/html/rfc4106) on using AES-GCM with IPsec ESP.
+- [LISP Data-Plane Confidentiality](https://tools.ietf.org/html/draft-ietf-lisp-crypto-02) example of a software layer above these apps that includes key exchange.
+
+— Method **esp_v6_encrypt:new** *config*
+
+— Method **esp_v6_decrypt:new** *config*
+
+Returns a new encryption or decryption context, respectively. *Config* must
+be a table with the following keys:
+
+* `mode` - Encryption mode (string). The only accepted value is the
+ string `"aes-128-gcm"`.
+* `keymat` - Hex string containing 16 bytes of key material as specified
+ in RFC 4106.
+* `salt` - Hex string containing four bytes of salt as specified in
+ RFC 4106.
+
+— Method **esp_v6_encrypt:encapsulate** *packet*
+
+Returns a freshly allocated packet that is the encrypted and encapsulated
+version of *packet*.
+
+— Method **esp_v6_decrypt:decapsulate** *packet*
+
+Returns a freshly allocated packet that is the decrypted and decapsulated
+version of *packet* or `nil` if authentication failed. The contents of
+*packet* are destroyed in the process.
diff --git a/src/lib/ipsec/aes_128_gcm.lua b/src/lib/ipsec/aes_128_gcm.lua
new file mode 100644
index 0000000000..e9677c4f04
--- /dev/null
+++ b/src/lib/ipsec/aes_128_gcm.lua
@@ -0,0 +1,104 @@
+module(..., package.seeall)
+local ffi = require("ffi")
+local C = ffi.C
+local ASM = require("lib.ipsec.aes_128_gcm_avx")
+local header = require("lib.protocol.header")
+local lib = require("core.lib")
+local ntohl, htonl, htonll = lib.ntohl, lib.htonl, lib.htonll
+
+
+-- IV pseudo header: 4-byte salt, 64-bit IV and a trailing 32-bit word, packed and 16-byte aligned.
+
+local iv = subClass(header)
+
+-- Class variables: header name and the single ctype registered via init().
+iv._name = "iv"
+iv:init(
+ {
+ [1] = ffi.typeof[[
+ struct {
+ uint8_t salt[4];
+ uint64_t iv;
+ uint32_t padding;
+ } __attribute__((packed, aligned(16)))
+ ]]
+ })
+
+-- Class methods
+
+function iv:new (salt) -- Create an IV pseudo header initialised with the given salt.
+ local o = iv:superClass().new(self)
+ local h = o:header()
+ o:salt(salt)
+ h.padding = htonl(0x1) -- Trailing word holds 1 in network byte order; presumably GCM's initial block counter (TODO confirm against the ASM routines).
+ return o
+end
+
+-- Instance methods
+
+function iv:salt (salt) -- Combined setter/getter for the salt field.
+ local h = self:header()
+ if salt ~= nil then
+ ffi.copy(h.salt, salt, 4) -- Setter: copies exactly 4 bytes from the argument; returns nothing.
+ else
+ return h.salt -- Getter: hands back the uint8_t[4] field of the header.
+ end
+end
+
+function iv:iv (iv) -- Combined setter/getter for the 64-bit IV field; the parameter shadows the iv class inside this function.
+ local h = self:header()
+ if iv ~= nil then
+ h.iv = htonll(iv) -- Setter: value is passed through htonll before being stored.
+ else
+ return self:header_ptr()+4, 8 -- Getter: pointer plus length 8. NOTE(review): +4 is scaled by the pointee size of header_ptr(); it only skips the 4-byte salt if that is a byte pointer -- verify.
+ end
+end
+
+
+-- AES-128-GCM wrapper
+
+local function u8_ptr (ptr) return ffi.cast("uint8_t *", ptr) end -- Cast ptr to uint8_t * for the ASM entry points.
+
+local aes_128_gcm = {}
+
+function aes_128_gcm:new (keymat, salt) -- Build a cipher context from hex-encoded key material and salt.
+ assert(keymat and #keymat == 32, "Need 16 bytes of key material.") -- Length is checked in hex digits, two per byte.
+ assert(salt and #salt == 8, "Need 4 bytes of salt.") -- Likewise: 8 hex digits.
+ local o = {}
+ o.keymat = ffi.new("uint8_t[16]")
+ ffi.copy(o.keymat, lib.hexundump(keymat, 16), 16)
+ o.iv = iv:new(lib.hexundump(salt, 4))
+ -- Expand the key into gcm_data, then run the GCM precomputation (hash subkey H).
+ o.hash_subkey = ffi.new("uint8_t[?] __attribute__((aligned(16)))", 128)
+ o.gcm_data = ffi.new("gcm_data[1] __attribute__((aligned(16)))")
+ ASM.aes_keyexp_128_enc_avx(o.keymat, o.gcm_data[0].expanded_keys)
+ ASM.aesni_gcm_precomp_avx_gen4(o.gcm_data, o.hash_subkey)
+ o.blocksize = 128 -- NOTE(review): unit (bits vs. bytes) is not evident from this file -- confirm with callers.
+ o.auth_size = 16
+ o.auth_buf = ffi.new("uint8_t[?]", o.auth_size) -- Scratch buffer that decrypt() fills with the computed tag.
+ o.aad_size = 16
+ return setmetatable(o, {__index=aes_128_gcm})
+end
+
+function aes_128_gcm:encrypt (out_ptr, payload, length, esp) -- Run the ASM GCM encryption over length bytes of payload; returns nothing.
+ self.iv:iv(esp:seq_no()) -- The ESP sequence number becomes the IV field of the pseudo header.
+ ASM.aesni_gcm_enc_avx_gen4(self.gcm_data,
+ out_ptr,
+ payload, length,
+ u8_ptr(self.iv:header_ptr()),
+ u8_ptr(esp:header_ptr()), esp:sizeof(), -- ESP header passed with its size; presumably the additional authenticated data.
+ payload + length, self.auth_size) -- NOTE(review): tag pointer is payload + length, not out_ptr + length; presumably the tag destination, so the input buffer needs auth_size spare bytes -- confirm.
+end
+
+function aes_128_gcm:decrypt (out_ptr, ciphertext, length, esp) -- Run the ASM GCM decryption; returns true iff the computed tag matches the one after the ciphertext.
+ self.iv:iv(esp:seq_no()) -- The ESP sequence number becomes the IV field of the pseudo header.
+ ASM.aesni_gcm_dec_avx_gen4(self.gcm_data,
+ out_ptr,
+ ciphertext, length,
+ u8_ptr(self.iv:header_ptr()),
+ u8_ptr(esp:header_ptr()), esp:sizeof(),
+ self.auth_buf, self.auth_size) -- Tag is produced into self.auth_buf; the ASM call runs before the tag is checked.
+ return C.memcmp(self.auth_buf, ciphertext + length, self.auth_size) == 0 -- NOTE(review): memcmp is not guaranteed constant-time; consider a constant-time tag comparison.
+end
+
+return aes_128_gcm
diff --git a/src/lib/ipsec/aes_128_gcm_avx.dasl b/src/lib/ipsec/aes_128_gcm_avx.dasl
new file mode 100644
index 0000000000..bd341a7b63
--- /dev/null
+++ b/src/lib/ipsec/aes_128_gcm_avx.dasl
@@ -0,0 +1,498 @@
+-- Selected AES GCM routines, based heavily on the Intel IPsec code from:
+-- https://github.com/lukego/intel-ipsec/blob/master/code/avx2/gcm_avx_gen4.asm
+-- https://github.com/lukego/intel-ipsec/blob/master/code/gcm_defines.asm
+-- https://github.com/lukego/intel-ipsec/blob/master/code/aes_keyexp_128.asm
+
+local dasm = require("dasm")
+local ffi = require("ffi")
+
+-- Per-key state shared by the routines below. The assembly addresses it
+-- by byte offset from arg1: the eleven 16-byte round keys occupy offsets
+-- 16*0 .. 16*10, and shifted_hkey_1 .. shifted_hkey_8 occupy offsets
+-- 16*11 .. 16*18 (written by precompute, read by the GHASH code). The
+-- shifted_hkey_N_k fields are not referenced by any routine in this file.
+ffi.cdef[[
+typedef struct gcm_data
+{
+ uint8_t expanded_keys[16*11];
+ uint8_t shifted_hkey_1[16];
+ uint8_t shifted_hkey_2[16];
+ uint8_t shifted_hkey_3[16];
+ uint8_t shifted_hkey_4[16];
+ uint8_t shifted_hkey_5[16];
+ uint8_t shifted_hkey_6[16];
+ uint8_t shifted_hkey_7[16];
+ uint8_t shifted_hkey_8[16];
+ uint8_t shifted_hkey_1_k[16];
+ uint8_t shifted_hkey_2_k[16];
+ uint8_t shifted_hkey_3_k[16];
+ uint8_t shifted_hkey_4_k[16];
+ uint8_t shifted_hkey_5_k[16];
+ uint8_t shifted_hkey_6_k[16];
+ uint8_t shifted_hkey_7_k[16];
+ uint8_t shifted_hkey_8_k[16];
+} gcm_data;
+]]
+
+-- DynASM setup: target architecture, action list and global label names.
+|.arch x64
+|.actionlist actions
+|.globalnames globalnames
+
+-- Argument aliases. arg1..arg6 are the six integer argument registers of
+-- the System V x86-64 calling convention. arg7..arg9 are read from the
+-- caller's stack relative to r14, which prologue() sets to rsp right
+-- after pushing four registers: 32 bytes of saved registers, 8 bytes of
+-- return address, then the stack arguments. They are therefore only valid
+-- inside routines that run prologue().
+|.define arg1, rdi
+|.define arg2, rsi
+|.define arg3, rdx
+|.define arg4, rcx
+|.define arg5, r8
+|.define arg6, r9
+|.define arg7, [r14 + 32 + 8*1]
+|.define arg8, [r14 + 32 + 8*2]
+|.define arg9, [r14 + 32 + 8*3]
+
+-- Emit the tail of a GHASH multiplication: fold the double-width product
+-- held in xmm(t1) (high half) and xmm(gh) (low half) down to 128 bits,
+-- leaving the result in xmm(gh). The instruction sequence follows the
+-- two-phase reduction with the poly2 constant from the Intel reference
+-- code cited at the top of this file.
+-- Clobbers xmm(t2) and xmm(t3); xmm(t1) is only read.
+local function ghash_tail(Dst, gh, t1, t2, t3)
+ | vmovdqa xmm(t3), [->poly2]
+ -- First phase.
+ | vpclmulqdq xmm(t2), xmm(t3), xmm(gh), 0x01; vpslldq xmm(t2), xmm(t2), 8; vpxor xmm(gh), xmm(gh), xmm(t2)
+ -- Second phase.
+ | vpclmulqdq xmm(t2), xmm(t3), xmm(gh), 0x00; vpsrldq xmm(t2), xmm(t2), 4
+ | vpclmulqdq xmm(gh), xmm(t3), xmm(gh), 0x10; vpslldq xmm(gh), xmm(gh), 4; vpxor xmm(gh), xmm(gh), xmm(t2)
+ -- Combine with the high half.
+ | vpxor xmm(gh), xmm(gh), xmm(t1)
+end
+
+-- Emit a GHASH multiplication: xmm(gh) = xmm(gh) * xmm(hk), reduced by
+-- ghash_tail. Four carry-less multiplies produce the partial products
+-- (high*high in t1, low*low in t2, the two cross terms summed in gh);
+-- the cross-term sum is then split across the high and low halves.
+-- xmm(hk) is left unchanged; xmm(t1), xmm(t2), xmm(t3) are clobbered.
+local function ghash_mul(Dst, gh, hk, t1, t2, t3)
+ | vpclmulqdq xmm(t1), xmm(gh), xmm(hk), 0x11
+ | vpclmulqdq xmm(t2), xmm(gh), xmm(hk), 0x00
+ | vpclmulqdq xmm(t3), xmm(gh), xmm(hk), 0x01
+ | vpclmulqdq xmm(gh), xmm(gh), xmm(hk), 0x10
+ | vpxor xmm(gh), xmm(gh), xmm(t3)
+
+ -- Distribute the middle term: its upper 8 bytes go into the high half
+ -- (t1), its lower 8 bytes into the low half (gh).
+ | vpsrldq xmm(t3), xmm(gh), 8
+ | vpslldq xmm(gh), xmm(gh), 8
+ | vpxor xmm(t1), xmm(t1), xmm(t3)
+ | vpxor xmm(gh), xmm(gh), xmm(t2)
+ || ghash_tail(Dst, gh, t1, t2, t3)
+end
+
+-- Emit counter generation plus AES rounds 0..9 for the blocks held in
+-- xmm(initial) .. xmm8; the final round (vaesenclast) is left to the
+-- caller.
+--   initial:      first xmm register to fill (9 means "no blocks").
+--   ctr:          xmm register holding the running counter; updated to
+--                 the last counter value generated.
+--   t_key:        scratch xmm register for the current round key.
+--   operation:    accepted but not used by this function.
+--   loop_idx:     "in_order" adds [->one] per block and byte-shuffles
+--                 each counter block afterwards; any other value adds
+--                 [->onef] and skips the shuffle.
+--   before_round: Lua callback invoked as before_round(j) ahead of the
+--                 code for round j (j = 1..9) and once more with 10 at
+--                 the end, so callers can interleave other instructions.
+local function almost_encrypt_8(Dst, initial, ctr, t_key, operation, loop_idx, before_round)
+ local prev = ctr
+ -- Each block's counter is the previous one plus one.
+ for i = initial, 8 do
+ if loop_idx == "in_order" then
+ | vpaddd xmm(i), xmm(prev), [->one]
+ else
+ | vpaddd xmm(i), xmm(prev), [->onef]
+ end
+ prev = i
+ end
+ -- Remember the last counter, unless no block was generated at all.
+ if prev ~= ctr then
+ | vmovdqa xmm(ctr), xmm(prev)
+ end
+ if loop_idx == "in_order" then
+ for i = initial, 8 do
+ | vpshufb xmm(i), xmm(i), [->shuf_mask]
+ end
+ end
+
+ -- Round 0: XOR with the first round key.
+ | vmovdqa xmm(t_key), [arg1+16*0]
+ for i = initial, 8 do
+ | vpxor xmm(i), xmm(i), xmm(t_key)
+ end
+ -- Rounds 1..9, applied to all blocks per round key.
+ for j = 1, 9 do
+ before_round(j)
+ | vmovdqa xmm(t_key), [arg1+16*j]
+ for i = initial, 8 do
+ | vaesenc xmm(i), xmm(i), xmm(t_key)
+ end
+ end
+ before_round(10)
+end
+
+-- Emit full counter-mode processing of the blocks in xmm(initial)..xmm8:
+-- generate and encrypt the counter blocks, XOR them with the input at
+-- [arg3 + r11] and store the result at [arg2 + r11].
+-- Afterwards each xmm(i) holds the byte-shuffled ciphertext block: the
+-- freshly produced output for "enc", the original input for "dec".
+-- r11 (the data offset) is advanced by 16 bytes per processed block.
+--   t: scratch xmm register; ctr / t_key: see almost_encrypt_8.
+local function encrypt_8(Dst, initial, t, ctr, t_key, operation)
+ almost_encrypt_8(Dst, initial, ctr, t_key, operation, "in_order", function() end)
+
+ -- Final AES round with round key 10.
+ | vmovdqa xmm(t_key), [arg1+16*10]
+ for i = initial, 8 do
+ | vaesenclast xmm(i), xmm(i), xmm(t_key)
+ end
+
+ for i = initial, 8 do
+ | vmovdqu xmm(t), [arg3 + r11 + 16*(i-initial)]
+ | vpxor xmm(i), xmm(i), xmm(t)
+ | vmovdqu [arg2 + r11 + 16*(i-initial)], xmm(i)
+ if operation == "dec" then
+ -- When decrypting, the input (not the output) is what gets hashed.
+ | vmovdqa xmm(i), xmm(t)
+ end
+ | vpshufb xmm(i), xmm(i), [->shuf_mask]
+ end
+ | add r11, (9-initial)*16
+end
+
+-- Emit the start of a GCM operation: absorb the additional authenticated
+-- data (AAD), process the first num_initial_blocks (0..7) data blocks and,
+-- if at least 128 bytes of whole blocks are present (r13 >= 128),
+-- encrypt the next eight blocks ahead of the main loop.
+--   t:     table of four scratch xmm register numbers.
+--   ctr:   xmm register that receives the counter block loaded from arg5.
+--   t_key: scratch xmm register for round keys.
+-- Register use in the emitted code: arg6 = AAD pointer, arg7 = AAD
+-- length, r11 = data offset (reset to 0 here), r13 = byte count of whole
+-- blocks (set by the caller). On exit xmm(t[2]) holds the hash key loaded
+-- from [arg1 + 16*11], and xmm8, xmm(t[3]) and [rsp] all hold the running
+-- hash.
+local function initial_blocks(Dst, num_initial_blocks, t, ctr, t_key, operation)
+ -- xmm(i) accumulates the AAD block; the data blocks follow in
+ -- xmm(i+1) .. xmm8.
+ local i = 8 - num_initial_blocks
+ | mov r10, arg6
+ | mov r12, arg7
+ | mov r11, r12
+
+ | vpxor xmm(i), xmm(i), xmm(i)
+ -- Read the AAD four bytes at a time, shifting each dword in from the top.
+ |1:
+ | vmovd xmm(t[1]), dword [r10]
+ | vpslldq xmm(t[1]), xmm(t[1]), 12
+ | vpsrldq xmm(i), xmm(i), 4
+ | vpxor xmm(i), xmm(i), xmm(t[1])
+ | add r10, 4
+ | sub r12, 4
+ | jg <1
+ -- Unless the AAD was exactly 16 bytes, keep shifting right by four
+ -- bytes per step until the AAD bytes sit at the bottom of the register.
+ | cmp r11, 16
+ | je >3
+ | mov r12, 16
+ |2:
+ | vpsrldq xmm(i), xmm(i), 4
+ | sub r12, 4
+ | cmp r12, r11
+ | jg <2
+ |3:
+
+ | vpshufb xmm(i), xmm(i), [->shuf_mask]
+ -- Start at data offset 0 and load the initial counter block.
+ | xor r11, r11
+ | mov rax, arg5
+ | vmovdqu xmm(ctr), [rax]
+ | vpshufb xmm(ctr), xmm(ctr), [->shuf_mask]
+ || encrypt_8(Dst, 9-num_initial_blocks, t[1], ctr, t_key, operation)
+
+ -- Chain the GHASH over the AAD block and the initial blocks:
+ -- each register is XORed with the previous result, then multiplied by
+ -- the hash key. The final result ends up in xmm8.
+ local prev
+ | vmovdqu xmm(t[2]), [arg1 + 16*11]
+ for j = 8-num_initial_blocks, 8 do
+ if prev then
+ | vpxor xmm(j), xmm(j), xmm(prev)
+ end
+ ghash_mul(Dst, j, t[2], t[1], t[3], t[4])
+ prev = j
+ end
+
+ -- Save the running hash; encrypt_8 below overwrites xmm1..xmm8.
+ | vmovdqa [rsp], xmm8
+ | vmovdqa xmm(t[3]), xmm8
+ | cmp r13, 128
+ | jl >9
+ || encrypt_8(Dst, 1, t[1], ctr, t_key, operation)
+ -- Fold the running hash into the first of the eight new blocks.
+ | vpxor xmm1, xmm1, [rsp]
+ |9:
+end
+
+-- Emit a carry-less multiply of xmm(qdq1) and xmm(qdq2) with selector
+-- qdqI. Without `xor' the product is written to xmm(out). With `xor' (a
+-- scratch register number) the product is written to xmm(xor) and then
+-- XORed into xmm(out), i.e. xmm(out) accumulates.
+local function mulqdqxor(Dst, out, qdq1, qdq2, qdqI, xor)
+ | vpclmulqdq xmm(xor or out), xmm(qdq1), xmm(qdq2), qdqI
+ if xor then
+ | vpxor xmm(out), xmm(out), xmm(xor)
+ end
+end
+
+-- Emit one iteration of the main loop: encrypt eight new counter blocks
+-- while hashing the eight ciphertext blocks currently held in xmm1..xmm8,
+-- with the GHASH multiplies interleaved between the AES rounds.
+--   t:        table of seven scratch xmm register numbers.
+--   ctr:      xmm register holding the running counter.
+--   loop_idx: passed to almost_encrypt_8 ("in_order" or anything else).
+-- On exit xmm1..xmm8 hold the new byte-shuffled ciphertext blocks, the
+-- reduced hash of the previous eight has been XORed into xmm1, r11 has
+-- advanced by 128 and r13 has been decremented by 128. The flags from
+-- that final `sub' are what the caller's following `jne' tests.
+local function ghash_8_encrypt_8_parallel(Dst, t, ctr, loop_idx, operation)
+ -- r15b mirrors the low counter byte (see gcm_enc_dec).
+ | add r15b, 8
+ -- Keep the blocks to be hashed: xmm1 in a register, xmm2..xmm8 in stack
+ -- slots 1..7, since xmm1..xmm8 are about to be overwritten.
+ | vmovdqa xmm(t[2]), xmm1
+ for i = 2, 8 do
+ | vmovdqa [rsp + 16*(i-1)], xmm(i)
+ end
+
+ almost_encrypt_8(Dst, 1, ctr, t[1], operation, loop_idx, function(round)
+ -- Ahead of rounds 3..10, multiply saved block (round-2) by the hash
+ -- key at offset 16*(21-round) (offsets 18 down to 11), accumulating
+ -- the high product in t[4], the low product in t[7] and the cross
+ -- terms in t[6].
+ if round >= 3 then
+ | vmovdqa xmm(t[5]), [arg1 + 16*(21-round)]
+ local xor
+ if round > 3 then
+ | vmovdqa xmm(t[2]), [rsp + 16*(round-3)]
+ xor = t[3]
+ end
+ mulqdqxor(Dst, t[4], t[2], t[5], 0x11, xor)
+ mulqdqxor(Dst, t[7], t[2], t[5], 0x00, xor)
+ mulqdqxor(Dst, t[6], t[2], t[5], 0x01, xor)
+ mulqdqxor(Dst, t[6], t[2], t[5], 0x10, t[3])
+ end
+ end)
+
+ -- Final AES round. The last round key is XORed with the input block
+ -- first, so vaesenclast yields keystream XOR input directly.
+ | vmovdqa xmm(t[5]), [arg1+16*10]
+ for j = 1, 8 do
+ local i = j - 1
+ | vpxor xmm(t[2]), xmm(t[5]), [arg3 + r11 + 16*i]
+ if operation == "enc" then
+ | vaesenclast xmm(j), xmm(j), xmm(t[2])
+ | vmovdqu [arg2 + r11 + 16*i], xmm(j)
+ else
+ -- Decrypting: store the plaintext, keep the input in xmm(j) for hashing.
+ | vaesenclast xmm(t[3]), xmm(j), xmm(t[2])
+ | vmovdqu xmm(j), [arg3 + r11 + 16*i]
+ | vmovdqu [arg2 + r11 + 16*i], xmm(t[3])
+ end
+ | vpshufb xmm(j), xmm(j), [->shuf_mask]
+ end
+
+ -- Split the cross terms across the low and high halves, reduce, and
+ -- fold the hash into the first new block.
+ | vpslldq xmm(t[3]), xmm(t[6]), 8
+ | vpsrldq xmm(t[6]), xmm(t[6]), 8
+ | vpxor xmm(t[7]), xmm(t[7]), xmm(t[3])
+ | vpxor xmm(t[1]), xmm(t[4]), xmm(t[6])
+ || ghash_tail(Dst, t[7], t[1], t[2], t[3])
+ | vpxor xmm1, xmm1, xmm(t[7])
+ | add r11, 128
+ | sub r13, 128
+end
+
+-- Emit the GHASH of the eight blocks in xmm1..xmm8 once no further
+-- blocks remain to be encrypted alongside. Block i is multiplied by the
+-- hash key at offset 16*(19-i) (offsets 18 down to 11). Each
+-- multiplication uses three carry-less multiplies: high halves, low
+-- halves, and the product of the XOR-folded halves.
+--   t: table of seven scratch xmm register numbers.
+-- The reduced result is produced in xmm(t[7]) and then copied with a
+-- hard-coded `vmovdqa xmm14, xmm15', so t[7] must be 15 for the result to
+-- reach xmm14, as it is at the only call site.
+local function ghash_last_8(Dst, t)
+ for i = 1, 8 do
+ | vmovdqa xmm(t[5]), [arg1 + 16*(19-i)]
+ -- Fold the two 64-bit halves of the block and of the key.
+ | vpshufd xmm(t[2]), xmm(i), 0x4e
+ | vpshufd xmm(t[3]), xmm(t[5]), 0x4e
+ | vpxor xmm(t[2]), xmm(t[2]), xmm(i)
+ | vpxor xmm(t[3]), xmm(t[3]), xmm(t[5])
+ -- The first block initialises the accumulators; later ones XOR into them.
+ mulqdqxor(Dst, t[6], i, t[5], 0x11, i ~= 1 and t[4])
+ mulqdqxor(Dst, t[7], i, t[5], 0x00, i ~= 1 and t[4])
+ mulqdqxor(Dst, 1, t[2], t[3], 0x00, i ~= 1 and t[4])
+ end
+ -- Recover the middle term from the three accumulators.
+ | vpxor xmm1, xmm1, xmm(t[6])
+ | vpxor xmm(t[2]), xmm1, xmm(t[7])
+
+ | vpslldq xmm(t[4]), xmm(t[2]), 8
+ | vpsrldq xmm(t[2]), xmm(t[2]), 8
+ | vpxor xmm(t[7]), xmm(t[7]), xmm(t[4])
+ | vpxor xmm(t[6]), xmm(t[6]), xmm(t[2])
+ || ghash_tail(Dst, t[7], t[6], t[2], t[3])
+ | vmovdqa xmm14, xmm15
+end
+
+-- Emit an in-place AES-128 encryption of the block in xmm(x): an initial
+-- XOR with round key 0, nine vaesenc rounds and one vaesenclast round,
+-- with the round keys read from [arg1 + 16*0 .. 16*10].
+local function encrypt_single_block(Dst, x)
+ | vpxor xmm(x), xmm(x), [arg1+16*0]
+ for i = 1, 9 do
+ | vaesenc xmm(x), xmm(x), [arg1+16*i]
+ end
+ | vaesenclast xmm(x), xmm(x), [arg1+16*10]
+end
+
+-- Emit the function entry sequence: push r12..r15, remember the
+-- resulting stack pointer in r14 (the base for arg7..arg9 and for
+-- epilogue), then reserve 16*8 bytes of scratch space and round rsp down
+-- to a multiple of 64.
+local function prologue(Dst)
+ for i = 12, 15 do
+ | push Rq(i)
+ end
+ | mov r14, rsp
+ | sub rsp, 16*8
+ | and rsp, -64
+end
+
+-- Emit the function exit sequence matching prologue(): restore rsp from
+-- r14, pop r15..r12 in reverse push order and return.
+local function epilogue(Dst)
+ | mov rsp, r14
+ for i = 15, 12, -1 do
+ | pop Rq(i)
+ end
+ | ret
+end
+
+-- Emit a complete GCM encryption ("enc") or decryption ("dec") routine.
+--   operation: "enc" or "dec".
+--   pc:        base index of the eight DynASM pc labels this routine uses
+--              (pc+0 .. pc+7), one per possible initial block count.
+-- Arguments of the emitted function, as used below:
+--   arg1 gcm_data, arg2 output, arg3 input, arg4 input length in bytes,
+--   arg5 initial counter block (16 bytes are read), arg6 AAD pointer,
+--   arg7 AAD length, arg8 tag output pointer, arg9 tag length.
+-- Fixed register roles: xmm9 counter, xmm13 hash key, xmm14 running hash,
+-- r11 data offset, r13 remaining bytes of whole blocks.
+local function gcm_enc_dec(Dst, operation, pc)
+ prologue(Dst)
+
+ -- r13 = length rounded down to whole 16-byte blocks;
+ -- r12 = number of whole blocks modulo 8 = number of "initial" blocks.
+ | mov r13, arg4
+ | and r13, -16
+ | mov r12, r13
+ | shr r12, 4
+ | and r12, 7
+ -- Dispatch to the initial_blocks variant for that count.
+ | jz =>pc+0
+ for i = 7, 2, -1 do
+ | cmp r12, i
+ | je =>pc+i
+ end
+ | jmp =>pc+1
+ -- One specialised copy per count. The last one (0) falls through to
+ -- label 8 instead of jumping.
+ for i = 7, 0, -1 do
+ |=>pc+i:
+ || initial_blocks(Dst, i, {12, 13, 14, 15}, 9, 0, operation)
+ if i ~= 0 then
+ | sub r13, 16*i
+ | jmp >8
+ end
+ end
+
+ |8:
+ -- No whole blocks left: skip the main loop and the final 8-block hash.
+ | cmp r13, 0
+ | je >1
+ -- Exactly eight blocks left (already encrypted by initial_blocks):
+ -- only their hash remains.
+ | sub r13, 128
+ | je >2
+ -- r15b tracks the low counter byte to detect when adding 8 would carry
+ -- out of it.
+ | vmovd r15d, xmm9
+ | and r15d, 255
+ | vpshufb xmm9, xmm9, [->shuf_mask]
+ |3:
+ -- NOTE(review): this is an 8-bit compare followed by a signed `jg';
+ -- as a signed byte the immediate 255-8 is negative, so values 0..127
+ -- of r15b also appear to take the "in_order" path at label 4 --
+ -- confirm whether that is intended.
+ | cmp r15b, 255-8
+ | jg >4
+ || ghash_8_encrypt_8_parallel(Dst, {0, 10, 11, 12, 13, 14, 15}, 9, "out_order", operation)
+ -- Flags still come from `sub r13, 128' inside the macro above.
+ | jne <3
+ | vpshufb xmm9, xmm9, [->shuf_mask]
+ | jmp >2
+ |4:
+ -- Counter increments are done on the un-shuffled counter here.
+ | vpshufb xmm9, xmm9, [->shuf_mask]
+ || ghash_8_encrypt_8_parallel(Dst, {0, 10, 11, 12, 13, 14, 15}, 9, "in_order", operation)
+ | vpshufb xmm9, xmm9, [->shuf_mask]
+ | jne <3
+ | vpshufb xmm9, xmm9, [->shuf_mask]
+ |2:
+ -- Hash the last eight ciphertext blocks; the result lands in xmm14.
+ || ghash_last_8(Dst, {0, 10, 11, 12, 13, 14, 15})
+ |1:
+
+ -- Trailing partial block: r13 = length modulo 16; skip if zero.
+ | mov r13, arg4
+ | and r13, 15
+ | je >1
+
+ -- Keystream for the partial block.
+ | vpaddd xmm9, xmm9, [->one]
+ | vpshufb xmm9, xmm9, [->shuf_mask]
+ || encrypt_single_block(Dst, 9)
+
+ -- Load the 16 bytes that end at the end of the input, then move the
+ -- r13 trailing bytes into place with a shuffle mask read from r13
+ -- bytes before all_f.
+ | sub r11, 16
+ | add r11, r13
+ | vmovdqu xmm1, [arg3 + r11]
+ | lea r12, [->all_f]
+ | sub r12, r13
+ | vmovdqu xmm2, [r12]
+ | vpshufb xmm1, xmm1, xmm2
+
+ if operation == "dec" then
+ -- Keep the ciphertext bytes for hashing.
+ | vmovdqa xmm2, xmm1
+ end
+ | vpxor xmm9, xmm9, xmm1
+ -- Mask off everything beyond the r13 valid bytes.
+ | vmovdqu xmm1, [r12 + 16]
+ | vpand xmm9, xmm9, xmm1
+ if operation == "dec" then
+ | vpand xmm2, xmm2, xmm1
+ else
+ | vmovdqa xmm2, xmm9
+ end
+ -- Hash the (masked) ciphertext of the partial block.
+ | vpshufb xmm2, xmm2, [->shuf_mask]
+ | vpxor xmm14, xmm14, xmm2
+ || ghash_mul(Dst, 14, 13, 0, 10, 11)
+ | sub r11, r13
+ | add r11, 16
+
+ -- Write the r13 output bytes: one 8-byte store if more than eight
+ -- remain, then byte by byte.
+ | vmovd rax, xmm9
+ | cmp r13, 8
+ | jle >2
+ | mov [arg2 + r11], rax
+ | add r11, 8
+ | vpsrldq xmm9, xmm9, 8
+ | vmovd rax, xmm9
+ | sub r13, 8
+ |2:
+ | mov byte [arg2 + r11], al
+ | add r11, 1
+ | shr rax, 8
+ | sub r13, 1
+ | jne <2
+
+ |1:
+ -- Length block: AAD length in bits (arg7 * 8) in the upper qword, data
+ -- length in bits (arg4 * 8) in the lower qword. Only the low 32 bits of
+ -- the AAD bit length are moved into xmm15.
+ | mov r12, arg7
+ | shl r12, 3
+ | vmovd xmm15, r12d
+
+ | shl arg4, 3
+ | vmovd xmm1, arg4
+ | vpslldq xmm15, xmm15, 8
+ | vpxor xmm15, xmm15, xmm1
+
+ -- Final GHASH step, then XOR with the encrypted initial counter block
+ -- to obtain the tag in xmm9.
+ | vpxor xmm14, xmm14, xmm15
+ || ghash_mul(Dst, 14, 13, 0, 10, 11)
+ | vpshufb xmm14, xmm14, [->shuf_mask]
+ | mov rax, arg5
+ | vmovdqu xmm9, [rax]
+ || encrypt_single_block(Dst, 9)
+ | vpxor xmm9, xmm9, xmm14
+
+ -- Store the tag at arg8: 16 bytes if arg9 == 16, 12 bytes if
+ -- arg9 == 12, otherwise 8 bytes.
+ | mov r10, arg8
+ | mov r11, arg9
+ | cmp r11, 16
+ | je >3
+ | cmp r11, 12
+ | je >2
+ | vmovd rax, xmm9
+ | mov [r10], rax
+ | jmp >4
+ |2:
+ | vmovd rax, xmm9
+ | mov [r10], rax
+ | vpsrldq xmm9, xmm9, 8
+ | vmovd eax, xmm9
+ | mov [r10 + 8], eax
+ | jmp >4
+ |3:
+ | vmovdqu [r10], xmm9
+ |4:
+
+ epilogue(Dst)
+end
+
+-- Emit the hash key precomputation routine.
+-- Arguments of the emitted function: arg1 = gcm_data, arg2 = pointer to
+-- the 16-byte hash subkey.
+-- The subkey is byte-shuffled, shifted left by one bit as a 128-bit
+-- value and conditionally XORed with the `poly' constant, then stored at
+-- [arg1 + 16*11]. Its successive GHASH powers 2..8 are stored at
+-- [arg1 + 16*12] .. [arg1 + 16*18]. The stores use vmovdqa, so arg1 must
+-- be 16-byte aligned.
+local function precompute(Dst)
+ prologue(Dst)
+
+ | vmovdqu xmm6, [arg2]
+ | vpshufb xmm6, xmm6, [->shuf_mask]
+ -- 128-bit left shift by one: shift both qwords and carry the top bit
+ -- of the low qword into the high one.
+ | vmovdqa xmm2, xmm6
+ | vpsllq xmm6, xmm6, 1
+ | vpsrlq xmm2, xmm2, 63
+ | vmovdqa xmm1, xmm2
+ | vpslldq xmm2, xmm2, 8
+ | vpsrldq xmm1, xmm1, 8
+ | vpor xmm6, xmm6, xmm2
+ -- If a bit was shifted out of the top, XOR in the polynomial.
+ | vpshufd xmm2, xmm1, 0x24
+ | vpcmpeqd xmm2, xmm2, [->two_one]
+ | vpand xmm2, xmm2, [->poly]
+ | vpxor xmm6, xmm6, xmm2
+ | vmovdqa [arg1 + 16*11], xmm6
+
+ -- xmm4 runs through the powers of the key held in xmm6.
+ | vmovdqa xmm4, xmm6
+ for i = 2, 8 do
+ || ghash_mul(Dst, 4, 6, 0, 1, 2)
+ | vmovdqa [arg1 + 16*(10+i)], xmm4
+ end
+
+ epilogue(Dst)
+end
+
+-- Emit the AES-128 key expansion routine.
+-- Arguments of the emitted function: arg1 = pointer to the 16-byte key
+-- (read with an unaligned load), arg2 = destination for the eleven
+-- 16-byte round keys (written with vmovdqa, so it must be 16-byte
+-- aligned).
+-- This routine does not use prologue/epilogue; it only touches arg1,
+-- arg2 and xmm1..xmm3.
+local function keyexp(Dst)
+ -- Round key 0 is the key itself.
+ | vmovdqu xmm1, [arg1]
+ | vmovdqa [arg2], xmm1
+ | vpxor xmm3, xmm3, xmm3
+ for i = 1, 10 do
+ -- Round constant: 1, 2, 4, ..., 128 for i = 1..8, then 27 and 54.
+ | vaeskeygenassist xmm2, xmm1, i < 9 and 2^(i-1) or 27*(i-8)
+ | vpshufd xmm2, xmm2, 0xff
+ | vshufps xmm3, xmm3, xmm1, 0x10
+ | vpxor xmm1, xmm1, xmm3
+ | vshufps xmm3, xmm3, xmm1, 0x8c
+ | vpxor xmm1, xmm1, xmm3
+ | vpxor xmm1, xmm1, xmm2
+ | vmovdqa [arg2 + 16*i], xmm1
+ end
+ | ret
+end
+
+-- Emit all routines and the constant pool into Dst.
+-- Sixteen pc labels are reserved: 0..7 for the encryption routine and
+-- 8..15 for the decryption routine (see gcm_enc_dec).
+local function generator(Dst)
+ Dst:growpc(16)
+
+ -- Functions
+ |->aesni_gcm_precomp_avx_gen4:
+ || precompute(Dst)
+ |.align 16
+ |->aes_keyexp_128_enc_avx:
+ || keyexp(Dst)
+ |.align 16
+ |->aesni_gcm_enc_avx_gen4:
+ || gcm_enc_dec(Dst, "enc", 0)
+ |.align 16
+ |->aesni_gcm_dec_avx_gen4:
+ || gcm_enc_dec(Dst, "dec", 8)
+
+ -- Data
+ -- The order of the constants matters: the partial-block code in
+ -- gcm_enc_dec reads 16 bytes starting up to 15 bytes *before* all_f
+ -- (i.e. inside the 0..15 byte table emitted after shuf_mask) and 16
+ -- bytes starting at that address + 16 (spanning all_f and the zero
+ -- dwords that follow it).
+ |.align 64
+ |->poly:; .dword 1, 0, 0, 0xC2000000
+ |->poly2:; .dword 0xC2000000, 1, 0, 0xC2000000
+ |->two_one:; .dword 1, 0, 0, 1
+ -- shuf_mask: bytes 15..0, a byte-reversal mask for vpshufb.
+ |->shuf_mask:
+ for i = 15, 0, -1 do
+ |.byte i
+ end
+ -- Bytes 0..15: identity shuffle table preceding all_f.
+ for i = 0, 15 do
+ |.byte i
+ end
+ |->all_f:; .dword -1, -1, -1, -1
+ | .dword 0, 0, 0, 0
+ -- Counter increments: `one' for the un-shuffled counter, `onef' (2^24
+ -- in the top dword) for the byte-shuffled one.
+ |->one:; .dword 1, 0, 0, 0
+ |->onef:; .dword 0, 0, 0, 2^24
+end
+
+-- Assemble the code and export the four entry points as FFI function
+-- pointers. fn_t is the shared signature of the encrypt and decrypt
+-- routines: (gcm_data, out, in, length, iv, aad, aad_length, tag,
+-- tag_length).
+-- The machine code object is stored in the returned table's metatable
+-- under _anchor, so it stays referenced for as long as the table does.
+-- NOTE(review): presumably this is what keeps the generated code from
+-- being garbage collected -- confirm against dasm's Dst:build().
+local Dst, globals = dasm.new(actions, nil, nil, 1 + #globalnames)
+generator(Dst)
+local mcode, size = Dst:build()
+local entry = dasm.globals(globals, globalnames)
+local fn_t = ffi.typeof("void(*)(gcm_data*, uint8_t*, const uint8_t*, uint64_t, uint8_t*, const uint8_t*, uint64_t, uint8_t*, uint64_t)")
+return setmetatable({
+ aes_keyexp_128_enc_avx = ffi.cast("void(*)(void*, void*)", entry.aes_keyexp_128_enc_avx),
+ aesni_gcm_precomp_avx_gen4 = ffi.cast("void(*)(gcm_data*, uint8_t*)", entry.aesni_gcm_precomp_avx_gen4),
+ aesni_gcm_enc_avx_gen4 = ffi.cast(fn_t, entry.aesni_gcm_enc_avx_gen4),
+ aesni_gcm_dec_avx_gen4 = ffi.cast(fn_t, entry.aesni_gcm_dec_avx_gen4),
+}, {_anchor = mcode})
diff --git a/src/lib/ipsec/esp.lua b/src/lib/ipsec/esp.lua
new file mode 100644
index 0000000000..36fb473401
--- /dev/null
+++ b/src/lib/ipsec/esp.lua
@@ -0,0 +1,149 @@
+module(..., package.seeall)
+local datagram = require("lib.protocol.datagram")
+local ethernet = require("lib.protocol.ethernet")
+local esp = require("lib.protocol.esp")
+local esp_tail = require("lib.protocol.esp_tail")
+local aes_128_gcm = require("lib.ipsec.aes_128_gcm")
+local lib = require("core.lib")
+local ffi = require("ffi")
+
+
+local esp_nh = 50 -- https://tools.ietf.org/html/rfc4303#section-2
+local esp_length = esp:sizeof()
+local esp_tail_length = esp_tail:sizeof()
+
+-- Build the state shared by the encrypting and decrypting endpoints: an
+-- AES-128-GCM context created from conf.keymat and conf.salt, and a
+-- sequence counter starting at zero.
+-- Raises an error unless conf.mode is "aes-128-gcm".
+function esp_v6_new (conf)
+   assert(conf.mode == "aes-128-gcm", "Only supports aes-128-gcm.")
+   local state = {}
+   state.aes_128_gcm = aes_128_gcm:new(conf.keymat, conf.salt)
+   state.seq_no = 0
+   return state
+end
+
+
+local esp_v6_encrypt = {}
+
+-- Create an encrypting endpoint from `conf' (see esp_v6_new).
+-- Besides the shared state it owns a padding buffer of blocksize-1
+-- bytes, a buffer of aad_size bytes backing the reusable ESP header
+-- object, and a reusable ESP trailer object.
+function esp_v6_encrypt:new (conf)
+   local o = esp_v6_new(conf)
+   local gcm = o.aes_128_gcm
+   o.pad_buf = ffi.new("uint8_t[?]", gcm.blocksize-1)
+   o.esp_buf = ffi.new("uint8_t[?]", gcm.aad_size)
+   -- Fix me https://tools.ietf.org/html/rfc4303#section-3.3.3
+   o.esp = esp:new_from_mem(o.esp_buf, esp_length)
+   o.esp:spi(0x0) -- Fix me, set esp:spi value.
+   o.esp_tail = esp_tail:new({})
+   return setmetatable(o, {__index=esp_v6_encrypt})
+end
+
+-- Return next sequence number.
+-- Advance the outgoing sequence counter by one and return its new value.
+function esp_v6_encrypt:next_seq_no ()
+   local following = self.seq_no + 1
+   self.seq_no = following
+   return following
+end
+
+-- Wrap `length' bytes at `payload' into a newly allocated ESP packet:
+--   ESP header | payload | padding | ESP trailer | authentication tag
+-- and encrypt everything between the header and the tag in place.
+--   nh:      next-header value recorded in the ESP trailer.
+--   payload: pointer to the cleartext.
+--   length:  number of cleartext bytes.
+-- Returns the new packet.
+function esp_v6_encrypt:encrypt (nh, payload, length)
+   local gcm = self.aes_128_gcm
+   local blocksize = gcm.blocksize
+   local p = packet.allocate()
+   self.esp:seq_no(self:next_seq_no())
+   packet.append(p, self.esp:header_ptr(), esp_length)
+   packet.append(p, payload, length)
+   -- Pad payload + trailer up to a multiple of the block size. The outer
+   -- modulo keeps pad_length within 0 .. blocksize-1: without it an
+   -- already aligned payload received a full block of padding, which is
+   -- one byte more than pad_buf (blocksize-1 bytes) holds.
+   local pad_length = (blocksize - (length + esp_tail_length) % blocksize)
+                      % blocksize
+   if pad_length > 0 then
+      packet.append(p, self.pad_buf, pad_length)
+   end
+   self.esp_tail:next_header(nh)
+   self.esp_tail:pad_length(pad_length)
+   packet.append(p, self.esp_tail:header_ptr(), esp_tail_length)
+   -- Reserve room for the authentication tag; pad_buf merely serves as a
+   -- source of auth_size bytes here.
+   packet.append(p, self.pad_buf, gcm.auth_size)
+   local data = packet.data(p) + esp_length
+   gcm:encrypt(data, data, length + pad_length + esp_tail_length, self.esp)
+   return p
+end
+
+-- Turn a cleartext Ethernet/IP packet into its ESP-protected form.
+-- The first two headers are parsed off `p', the remaining payload is
+-- encrypted into a new packet, and the IP header (next header set to
+-- ESP, payload length updated) and Ethernet header are pushed in front.
+-- Returns the resulting packet.
+function esp_v6_encrypt:encapsulate (p)
+   local plain = datagram:new(p, ethernet)
+   local eth = plain:parse_match()
+   local ip = plain:parse_match()
+   local nh = ip:next_header()
+   local ciphertext = self:encrypt(nh, plain:payload())
+   local encrypted = datagram:new(ciphertext)
+   local _, esp_payload_length = encrypted:payload()
+   ip:next_header(esp_nh)
+   ip:payload_length(esp_payload_length)
+   encrypted:push(ip)
+   encrypted:push(eth)
+   return encrypted:packet()
+end
+
+
+local esp_v6_decrypt = {}
+
+-- Create a decrypting endpoint from `conf' (see esp_v6_new).
+-- esp_overhead_size is the ESP header plus the authentication tag;
+-- min_payload_length is one cipher block plus that overhead.
+function esp_v6_decrypt:new (conf)
+   local o = esp_v6_new(conf)
+   local gcm = o.aes_128_gcm
+   local overhead = esp_length + gcm.auth_size
+   o.esp_overhead_size = overhead
+   o.min_payload_length = gcm.blocksize + overhead
+   return setmetatable(o, {__index=esp_v6_decrypt})
+end
+
+-- Verify sequence number.
+-- Advance the local packet counter by one and report whether the
+-- received sequence number is at least as large as the new counter
+-- value. The counter advances whether or not the check succeeds.
+function esp_v6_decrypt:check_seq_no (seq_no)
+   local expected = self.seq_no + 1
+   self.seq_no = expected
+   return expected <= seq_no
+end
+
+-- Authenticate and decrypt, in place, an ESP payload of `length' bytes
+-- starting at `payload' (ESP header first, authentication tag last).
+-- Returns nothing if the payload is shorter than min_payload_length, if
+-- the encrypted part is not a whole number of blocks, or if the tag does
+-- not verify. Otherwise returns three values: the sequence number from
+-- the ESP header, a packet holding the cleartext (padding and trailer
+-- stripped), and the trailer's next-header value.
+function esp_v6_decrypt:decrypt (payload, length)
+   local gcm = self.aes_128_gcm
+   if length < self.min_payload_length then return end
+   if (length - self.esp_overhead_size) % gcm.blocksize ~= 0 then return end
+   local data_start = payload + esp_length
+   local data_length = length - esp_length - gcm.auth_size
+   local esp_header = esp:new_from_mem(payload, esp_length)
+   if not gcm:decrypt(data_start, data_start, data_length, esp_header) then
+      return
+   end
+   -- The trailer occupies the last bytes of the decrypted data.
+   local tail_start = data_start + data_length - esp_tail_length
+   local tail = esp_tail:new_from_mem(tail_start, esp_tail_length)
+   local cleartext_length = data_length - tail:pad_length() - esp_tail_length
+   local p = packet.from_pointer(data_start, cleartext_length)
+   return esp_header:seq_no(), p, tail:next_header()
+end
+
+-- Turn an ESP-protected Ethernet/IP packet back into its cleartext form.
+-- Returns the reassembled packet (original Ethernet and IP headers, with
+-- the IP next header and payload length restored, in front of the
+-- decrypted payload). Returns nothing if the IP next header is not ESP,
+-- if decryption fails, or if the sequence number check fails.
+function esp_v6_decrypt:decapsulate (p)
+   local encrypted = datagram:new(p, ethernet)
+   local eth = encrypted:parse_match()
+   local ip = encrypted:parse_match()
+   if ip:next_header() ~= esp_nh then
+      return
+   end
+   local seq_no, payload, nh = self:decrypt(encrypted:payload())
+   if not (payload and self:check_seq_no(seq_no)) then
+      return
+   end
+   local plain = datagram:new(payload)
+   ip:next_header(nh)
+   ip:payload_length(packet.length(payload))
+   plain:push(ip)
+   plain:push(eth)
+   return plain:packet()
+end
+
+
+-- Round-trip self test: build an Ethernet/IPv6 packet around a short
+-- text payload, encapsulate it with an encrypting endpoint, decapsulate
+-- the result with a decrypting endpoint sharing the same key material,
+-- and compare the outcome against the original packet. Prints the
+-- result and exits with status 1 on mismatch.
+function selftest ()
+ local C = require("ffi").C
+ local ipv6 = require("lib.protocol.ipv6")
+ -- keymat: 32 hex characters, salt: 8 hex characters, as asserted by
+ -- aes_128_gcm:new.
+ local conf = { mode = "aes-128-gcm",
+ keymat = "00112233445566778899AABBCCDDEEFF",
+ salt = "00112233"}
+ local enc, dec = esp_v6_encrypt:new(conf), esp_v6_decrypt:new(conf)
+ local payload = packet.from_string(
+[[abcdefghijklmnopqrstuvwxyz
+ABCDEFGHIJKLMNOPQRSTUVWXYZ
+0123456789]]
+ )
+ local d = datagram:new(payload)
+ d:push(ipv6:new({}))
+ d:push(ethernet:new({type=0x86dd}))
+ -- Check integrity
+ local p = d:packet()
+ local p2 = dec:decapsulate(enc:encapsulate(p))
+ -- NOTE(review): memcmp is applied to the packet objects themselves;
+ -- this presumably relies on the packet data being located at the start
+ -- of the packet struct -- confirm against core.packet.
+ if p2 and p2.length == p.length and C.memcmp(p, p2, p.length) == 0 then
+ print("selftest passed")
+ else
+ print("integrity check failed")
+ os.exit(1)
+ end
+end
diff --git a/src/lib/protocol/esp.lua b/src/lib/protocol/esp.lua
new file mode 100644
index 0000000000..2395feb3bf
--- /dev/null
+++ b/src/lib/protocol/esp.lua
@@ -0,0 +1,51 @@
+module(..., package.seeall)
+local ffi = require("ffi")
+local header = require("lib.protocol.header")
+local lib = require("core.lib")
+local ntohl, htonl = lib.ntohl, lib.htonl
+local ntohll, htonll = lib.ntohll, lib.htonll
+
+-- ESP header class: a packed struct of a 32-bit SPI followed by a 64-bit
+-- sequence number (12 bytes in total).
+-- NOTE(review): RFC 4303 section 2 specifies a 32-bit Sequence Number
+-- field on the wire; confirm whether the 64-bit field here is intended.
+local esp = subClass(header)
+
+-- Class variables
+esp._name = "esp"
+esp:init(
+ {
+ [1] = ffi.typeof[[
+ struct {
+ uint32_t spi;
+ uint64_t seq_no;
+ } __attribute__((packed))
+ ]]
+ })
+
+-- Class methods
+
+-- Create an ESP header object and initialise it from `config':
+-- config.spi and config.seq_no are handed to the respective accessors
+-- (a nil value leaves the field as the parent constructor created it).
+function esp:new (config)
+   local parent = esp:superClass()
+   local o = parent.new(self)
+   o:spi(config.spi)
+   o:seq_no(config.seq_no)
+   return o
+end
+
+-- Instance methods
+
+-- Accessor for the SPI field.
+-- With an argument: stores it converted through htonl and returns
+-- nothing. Without an argument: returns the field converted through
+-- ntohl.
+function esp:spi (spi)
+   local hdr = self:header()
+   if spi == nil then
+      local value = ntohl(hdr.spi)
+      return value
+   end
+   hdr.spi = htonl(spi)
+end
+
+-- Accessor for the sequence number field.
+-- With an argument: stores it converted through htonll and returns
+-- nothing. Without an argument: returns the field converted through
+-- ntohll.
+function esp:seq_no (seq_no)
+   local hdr = self:header()
+   if seq_no == nil then
+      local value = ntohll(hdr.seq_no)
+      return value
+   end
+   hdr.seq_no = htonll(seq_no)
+end
+
+return esp
diff --git a/src/lib/protocol/esp_tail.lua b/src/lib/protocol/esp_tail.lua
new file mode 100644
index 0000000000..d8cadfee3a
--- /dev/null
+++ b/src/lib/protocol/esp_tail.lua
@@ -0,0 +1,48 @@
+module(..., package.seeall)
+local ffi = require("ffi")
+local header = require("lib.protocol.header")
+
+-- ESP trailer class: a packed struct of two bytes, the pad length
+-- followed by the next-header value.
+local esp_tail = subClass(header)
+
+-- Class variables
+esp_tail._name = "esp_tail"
+esp_tail:init(
+ {
+ [1] = ffi.typeof[[
+ struct {
+ uint8_t pad_length;
+ uint8_t next_header;
+ } __attribute__((packed))
+ ]]
+ })
+
+-- Class methods
+
+-- Create an ESP trailer object and initialise it from `config':
+-- config.pad_length and config.next_header are handed to the respective
+-- accessors (a nil value leaves the field as the parent constructor
+-- created it).
+function esp_tail:new (config)
+   local parent = esp_tail:superClass()
+   local o = parent.new(self)
+   o:pad_length(config.pad_length)
+   o:next_header(config.next_header)
+   return o
+end
+
+-- Instance methods
+
+-- Accessor for the pad length byte.
+-- With an argument: stores it and returns nothing. Without an argument:
+-- returns the stored value.
+function esp_tail:pad_length (length)
+   local hdr = self:header()
+   if length == nil then
+      return hdr.pad_length
+   end
+   hdr.pad_length = length
+end
+
+-- Accessor for the next-header byte.
+-- With an argument: stores it and returns nothing. Without an argument:
+-- returns the stored value.
+function esp_tail:next_header (next_header)
+   local hdr = self:header()
+   if next_header == nil then
+      return hdr.next_header
+   end
+   hdr.next_header = next_header
+end
+
+return esp_tail