diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/lib/luajit/doc/changes.html b/lib/luajit/doc/changes.html index 125b58b4ca..826cd2436b 100644 --- a/lib/luajit/doc/changes.html +++ b/lib/luajit/doc/changes.html @@ -113,6 +113,7 @@

LuaJIT 2.1.0-beta1 — 2015-08-25

  • x64: Add separate port of the interpreter to LJ_GC64 mode.
  • x86/x64: Drop internal x87 math functions. Use libm functions.
  • x86: Remove x87 support from interpreter. SSE2 is mandatory now.
  • +
  • x86/x64: Add support for AES-NI, AVX and AVX2 to DynASM.
  • PPC/e500: Drop support for this architecture.
  • FFI library: @@ -123,6 +124,7 @@

    LuaJIT 2.1.0-beta1 — 2015-08-25

  • FFI: Compile lightuserdata to void * conversion.
  • FFI: Compile ffi.gc(cdata, nil), too.
  • FFI: Add ffi.typeinfo().
  • +
  • FFI: Add ssize_t declaration.
  • diff --git a/lib/luajit/doc/ext_ffi_semantics.html b/lib/luajit/doc/ext_ffi_semantics.html index 889d44d823..f65fe8f36d 100644 --- a/lib/luajit/doc/ext_ffi_semantics.html +++ b/lib/luajit/doc/ext_ffi_semantics.html @@ -185,6 +185,8 @@

    C Language Support

    uint16_t, uint32_t, uint64_t, intptr_t, uintptr_t. +
  • From <unistd.h> (POSIX): ssize_t.
  • +

    You're encouraged to use these types in preference to diff --git a/lib/luajit/doc/install.html b/lib/luajit/doc/install.html index b5df697b67..a4cc721512 100644 --- a/lib/luajit/doc/install.html +++ b/lib/luajit/doc/install.html @@ -114,30 +114,30 @@

    Installation

    x86 (32 bit) -GCC 4.x
    GCC 3.4 -GCC 4.x
    GCC 3.4 -GCC 4.x
    GCC 3.4 +GCC 4.2+ +GCC 4.2+ +XCode 5.0+
    Clang MSVC, MSVC/EE
    WinSDK
    MinGW, Cygwin x64 (64 bit) -GCC 4.x +GCC 4.2+ ORBIS (PS4) -GCC 4.x +XCode 5.0+
    Clang MSVC + SDK v7.0
    WinSDK v7.0
    Durango (Xbox One) ARMv5+
    ARM9E+
    GCC 4.2+ GCC 4.2+
    PSP2 (PS VITA) -GCC 4.2+ +XCode 5.0+
    Clang   ARM64 GCC 4.8+   -Clang 3.5+ +XCode 6.0+
    Clang 3.5+   @@ -442,8 +442,7 @@

    Cross-compiling LuaJIT

    make HOST_CC="gcc -m32" CROSS=$NDKP TARGET_FLAGS="$NDKF"

    -You can cross-compile for iOS 3.0+ (iPhone/iPad) using the » iOS SDK. -The environment variables need to match the iOS SDK version: +You can cross-compile for iOS 3.0+ (iPhone/iPad) using the » iOS SDK:

    Note: the JIT compiler is disabled for iOS, because regular iOS Apps @@ -453,13 +452,18 @@

    Cross-compiling LuaJIT

    Or use Android. :-p

    -IXCODE=`xcode-select -print-path`
    -ISDK=$IXCODE/Platforms/iPhoneOS.platform/Developer
    -ISDKVER=iPhoneOS6.0.sdk
    -ISDKP=$ISDK/usr/bin/
    -ISDKF="-arch armv7 -isysroot $ISDK/SDKs/$ISDKVER"
    -make HOST_CC="gcc -m32 -arch i386" CROSS=$ISDKP TARGET_FLAGS="$ISDKF" \
    -     TARGET_SYS=iOS
    +# iOS/ARM (32 bit)
    +ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
    +ICC=$(xcrun --sdk iphoneos --find clang)
    +ISDKF="-arch armv7 -isysroot $ISDKP"
    +make HOST_CC="clang -m32 -arch i386" CROSS="$(dirname $ICC)/" \
    +     TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
    +
    +# iOS/ARM64
    +ISDKP=$(xcrun --sdk iphoneos --show-sdk-path)
    +ICC=$(xcrun --sdk iphoneos --find clang)
    +ISDKF="-arch arm64 -isysroot $ISDKP"
    +make CROSS="$(dirname $ICC)/" TARGET_FLAGS="$ISDKF" TARGET_SYS=iOS
     

    Cross-compiling for consoles

    diff --git a/lib/luajit/dynasm/dasm_arm.lua b/lib/luajit/dynasm/dasm_arm.lua index 90a259c5c3..6a1d1d5195 100644 --- a/lib/luajit/dynasm/dasm_arm.lua +++ b/lib/luajit/dynasm/dasm_arm.lua @@ -9,9 +9,9 @@ local _info = { arch = "arm", description = "DynASM ARM module", - version = "1.3.0", - vernum = 10300, - release = "2011-05-05", + version = "1.4.0", + vernum = 10400, + release = "2015-10-18", author = "Mike Pall", license = "MIT", } diff --git a/lib/luajit/dynasm/dasm_arm64.lua b/lib/luajit/dynasm/dasm_arm64.lua index 9766e475b0..c1e3a81b11 100644 --- a/lib/luajit/dynasm/dasm_arm64.lua +++ b/lib/luajit/dynasm/dasm_arm64.lua @@ -9,9 +9,9 @@ local _info = { arch = "arm", description = "DynASM ARM64 module", - version = "1.3.0", - vernum = 10300, - release = "2014-12-03", + version = "1.4.0", + vernum = 10400, + release = "2015-10-18", author = "Mike Pall", license = "MIT", } diff --git a/lib/luajit/dynasm/dasm_mips.lua b/lib/luajit/dynasm/dasm_mips.lua index ae0dbd7a9b..ef383431cd 100644 --- a/lib/luajit/dynasm/dasm_mips.lua +++ b/lib/luajit/dynasm/dasm_mips.lua @@ -9,9 +9,9 @@ local _info = { arch = "mips", description = "DynASM MIPS module", - version = "1.3.0", - vernum = 10300, - release = "2012-01-23", + version = "1.4.0", + vernum = 10400, + release = "2015-10-18", author = "Mike Pall", license = "MIT", } diff --git a/lib/luajit/dynasm/dasm_ppc.lua b/lib/luajit/dynasm/dasm_ppc.lua index 278f09526d..1e9bccaeb8 100644 --- a/lib/luajit/dynasm/dasm_ppc.lua +++ b/lib/luajit/dynasm/dasm_ppc.lua @@ -11,9 +11,9 @@ local _info = { arch = "ppc", description = "DynASM PPC module", - version = "1.3.0", - vernum = 10300, - release = "2015-01-14", + version = "1.4.0", + vernum = 10400, + release = "2015-10-18", author = "Mike Pall", license = "MIT", } diff --git a/lib/luajit/dynasm/dasm_proto.h b/lib/luajit/dynasm/dasm_proto.h index a8bc6fd285..93ca06533c 100644 --- a/lib/luajit/dynasm/dasm_proto.h +++ b/lib/luajit/dynasm/dasm_proto.h @@ -10,8 +10,8 @@ #include 
#include -#define DASM_IDENT "DynASM 1.3.0" -#define DASM_VERSION 10300 /* 1.3.0 */ +#define DASM_IDENT "DynASM 1.4.0" +#define DASM_VERSION 10400 /* 1.4.0 */ #ifndef Dst_DECL #define Dst_DECL dasm_State **Dst diff --git a/lib/luajit/dynasm/dasm_x86.h b/lib/luajit/dynasm/dasm_x86.h deleted file mode 100644 index 652e8c99b0..0000000000 --- a/lib/luajit/dynasm/dasm_x86.h +++ /dev/null @@ -1,471 +0,0 @@ -/* -** DynASM x86 encoding engine. -** Copyright (C) 2005-2015 Mike Pall. All rights reserved. -** Released under the MIT license. See dynasm.lua for full copyright notice. -*/ - -#include -#include -#include -#include - -#define DASM_ARCH "x86" - -#ifndef DASM_EXTERN -#define DASM_EXTERN(a,b,c,d) 0 -#endif - -/* Action definitions. DASM_STOP must be 255. */ -enum { - DASM_DISP = 233, - DASM_IMM_S, DASM_IMM_B, DASM_IMM_W, DASM_IMM_D, DASM_IMM_WB, DASM_IMM_DB, - DASM_VREG, DASM_SPACE, DASM_SETLABEL, DASM_REL_A, DASM_REL_LG, DASM_REL_PC, - DASM_IMM_LG, DASM_IMM_PC, DASM_LABEL_LG, DASM_LABEL_PC, DASM_ALIGN, - DASM_EXTERN, DASM_ESC, DASM_MARK, DASM_SECTION, DASM_STOP -}; - -/* Maximum number of section buffer positions for a single dasm_put() call. */ -#define DASM_MAXSECPOS 25 - -/* DynASM encoder status codes. Action list offset or number are or'ed in. */ -#define DASM_S_OK 0x00000000 -#define DASM_S_NOMEM 0x01000000 -#define DASM_S_PHASE 0x02000000 -#define DASM_S_MATCH_SEC 0x03000000 -#define DASM_S_RANGE_I 0x11000000 -#define DASM_S_RANGE_SEC 0x12000000 -#define DASM_S_RANGE_LG 0x13000000 -#define DASM_S_RANGE_PC 0x14000000 -#define DASM_S_RANGE_VREG 0x15000000 -#define DASM_S_UNDEF_L 0x21000000 -#define DASM_S_UNDEF_PC 0x22000000 - -/* Macros to convert positions (8 bit section + 24 bit index). */ -#define DASM_POS2IDX(pos) ((pos)&0x00ffffff) -#define DASM_POS2BIAS(pos) ((pos)&0xff000000) -#define DASM_SEC2POS(sec) ((sec)<<24) -#define DASM_POS2SEC(pos) ((pos)>>24) -#define DASM_POS2PTR(D, pos) (D->sections[DASM_POS2SEC(pos)].rbuf + (pos)) - -/* Action list type. 
*/ -typedef const unsigned char *dasm_ActList; - -/* Per-section structure. */ -typedef struct dasm_Section { - int *rbuf; /* Biased buffer pointer (negative section bias). */ - int *buf; /* True buffer pointer. */ - size_t bsize; /* Buffer size in bytes. */ - int pos; /* Biased buffer position. */ - int epos; /* End of biased buffer position - max single put. */ - int ofs; /* Byte offset into section. */ -} dasm_Section; - -/* Core structure holding the DynASM encoding state. */ -struct dasm_State { - size_t psize; /* Allocated size of this structure. */ - dasm_ActList actionlist; /* Current actionlist pointer. */ - int *lglabels; /* Local/global chain/pos ptrs. */ - size_t lgsize; - int *pclabels; /* PC label chains/pos ptrs. */ - size_t pcsize; - void **globals; /* Array of globals (bias -10). */ - dasm_Section *section; /* Pointer to active section. */ - size_t codesize; /* Total size of all code sections. */ - int maxsection; /* 0 <= sectionidx < maxsection. */ - int status; /* Status code. */ - dasm_Section sections[1]; /* All sections. Alloc-extended. */ -}; - -/* The size of the core structure depends on the max. number of sections. */ -#define DASM_PSZ(ms) (sizeof(dasm_State)+(ms-1)*sizeof(dasm_Section)) - - -/* Initialize DynASM state. */ -void dasm_init(Dst_DECL, int maxsection) -{ - dasm_State *D; - size_t psz = 0; - int i; - Dst_REF = NULL; - DASM_M_GROW(Dst, struct dasm_State, Dst_REF, psz, DASM_PSZ(maxsection)); - D = Dst_REF; - D->psize = psz; - D->lglabels = NULL; - D->lgsize = 0; - D->pclabels = NULL; - D->pcsize = 0; - D->globals = NULL; - D->maxsection = maxsection; - for (i = 0; i < maxsection; i++) { - D->sections[i].buf = NULL; /* Need this for pass3. */ - D->sections[i].rbuf = D->sections[i].buf - DASM_SEC2POS(i); - D->sections[i].bsize = 0; - D->sections[i].epos = 0; /* Wrong, but is recalculated after resize. */ - } -} - -/* Free DynASM state. 
*/ -void dasm_free(Dst_DECL) -{ - dasm_State *D = Dst_REF; - int i; - for (i = 0; i < D->maxsection; i++) - if (D->sections[i].buf) - DASM_M_FREE(Dst, D->sections[i].buf, D->sections[i].bsize); - if (D->pclabels) DASM_M_FREE(Dst, D->pclabels, D->pcsize); - if (D->lglabels) DASM_M_FREE(Dst, D->lglabels, D->lgsize); - DASM_M_FREE(Dst, D, D->psize); -} - -/* Setup global label array. Must be called before dasm_setup(). */ -void dasm_setupglobal(Dst_DECL, void **gl, unsigned int maxgl) -{ - dasm_State *D = Dst_REF; - D->globals = gl - 10; /* Negative bias to compensate for locals. */ - DASM_M_GROW(Dst, int, D->lglabels, D->lgsize, (10+maxgl)*sizeof(int)); -} - -/* Grow PC label array. Can be called after dasm_setup(), too. */ -void dasm_growpc(Dst_DECL, unsigned int maxpc) -{ - dasm_State *D = Dst_REF; - size_t osz = D->pcsize; - DASM_M_GROW(Dst, int, D->pclabels, D->pcsize, maxpc*sizeof(int)); - memset((void *)(((unsigned char *)D->pclabels)+osz), 0, D->pcsize-osz); -} - -/* Setup encoder. */ -void dasm_setup(Dst_DECL, const void *actionlist) -{ - dasm_State *D = Dst_REF; - int i; - D->actionlist = (dasm_ActList)actionlist; - D->status = DASM_S_OK; - D->section = &D->sections[0]; - memset((void *)D->lglabels, 0, D->lgsize); - if (D->pclabels) memset((void *)D->pclabels, 0, D->pcsize); - for (i = 0; i < D->maxsection; i++) { - D->sections[i].pos = DASM_SEC2POS(i); - D->sections[i].ofs = 0; - } -} - - -#ifdef DASM_CHECKS -#define CK(x, st) \ - do { if (!(x)) { \ - D->status = DASM_S_##st|(int)(p-D->actionlist-1); return; } } while (0) -#define CKPL(kind, st) \ - do { if ((size_t)((char *)pl-(char *)D->kind##labels) >= D->kind##size) { \ - D->status=DASM_S_RANGE_##st|(int)(p-D->actionlist-1); return; } } while (0) -#else -#define CK(x, st) ((void)0) -#define CKPL(kind, st) ((void)0) -#endif - -/* Pass 1: Store actions and args, link branches/labels, estimate offsets. */ -void dasm_put(Dst_DECL, int start, ...) 
-{ - va_list ap; - dasm_State *D = Dst_REF; - dasm_ActList p = D->actionlist + start; - dasm_Section *sec = D->section; - int pos = sec->pos, ofs = sec->ofs, mrm = 4; - int *b; - - if (pos >= sec->epos) { - DASM_M_GROW(Dst, int, sec->buf, sec->bsize, - sec->bsize + 2*DASM_MAXSECPOS*sizeof(int)); - sec->rbuf = sec->buf - DASM_POS2BIAS(pos); - sec->epos = (int)sec->bsize/sizeof(int) - DASM_MAXSECPOS+DASM_POS2BIAS(pos); - } - - b = sec->rbuf; - b[pos++] = start; - - va_start(ap, start); - while (1) { - int action = *p++; - if (action < DASM_DISP) { - ofs++; - } else if (action <= DASM_REL_A) { - int n = va_arg(ap, int); - b[pos++] = n; - switch (action) { - case DASM_DISP: - if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; } - case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob; - case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */ - case DASM_IMM_D: ofs += 4; break; - case DASM_IMM_S: CK(((n+128)&-256) == 0, RANGE_I); goto ob; - case DASM_IMM_B: CK((n&-256) == 0, RANGE_I); ob: ofs++; break; - case DASM_IMM_WB: if (((n+128)&-256) == 0) goto ob; - case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break; - case DASM_SPACE: p++; ofs += n; break; - case DASM_SETLABEL: b[pos-2] = -0x40000000; break; /* Neg. label ofs. */ - case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG); - if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue; - } - mrm = 4; - } else { - int *pl, n; - switch (action) { - case DASM_REL_LG: - case DASM_IMM_LG: - n = *p++; pl = D->lglabels + n; - /* Bkwd rel or global. */ - if (n <= 246) { CK(n>=10||*pl<0, RANGE_LG); CKPL(lg, LG); goto putrel; } - pl -= 246; n = *pl; - if (n < 0) n = 0; /* Start new chain for fwd rel if label exists. */ - goto linkrel; - case DASM_REL_PC: - case DASM_IMM_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC); - putrel: - n = *pl; - if (n < 0) { /* Label exists. Get label pos and store it. 
*/ - b[pos] = -n; - } else { - linkrel: - b[pos] = n; /* Else link to rel chain, anchored at label. */ - *pl = pos; - } - pos++; - ofs += 4; /* Maximum offset needed. */ - if (action == DASM_REL_LG || action == DASM_REL_PC) - b[pos++] = ofs; /* Store pass1 offset estimate. */ - break; - case DASM_LABEL_LG: pl = D->lglabels + *p++; CKPL(lg, LG); goto putlabel; - case DASM_LABEL_PC: pl = D->pclabels + va_arg(ap, int); CKPL(pc, PC); - putlabel: - n = *pl; /* n > 0: Collapse rel chain and replace with label pos. */ - while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = pos; } - *pl = -pos; /* Label exists now. */ - b[pos++] = ofs; /* Store pass1 offset estimate. */ - break; - case DASM_ALIGN: - ofs += *p++; /* Maximum alignment needed (arg is 2**n-1). */ - b[pos++] = ofs; /* Store pass1 offset estimate. */ - break; - case DASM_EXTERN: p += 2; ofs += 4; break; - case DASM_ESC: p++; ofs++; break; - case DASM_MARK: mrm = p[-2]; break; - case DASM_SECTION: - n = *p; CK(n < D->maxsection, RANGE_SEC); D->section = &D->sections[n]; - case DASM_STOP: goto stop; - } - } - } -stop: - va_end(ap); - sec->pos = pos; - sec->ofs = ofs; -} -#undef CK - -/* Pass 2: Link sections, shrink branches/aligns, fix label offsets. */ -int dasm_link(Dst_DECL, size_t *szp) -{ - dasm_State *D = Dst_REF; - int secnum; - int ofs = 0; - -#ifdef DASM_CHECKS - *szp = 0; - if (D->status != DASM_S_OK) return D->status; - { - int pc; - for (pc = 0; pc*sizeof(int) < D->pcsize; pc++) - if (D->pclabels[pc] > 0) return DASM_S_UNDEF_PC|pc; - } -#endif - - { /* Handle globals not defined in this translation unit. */ - int idx; - for (idx = 10; idx*sizeof(int) < D->lgsize; idx++) { - int n = D->lglabels[idx]; - /* Undefined label: Collapse rel chain and replace with marker (< 0). */ - while (n > 0) { int *pb = DASM_POS2PTR(D, n); n = *pb; *pb = -idx; } - } - } - - /* Combine all code sections. No support for data sections (yet). 
*/ - for (secnum = 0; secnum < D->maxsection; secnum++) { - dasm_Section *sec = D->sections + secnum; - int *b = sec->rbuf; - int pos = DASM_SEC2POS(secnum); - int lastpos = sec->pos; - - while (pos != lastpos) { - dasm_ActList p = D->actionlist + b[pos++]; - while (1) { - int op, action = *p++; - switch (action) { - case DASM_REL_LG: p++; op = p[-3]; goto rel_pc; - case DASM_REL_PC: op = p[-2]; rel_pc: { - int shrink = op == 0xe9 ? 3 : ((op&0xf0) == 0x80 ? 4 : 0); - if (shrink) { /* Shrinkable branch opcode? */ - int lofs, lpos = b[pos]; - if (lpos < 0) goto noshrink; /* Ext global? */ - lofs = *DASM_POS2PTR(D, lpos); - if (lpos > pos) { /* Fwd label: add cumulative section offsets. */ - int i; - for (i = secnum; i < DASM_POS2SEC(lpos); i++) - lofs += D->sections[i].ofs; - } else { - lofs -= ofs; /* Bkwd label: unfix offset. */ - } - lofs -= b[pos+1]; /* Short branch ok? */ - if (lofs >= -128-shrink && lofs <= 127) ofs -= shrink; /* Yes. */ - else { noshrink: shrink = 0; } /* No, cannot shrink op. */ - } - b[pos+1] = shrink; - pos += 2; - break; - } - case DASM_SPACE: case DASM_IMM_LG: case DASM_VREG: p++; - case DASM_DISP: case DASM_IMM_S: case DASM_IMM_B: case DASM_IMM_W: - case DASM_IMM_D: case DASM_IMM_WB: case DASM_IMM_DB: - case DASM_SETLABEL: case DASM_REL_A: case DASM_IMM_PC: pos++; break; - case DASM_LABEL_LG: p++; - case DASM_LABEL_PC: b[pos++] += ofs; break; /* Fix label offset. */ - case DASM_ALIGN: ofs -= (b[pos++]+ofs)&*p++; break; /* Adjust ofs. */ - case DASM_EXTERN: p += 2; break; - case DASM_ESC: p++; break; - case DASM_MARK: break; - case DASM_SECTION: case DASM_STOP: goto stop; - } - } - stop: (void)0; - } - ofs += sec->ofs; /* Next section starts right after current section. 
*/ - } - - D->codesize = ofs; /* Total size of all code sections */ - *szp = ofs; - return DASM_S_OK; -} - -#define dasmb(x) *cp++ = (unsigned char)(x) -#ifndef DASM_ALIGNED_WRITES -#define dasmw(x) \ - do { *((unsigned short *)cp) = (unsigned short)(x); cp+=2; } while (0) -#define dasmd(x) \ - do { *((unsigned int *)cp) = (unsigned int)(x); cp+=4; } while (0) -#else -#define dasmw(x) do { dasmb(x); dasmb((x)>>8); } while (0) -#define dasmd(x) do { dasmw(x); dasmw((x)>>16); } while (0) -#endif - -/* Pass 3: Encode sections. */ -int dasm_encode(Dst_DECL, void *buffer) -{ - dasm_State *D = Dst_REF; - unsigned char *base = (unsigned char *)buffer; - unsigned char *cp = base; - int secnum; - - /* Encode all code sections. No support for data sections (yet). */ - for (secnum = 0; secnum < D->maxsection; secnum++) { - dasm_Section *sec = D->sections + secnum; - int *b = sec->buf; - int *endb = sec->rbuf + sec->pos; - - while (b != endb) { - dasm_ActList p = D->actionlist + *b++; - unsigned char *mark = NULL; - while (1) { - int action = *p++; - int n = (action >= DASM_DISP && action <= DASM_ALIGN) ? 
*b++ : 0; - switch (action) { - case DASM_DISP: if (!mark) mark = cp; { - unsigned char *mm = mark; - if (*p != DASM_IMM_DB && *p != DASM_IMM_WB) mark = NULL; - if (n == 0) { int mrm = mm[-1]&7; if (mrm == 4) mrm = mm[0]&7; - if (mrm != 5) { mm[-1] -= 0x80; break; } } - if (((n+128) & -256) != 0) goto wd; else mm[-1] -= 0x40; - } - case DASM_IMM_S: case DASM_IMM_B: wb: dasmb(n); break; - case DASM_IMM_DB: if (((n+128)&-256) == 0) { - db: if (!mark) mark = cp; mark[-2] += 2; mark = NULL; goto wb; - } else mark = NULL; - case DASM_IMM_D: wd: dasmd(n); break; - case DASM_IMM_WB: if (((n+128)&-256) == 0) goto db; else mark = NULL; - case DASM_IMM_W: dasmw(n); break; - case DASM_VREG: { int t = *p++; if (t >= 2) n<<=3; cp[-1] |= n; break; } - case DASM_REL_LG: p++; if (n >= 0) goto rel_pc; - b++; n = (int)(ptrdiff_t)D->globals[-n]; - case DASM_REL_A: rel_a: n -= (int)(ptrdiff_t)(cp+4); goto wd; /* !x64 */ - case DASM_REL_PC: rel_pc: { - int shrink = *b++; - int *pb = DASM_POS2PTR(D, n); if (*pb < 0) { n = pb[1]; goto rel_a; } - n = *pb - ((int)(cp-base) + 4-shrink); - if (shrink == 0) goto wd; - if (shrink == 4) { cp--; cp[-1] = *cp-0x10; } else cp[-1] = 0xeb; - goto wb; - } - case DASM_IMM_LG: - p++; if (n < 0) { n = (int)(ptrdiff_t)D->globals[-n]; goto wd; } - case DASM_IMM_PC: { - int *pb = DASM_POS2PTR(D, n); - n = *pb < 0 ? pb[1] : (*pb + (int)(ptrdiff_t)base); - goto wd; - } - case DASM_LABEL_LG: { - int idx = *p++; - if (idx >= 10) - D->globals[idx] = (void *)(base + (*p == DASM_SETLABEL ? 
*b : n)); - break; - } - case DASM_LABEL_PC: case DASM_SETLABEL: break; - case DASM_SPACE: { int fill = *p++; while (n--) *cp++ = fill; break; } - case DASM_ALIGN: - n = *p++; - while (((cp-base) & n)) *cp++ = 0x90; /* nop */ - break; - case DASM_EXTERN: n = DASM_EXTERN(Dst, cp, p[1], *p); p += 2; goto wd; - case DASM_MARK: mark = cp; break; - case DASM_ESC: action = *p++; - default: *cp++ = action; break; - case DASM_SECTION: case DASM_STOP: goto stop; - } - } - stop: (void)0; - } - } - - if (base + D->codesize != cp) /* Check for phase errors. */ - return DASM_S_PHASE; - return DASM_S_OK; -} - -/* Get PC label offset. */ -int dasm_getpclabel(Dst_DECL, unsigned int pc) -{ - dasm_State *D = Dst_REF; - if (pc*sizeof(int) < D->pcsize) { - int pos = D->pclabels[pc]; - if (pos < 0) return *DASM_POS2PTR(D, -pos); - if (pos > 0) return -1; /* Undefined. */ - } - return -2; /* Unused or out of range. */ -} - -#ifdef DASM_CHECKS -/* Optional sanity checker to call between isolated encoding steps. 
*/ -int dasm_checkstep(Dst_DECL, int secmatch) -{ - dasm_State *D = Dst_REF; - if (D->status == DASM_S_OK) { - int i; - for (i = 1; i <= 9; i++) { - if (D->lglabels[i] > 0) { D->status = DASM_S_UNDEF_L|i; break; } - D->lglabels[i] = 0; - } - } - if (D->status == DASM_S_OK && secmatch >= 0 && - D->section != &D->sections[secmatch]) - D->status = DASM_S_MATCH_SEC|(int)(D->section-D->sections); - return D->status; -} -#endif - diff --git a/lib/luajit/dynasm/dasm_x86.lua b/lib/luajit/dynasm/dasm_x86.lua index 7ca061d22f..60f5211a33 100644 --- a/lib/luajit/dynasm/dasm_x86.lua +++ b/lib/luajit/dynasm/dasm_x86.lua @@ -11,9 +11,9 @@ local x64 = x64 local _info = { arch = x64 and "x64" or "x86", description = "DynASM x86/x64 module", - version = "1.3.0", - vernum = 10300, - release = "2011-05-05", + version = "1.4.0", + vernum = 10400, + release = "2015-10-18", author = "Mike Pall", license = "MIT", } @@ -27,9 +27,9 @@ local assert, unpack, setmetatable = assert, unpack or table.unpack, setmetatabl local _s = string local sub, format, byte, char = _s.sub, _s.format, _s.byte, _s.char local find, match, gmatch, gsub = _s.find, _s.match, _s.gmatch, _s.gsub -local concat, sort = table.concat, table.sort +local concat, sort, remove = table.concat, table.sort, table.remove local bit = bit or require("bit") -local band, shl, shr = bit.band, bit.lshift, bit.rshift +local band, bxor, shl, shr = bit.band, bit.bxor, bit.lshift, bit.rshift -- Inherited tables and callbacks. local g_opt, g_arch @@ -41,7 +41,7 @@ local action_names = { -- int arg, 1 buffer pos: "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB", -- action arg (1 byte), int arg, 1 buffer pos (reg/num): - "VREG", "SPACE", -- !x64: VREG support NYI. + "VREG", "SPACE", -- ptrdiff_t arg, 1 buffer pos (address): !x64 "SETLABEL", "REL_A", -- action arg (1 byte) or int arg, 2 buffer pos (link, offset): @@ -83,6 +83,21 @@ local actargs = { 0 } -- Current number of section buffer positions for dasm_put(). 
local secpos = 1 +-- VREG kind encodings, pre-shifted by 5 bits. +local map_vreg = { + ["modrm.rm.m"] = 0x00, + ["modrm.rm.r"] = 0x20, + ["opcode"] = 0x20, + ["sib.base"] = 0x20, + ["sib.index"] = 0x40, + ["modrm.reg"] = 0x80, + ["vex.v"] = 0xa0, + ["imm.hi"] = 0xc0, +} + +-- Current number of VREG actions contributing to REX/VEX shrinkage. +local vreg_shrink_count = 0 + ------------------------------------------------------------------------------ -- Compute action numbers for action names. @@ -134,6 +149,21 @@ local function waction(action, a, num) if a or num then secpos = secpos + (num or 1) end end +-- Optionally add a VREG action. +local function wvreg(kind, vreg, psz, sk, defer) + if not vreg then return end + waction("VREG", vreg) + local b = assert(map_vreg[kind], "bad vreg kind `"..vreg.."'") + if b < (sk or 0) then + vreg_shrink_count = vreg_shrink_count + 1 + end + if not defer then + b = b + vreg_shrink_count * 8 + vreg_shrink_count = 0 + end + wputxb(b + (psz or 0)) +end + -- Add call to embedded DynASM C code. local function wcall(func, args) wline(format("dasm_%s(Dst, %s);", func, concat(args, ", ")), true) @@ -299,7 +329,7 @@ local function mkrmap(sz, cl, names) local iname = format("@%s%x%s", sz, i, needrex and "R" or "") if needrex then map_reg_needrex[iname] = true end local name - if sz == "o" then name = format("xmm%d", i) + if sz == "o" or sz == "y" then name = format("%s%d", cl, i) elseif sz == "f" then name = format("st%d", i) else name = format("r%d%s", i, sz == addrsize and "" or sz) end map_archdef[name] = iname @@ -326,6 +356,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"}) mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"}) map_reg_valid_index[map_archdef.esp] = false if x64 then map_reg_valid_index[map_archdef.rsp] = false end +if x64 then map_reg_needrex[map_archdef.Rb] = true end map_archdef["Ra"] = "@"..addrsize -- FP registers (internally tword sized, but use "f" as operand size). 
@@ -334,21 +365,24 @@ mkrmap("f", "Rf") -- SSE registers (oword sized, but qword and dword accessible). mkrmap("o", "xmm") +-- AVX registers (yword sized, but oword, qword and dword accessible). +mkrmap("y", "ymm") + -- Operand size prefixes to codes. local map_opsize = { - byte = "b", word = "w", dword = "d", qword = "q", oword = "o", tword = "t", - aword = addrsize, + byte = "b", word = "w", dword = "d", qword = "q", oword = "o", yword = "y", + tword = "t", aword = addrsize, } -- Operand size code to number. local map_opsizenum = { - b = 1, w = 2, d = 4, q = 8, o = 16, t = 10, + b = 1, w = 2, d = 4, q = 8, o = 16, y = 32, t = 10, } -- Operand size code to name. local map_opsizename = { - b = "byte", w = "word", d = "dword", q = "qword", o = "oword", t = "tword", - f = "fpword", + b = "byte", w = "word", d = "dword", q = "qword", o = "oword", y = "yword", + t = "tword", f = "fpword", } -- Valid index register scale factors. @@ -460,9 +494,45 @@ local function wputszarg(sz, n) end -- Put multi-byte opcode with operand-size dependent modifications. 
-local function wputop(sz, op, rex) +local function wputop(sz, op, rex, vex, vregr, vregxb) + local psz, sk = 0, nil + if vex then + local tail + if vex.m == 1 and band(rex, 11) == 0 then + if x64 and vregxb then + sk = map_vreg["modrm.reg"] + else + wputb(0xc5) + tail = shl(bxor(band(rex, 4), 4), 5) + psz = 3 + end + end + if not tail then + wputb(0xc4) + wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m) + tail = shl(band(rex, 8), 4) + psz = 4 + end + local reg, vreg = 0, nil + if vex.v then + reg = vex.v.reg + if not reg then werror("bad vex operand") end + if reg < 0 then reg = 0; vreg = vex.v.vreg end + end + if sz == "y" or vex.l then tail = tail + 4 end + wputb(tail + shl(bxor(reg, 15), 3) + vex.p) + wvreg("vex.v", vreg) + rex = 0 + if op >= 256 then werror("bad vex opcode") end + else + if rex ~= 0 then + if not x64 then werror("bad operand size") end + elseif (vregr or vregxb) and x64 then + rex = 0x10 + sk = map_vreg["vex.v"] + end + end local r - if rex ~= 0 and not x64 then werror("bad operand size") end if sz == "w" then wputb(102) end -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end @@ -471,20 +541,20 @@ local function wputop(sz, op, rex) if rex ~= 0 then local opc3 = band(op, 0xffff00) if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then - wputb(64 + band(rex, 15)); rex = 0 + wputb(64 + band(rex, 15)); rex = 0; psz = 2 end end - wputb(shr(op, 16)); op = band(op, 0xffff) + wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1 end if op >= 256 then local b = shr(op, 8) - if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end - wputb(b) - op = band(op, 255) + if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end + wputb(b); op = band(op, 255); psz = psz + 1 end - if rex ~= 0 then wputb(64 + band(rex, 15)) end + if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end if sz == "b" then op = op - 1 end wputb(op) + return psz, sk end -- Put 
ModRM or SIB formatted byte. @@ -494,7 +564,7 @@ local function wputmodrm(m, s, rm, vs, vrm) end -- Put ModRM/SIB plus optional displacement. -local function wputmrmsib(t, imark, s, vsreg) +local function wputmrmsib(t, imark, s, vsreg, psz, sk) local vreg, vxreg local reg, xreg = t.reg, t.xreg if reg and reg < 0 then reg = 0; vreg = t.vreg end @@ -504,8 +574,8 @@ local function wputmrmsib(t, imark, s, vsreg) -- Register mode. if sub(t.mode, 1, 1) == "r" then wputmodrm(3, s, reg) - if vsreg then waction("VREG", vsreg); wputxb(2) end - if vreg then waction("VREG", vreg); wputxb(0) end + wvreg("modrm.reg", vsreg, psz+1, sk, vreg) + wvreg("modrm.rm.r", vreg, psz+1, sk) return end @@ -519,21 +589,22 @@ local function wputmrmsib(t, imark, s, vsreg) -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp) wputmodrm(0, s, 4) if imark == "I" then waction("MARK") end - if vsreg then waction("VREG", vsreg); wputxb(2) end + wvreg("modrm.reg", vsreg, psz+1, sk, vxreg) wputmodrm(t.xsc, xreg, 5) - if vxreg then waction("VREG", vxreg); wputxb(3) end + wvreg("sib.index", vxreg, psz+2, sk) else -- Pure 32 bit displacement. if x64 and tdisp ~= "table" then wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp) + wvreg("modrm.reg", vsreg, psz+1, sk) if imark == "I" then waction("MARK") end wputmodrm(0, 4, 5) else riprel = x64 wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp) + wvreg("modrm.reg", vsreg, psz+1, sk) if imark == "I" then waction("MARK") end end - if vsreg then waction("VREG", vsreg); wputxb(2) end end if riprel then -- Emit rip-relative displacement. if match("UWSiI", imark) then @@ -561,16 +632,16 @@ local function wputmrmsib(t, imark, s, vsreg) if xreg or band(reg, 7) == 4 then wputmodrm(m or 2, s, 4) -- ModRM. if m == nil or imark == "I" then waction("MARK") end - if vsreg then waction("VREG", vsreg); wputxb(2) end + wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg) wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB. 
- if vxreg then waction("VREG", vxreg); wputxb(3) end - if vreg then waction("VREG", vreg); wputxb(1) end + wvreg("sib.index", vxreg, psz+2, sk, vreg) + wvreg("sib.base", vreg, psz+2, sk) else wputmodrm(m or 2, s, reg) -- ModRM. if (imark == "I" and (m == 1 or m == 2)) or (m == nil and (vsreg or vreg)) then waction("MARK") end - if vsreg then waction("VREG", vsreg); wputxb(2) end - if vreg then waction("VREG", vreg); wputxb(1) end + wvreg("modrm.reg", vsreg, psz+1, sk, vreg) + wvreg("modrm.rm.m", vreg, psz+1, sk) end -- Put displacement. @@ -881,9 +952,15 @@ end -- "m"/"M" generates ModRM/SIB from the 1st/2nd operand. -- The spare 3 bits are either filled with the last hex digit or -- the result from a previous "r"/"R". The opcode is restored. +-- "u" Use VEX encoding, vvvv unused. +-- "v"/"V" Use VEX encoding, vvvv from 1st/2nd operand (the operand is +-- removed from the list used by future characters). +-- "L" Force VEX.L -- -- All of the following characters force a flush of the opcode: -- "o"/"O" stores a pure 32 bit disp (offset) from the 1st/2nd operand. +-- "s" stores a 4 bit immediate from the last register operand, +-- followed by 4 zero bits. -- "S" stores a signed 8 bit immediate from the last operand. -- "U" stores an unsigned 8 bit immediate from the last operand. -- "W" stores an unsigned 16 bit immediate from the last operand. 
@@ -1081,10 +1158,11 @@ local map_op = { btr_2 = "mrqdw:0FB3Rm|miqdw:0FBA6mU", bts_2 = "mrqdw:0FABRm|miqdw:0FBA5mU", - shld_3 = "mriqdw:0FA4RmU|mrCqdw:0FA5Rm", - shrd_3 = "mriqdw:0FACRmU|mrCqdw:0FADRm", + shld_3 = "mriqdw:0FA4RmU|mrC/qq:0FA5Rm|mrC/dd:|mrC/ww:", + shrd_3 = "mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:", rdtsc_0 = "0F31", -- P1+ + rdpmc_0 = "0F33", -- P6+ cpuid_0 = "0FA2", -- P1+ -- floating point ops @@ -1190,7 +1268,7 @@ local map_op = { cvtsi2sd_2 = "rm/od:F20F2ArM|rm/oq:F20F2ArXM", cvtsi2ss_2 = "rm/od:F30F2ArM|rm/oq:F30F2ArXM", cvtss2sd_2 = "rro:F30F5ArM|rx/od:", - cvtss2si_2 = "rr/do:F20F2CrM|rr/qo:|rxd:|rx/qd:", + cvtss2si_2 = "rr/do:F30F2DrM|rr/qo:|rxd:|rx/qd:", cvttpd2dq_2 = "rmo:660FE6rM", cvttps2dq_2 = "rmo:F30F5BrM", cvttsd2si_2 = "rr/do:F20F2CrM|rr/qo:|rx/dq:|rxq:", @@ -1225,46 +1303,14 @@ local map_op = { movups_2 = "rmo:0F10rM|mro:0F11Rm", orpd_2 = "rmo:660F56rM", orps_2 = "rmo:0F56rM", - packssdw_2 = "rmo:660F6BrM", - packsswb_2 = "rmo:660F63rM", - packuswb_2 = "rmo:660F67rM", - paddb_2 = "rmo:660FFCrM", - paddd_2 = "rmo:660FFErM", - paddq_2 = "rmo:660FD4rM", - paddsb_2 = "rmo:660FECrM", - paddsw_2 = "rmo:660FEDrM", - paddusb_2 = "rmo:660FDCrM", - paddusw_2 = "rmo:660FDDrM", - paddw_2 = "rmo:660FFDrM", - pand_2 = "rmo:660FDBrM", - pandn_2 = "rmo:660FDFrM", pause_0 = "F390", - pavgb_2 = "rmo:660FE0rM", - pavgw_2 = "rmo:660FE3rM", - pcmpeqb_2 = "rmo:660F74rM", - pcmpeqd_2 = "rmo:660F76rM", - pcmpeqw_2 = "rmo:660F75rM", - pcmpgtb_2 = "rmo:660F64rM", - pcmpgtd_2 = "rmo:660F66rM", - pcmpgtw_2 = "rmo:660F65rM", - pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nrMU", -- Mem op: SSE4.1 only. + pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. 
pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", - pmaddwd_2 = "rmo:660FF5rM", - pmaxsw_2 = "rmo:660FEErM", - pmaxub_2 = "rmo:660FDErM", - pminsw_2 = "rmo:660FEArM", - pminub_2 = "rmo:660FDArM", pmovmskb_2 = "rr/do:660FD7rM", - pmulhuw_2 = "rmo:660FE4rM", - pmulhw_2 = "rmo:660FE5rM", - pmullw_2 = "rmo:660FD5rM", - pmuludq_2 = "rmo:660FF4rM", - por_2 = "rmo:660FEBrM", prefetchnta_1 = "xb:n0F180m", prefetcht0_1 = "xb:n0F181m", prefetcht1_1 = "xb:n0F182m", prefetcht2_1 = "xb:n0F183m", - psadbw_2 = "rmo:660FF6rM", pshufd_3 = "rmio:660F70rMU", pshufhw_3 = "rmio:F30F70rMU", pshuflw_3 = "rmio:F20F70rMU", @@ -1278,23 +1324,6 @@ local map_op = { psrldq_2 = "rio:660F733mU", psrlq_2 = "rmo:660FD3rM|rio:660F732mU", psrlw_2 = "rmo:660FD1rM|rio:660F712mU", - psubb_2 = "rmo:660FF8rM", - psubd_2 = "rmo:660FFArM", - psubq_2 = "rmo:660FFBrM", - psubsb_2 = "rmo:660FE8rM", - psubsw_2 = "rmo:660FE9rM", - psubusb_2 = "rmo:660FD8rM", - psubusw_2 = "rmo:660FD9rM", - psubw_2 = "rmo:660FF9rM", - punpckhbw_2 = "rmo:660F68rM", - punpckhdq_2 = "rmo:660F6ArM", - punpckhqdq_2 = "rmo:660F6DrM", - punpckhwd_2 = "rmo:660F69rM", - punpcklbw_2 = "rmo:660F60rM", - punpckldq_2 = "rmo:660F62rM", - punpcklqdq_2 = "rmo:660F6CrM", - punpcklwd_2 = "rmo:660F61rM", - pxor_2 = "rmo:660FEFrM", rcpps_2 = "rmo:0F53rM", rcpss_2 = "rro:F30F53rM|rx/od:", rsqrtps_2 = "rmo:0F52rM", @@ -1352,7 +1381,7 @@ local map_op = { dpps_3 = "rmio:660F3A40rMU", extractps_3 = "mri/do:660F3A17RmU|rri/qo:660F3A17RXmU", insertps_3 = "rrio:660F3A41rMU|rxi/od:", - movntdqa_2 = "rmo:660F382ArM", + movntdqa_2 = "rxo:660F382ArM", mpsadbw_3 = "rmio:660F3A42rMU", packusdw_2 = "rmo:660F382BrM", pblendvb_3 = "rmRo:660F3810rM", @@ -1412,6 +1441,238 @@ local map_op = { movntsd_2 = "xr/qo:nF20F2BRm", movntss_2 = "xr/do:F30F2BRm", -- popcnt is also in SSE4.2 + + -- AES-NI + aesdec_2 = "rmo:660F38DErM", + aesdeclast_2 = "rmo:660F38DFrM", + aesenc_2 = "rmo:660F38DCrM", + aesenclast_2 = "rmo:660F38DDrM", + aesimc_2 = "rmo:660F38DBrM", + aeskeygenassist_3 = 
"rmio:660F3ADFrMU", + pclmulqdq_3 = "rmio:660F3A44rMU", + + -- AVX FP ops + vaddsubpd_3 = "rrmoy:660FVD0rM", + vaddsubps_3 = "rrmoy:F20FVD0rM", + vandpd_3 = "rrmoy:660FV54rM", + vandps_3 = "rrmoy:0FV54rM", + vandnpd_3 = "rrmoy:660FV55rM", + vandnps_3 = "rrmoy:0FV55rM", + vblendpd_4 = "rrmioy:660F3AV0DrMU", + vblendps_4 = "rrmioy:660F3AV0CrMU", + vblendvpd_4 = "rrmroy:660F3AV4BrMs", + vblendvps_4 = "rrmroy:660F3AV4ArMs", + vbroadcastf128_2 = "rx/yo:660F38u1ArM", + vcmppd_4 = "rrmioy:660FVC2rMU", + vcmpps_4 = "rrmioy:0FVC2rMU", + vcmpsd_4 = "rrrio:F20FVC2rMU|rrxi/ooq:", + vcmpss_4 = "rrrio:F30FVC2rMU|rrxi/ood:", + vcomisd_2 = "rro:660Fu2FrM|rx/oq:", + vcomiss_2 = "rro:0Fu2FrM|rx/od:", + vcvtdq2pd_2 = "rro:F30FuE6rM|rx/oq:|rm/yo:", + vcvtdq2ps_2 = "rmoy:0Fu5BrM", + vcvtpd2dq_2 = "rmoy:F20FuE6rM", + vcvtpd2ps_2 = "rmoy:660Fu5ArM", + vcvtps2dq_2 = "rmoy:660Fu5BrM", + vcvtps2pd_2 = "rro:0Fu5ArM|rx/oq:|rm/yo:", + vcvtsd2si_2 = "rr/do:F20Fu2DrM|rx/dq:|rr/qo:|rxq:", + vcvtsd2ss_3 = "rrro:F20FV5ArM|rrx/ooq:", + vcvtsi2sd_3 = "rrm/ood:F20FV2ArM|rrm/ooq:F20FVX2ArM", + vcvtsi2ss_3 = "rrm/ood:F30FV2ArM|rrm/ooq:F30FVX2ArM", + vcvtss2sd_3 = "rrro:F30FV5ArM|rrx/ood:", + vcvtss2si_2 = "rr/do:F30Fu2DrM|rxd:|rr/qo:|rx/qd:", + vcvttpd2dq_2 = "rmo:660FuE6rM|rm/oy:660FuLE6rM", + vcvttps2dq_2 = "rmoy:F30Fu5BrM", + vcvttsd2si_2 = "rr/do:F20Fu2CrM|rx/dq:|rr/qo:|rxq:", + vcvttss2si_2 = "rr/do:F30Fu2CrM|rxd:|rr/qo:|rx/qd:", + vdppd_4 = "rrmio:660F3AV41rMU", + vdpps_4 = "rrmioy:660F3AV40rMU", + vextractf128_3 = "mri/oy:660F3AuL19RmU", + vextractps_3 = "mri/do:660F3Au17RmU", + vhaddpd_3 = "rrmoy:660FV7CrM", + vhaddps_3 = "rrmoy:F20FV7CrM", + vhsubpd_3 = "rrmoy:660FV7DrM", + vhsubps_3 = "rrmoy:F20FV7DrM", + vinsertf128_4 = "rrmi/yyo:660F3AV18rMU", + vinsertps_4 = "rrrio:660F3AV21rMU|rrxi/ood:", + vldmxcsr_1 = "xd:0FuAE2m", + vmaskmovps_3 = "rrxoy:660F38V2CrM|xrroy:660F38V2ERm", + vmaskmovpd_3 = "rrxoy:660F38V2DrM|xrroy:660F38V2FRm", + vmovapd_2 = "rmoy:660Fu28rM|mroy:660Fu29Rm", + vmovaps_2 = 
"rmoy:0Fu28rM|mroy:0Fu29Rm", + vmovd_2 = "rm/od:660Fu6ErM|rm/oq:660FuX6ErM|mr/do:660Fu7ERm|mr/qo:", + vmovq_2 = "rro:F30Fu7ErM|rx/oq:|xr/qo:660FuD6Rm", + vmovddup_2 = "rmy:F20Fu12rM|rro:|rx/oq:", + vmovhlps_3 = "rrro:0FV12rM", + vmovhpd_2 = "xr/qo:660Fu17Rm", + vmovhpd_3 = "rrx/ooq:660FV16rM", + vmovhps_2 = "xr/qo:0Fu17Rm", + vmovhps_3 = "rrx/ooq:0FV16rM", + vmovlhps_3 = "rrro:0FV16rM", + vmovlpd_2 = "xr/qo:660Fu13Rm", + vmovlpd_3 = "rrx/ooq:660FV12rM", + vmovlps_2 = "xr/qo:0Fu13Rm", + vmovlps_3 = "rrx/ooq:0FV12rM", + vmovmskpd_2 = "rr/do:660Fu50rM|rr/dy:660FuL50rM", + vmovmskps_2 = "rr/do:0Fu50rM|rr/dy:0FuL50rM", + vmovntpd_2 = "xroy:660Fu2BRm", + vmovntps_2 = "xroy:0Fu2BRm", + vmovsd_2 = "rx/oq:F20Fu10rM|xr/qo:F20Fu11Rm", + vmovsd_3 = "rrro:F20FV10rM", + vmovshdup_2 = "rmoy:F30Fu16rM", + vmovsldup_2 = "rmoy:F30Fu12rM", + vmovss_2 = "rx/od:F30Fu10rM|xr/do:F30Fu11Rm", + vmovss_3 = "rrro:F30FV10rM", + vmovupd_2 = "rmoy:660Fu10rM|mroy:660Fu11Rm", + vmovups_2 = "rmoy:0Fu10rM|mroy:0Fu11Rm", + vorpd_3 = "rrmoy:660FV56rM", + vorps_3 = "rrmoy:0FV56rM", + vpermilpd_3 = "rrmoy:660F38V0DrM|rmioy:660F3Au05rMU", + vpermilps_3 = "rrmoy:660F38V0CrM|rmioy:660F3Au04rMU", + vperm2f128_4 = "rrmiy:660F3AV06rMU", + vptestpd_2 = "rmoy:660F38u0FrM", + vptestps_2 = "rmoy:660F38u0ErM", + vrcpps_2 = "rmoy:0Fu53rM", + vrcpss_3 = "rrro:F30FV53rM|rrx/ood:", + vrsqrtps_2 = "rmoy:0Fu52rM", + vrsqrtss_3 = "rrro:F30FV52rM|rrx/ood:", + vroundpd_3 = "rmioy:660F3AV09rMU", + vroundps_3 = "rmioy:660F3AV08rMU", + vroundsd_4 = "rrrio:660F3AV0BrMU|rrxi/ooq:", + vroundss_4 = "rrrio:660F3AV0ArMU|rrxi/ood:", + vshufpd_4 = "rrmioy:660FVC6rMU", + vshufps_4 = "rrmioy:0FVC6rMU", + vsqrtps_2 = "rmoy:0Fu51rM", + vsqrtss_2 = "rro:F30Fu51rM|rx/od:", + vsqrtpd_2 = "rmoy:660Fu51rM", + vsqrtsd_2 = "rro:F20Fu51rM|rx/oq:", + vstmxcsr_1 = "xd:0FuAE3m", + vucomisd_2 = "rro:660Fu2ErM|rx/oq:", + vucomiss_2 = "rro:0Fu2ErM|rx/od:", + vunpckhpd_3 = "rrmoy:660FV15rM", + vunpckhps_3 = "rrmoy:0FV15rM", + vunpcklpd_3 = 
"rrmoy:660FV14rM", + vunpcklps_3 = "rrmoy:0FV14rM", + vxorpd_3 = "rrmoy:660FV57rM", + vxorps_3 = "rrmoy:0FV57rM", + vzeroall_0 = "0FuL77", + vzeroupper_0 = "0Fu77", + + -- AVX2 FP ops + vbroadcastss_2 = "rx/od:660F38u18rM|rx/yd:|rro:|rr/yo:", + vbroadcastsd_2 = "rx/yq:660F38u19rM|rr/yo:", + -- *vgather* (!vsib) + vpermpd_3 = "rmiy:660F3AuX01rMU", + vpermps_3 = "rrmy:660F38V16rM", + + -- AVX, AVX2 integer ops + -- In general, xmm requires AVX, ymm requires AVX2. + vaesdec_3 = "rrmo:660F38VDErM", + vaesdeclast_3 = "rrmo:660F38VDFrM", + vaesenc_3 = "rrmo:660F38VDCrM", + vaesenclast_3 = "rrmo:660F38VDDrM", + vaesimc_2 = "rmo:660F38uDBrM", + vaeskeygenassist_3 = "rmio:660F3AuDFrMU", + vlddqu_2 = "rxoy:F20FuF0rM", + vmaskmovdqu_2 = "rro:660FuF7rM", + vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm", + vmovdqu_2 = "rmoy:F30Fu6FrM|mroy:F30Fu7FRm", + vmovntdq_2 = "xroy:660FuE7Rm", + vmovntdqa_2 = "rxoy:660F38u2ArM", + vmpsadbw_4 = "rrmioy:660F3AV42rMU", + vpabsb_2 = "rmoy:660F38u1CrM", + vpabsd_2 = "rmoy:660F38u1ErM", + vpabsw_2 = "rmoy:660F38u1DrM", + vpackusdw_3 = "rrmoy:660F38V2BrM", + vpalignr_4 = "rrmioy:660F3AV0FrMU", + vpblendvb_4 = "rrmroy:660F3AV4CrMs", + vpblendw_4 = "rrmioy:660F3AV0ErMU", + vpclmulqdq_4 = "rrmio:660F3AV44rMU", + vpcmpeqq_3 = "rrmoy:660F38V29rM", + vpcmpestri_3 = "rmio:660F3Au61rMU", + vpcmpestrm_3 = "rmio:660F3Au60rMU", + vpcmpgtq_3 = "rrmoy:660F38V37rM", + vpcmpistri_3 = "rmio:660F3Au63rMU", + vpcmpistrm_3 = "rmio:660F3Au62rMU", + vpextrb_3 = "rri/do:660F3Au14nRmU|rri/qo:|xri/bo:", + vpextrw_3 = "rri/do:660FuC5rMU|xri/wo:660F3Au15nRmU", + vpextrd_3 = "mri/do:660F3Au16RmU", + vpextrq_3 = "mri/qo:660F3Au16RmU", + vphaddw_3 = "rrmoy:660F38V01rM", + vphaddd_3 = "rrmoy:660F38V02rM", + vphaddsw_3 = "rrmoy:660F38V03rM", + vphminposuw_2 = "rmo:660F38u41rM", + vphsubw_3 = "rrmoy:660F38V05rM", + vphsubd_3 = "rrmoy:660F38V06rM", + vphsubsw_3 = "rrmoy:660F38V07rM", + vpinsrb_4 = "rrri/ood:660F3AV20rMU|rrxi/oob:", + vpinsrw_4 = "rrri/ood:660FVC4rMU|rrxi/oow:", + 
vpinsrd_4 = "rrmi/ood:660F3AV22rMU", + vpinsrq_4 = "rrmi/ooq:660F3AVX22rMU", + vpmaddubsw_3 = "rrmoy:660F38V04rM", + vpmaxsb_3 = "rrmoy:660F38V3CrM", + vpmaxsd_3 = "rrmoy:660F38V3DrM", + vpmaxuw_3 = "rrmoy:660F38V3ErM", + vpmaxud_3 = "rrmoy:660F38V3FrM", + vpminsb_3 = "rrmoy:660F38V38rM", + vpminsd_3 = "rrmoy:660F38V39rM", + vpminuw_3 = "rrmoy:660F38V3ArM", + vpminud_3 = "rrmoy:660F38V3BrM", + vpmovmskb_2 = "rr/do:660FuD7rM|rr/dy:660FuLD7rM", + vpmovsxbw_2 = "rroy:660F38u20rM|rx/oq:|rx/yo:", + vpmovsxbd_2 = "rroy:660F38u21rM|rx/od:|rx/yq:", + vpmovsxbq_2 = "rroy:660F38u22rM|rx/ow:|rx/yd:", + vpmovsxwd_2 = "rroy:660F38u23rM|rx/oq:|rx/yo:", + vpmovsxwq_2 = "rroy:660F38u24rM|rx/od:|rx/yq:", + vpmovsxdq_2 = "rroy:660F38u25rM|rx/oq:|rx/yo:", + vpmovzxbw_2 = "rroy:660F38u30rM|rx/oq:|rx/yo:", + vpmovzxbd_2 = "rroy:660F38u31rM|rx/od:|rx/yq:", + vpmovzxbq_2 = "rroy:660F38u32rM|rx/ow:|rx/yd:", + vpmovzxwd_2 = "rroy:660F38u33rM|rx/oq:|rx/yo:", + vpmovzxwq_2 = "rroy:660F38u34rM|rx/od:|rx/yq:", + vpmovzxdq_2 = "rroy:660F38u35rM|rx/oq:|rx/yo:", + vpmuldq_3 = "rrmoy:660F38V28rM", + vpmulhrsw_3 = "rrmoy:660F38V0BrM", + vpmulld_3 = "rrmoy:660F38V40rM", + vpshufb_3 = "rrmoy:660F38V00rM", + vpshufd_3 = "rmioy:660Fu70rMU", + vpshufhw_3 = "rmioy:F30Fu70rMU", + vpshuflw_3 = "rmioy:F20Fu70rMU", + vpsignb_3 = "rrmoy:660F38V08rM", + vpsignw_3 = "rrmoy:660F38V09rM", + vpsignd_3 = "rrmoy:660F38V0ArM", + vpslldq_3 = "rrioy:660Fv737mU", + vpsllw_3 = "rrmoy:660FVF1rM|rrioy:660Fv716mU", + vpslld_3 = "rrmoy:660FVF2rM|rrioy:660Fv726mU", + vpsllq_3 = "rrmoy:660FVF3rM|rrioy:660Fv736mU", + vpsraw_3 = "rrmoy:660FVE1rM|rrioy:660Fv714mU", + vpsrad_3 = "rrmoy:660FVE2rM|rrioy:660Fv724mU", + vpsrldq_3 = "rrioy:660Fv733mU", + vpsrlw_3 = "rrmoy:660FVD1rM|rrioy:660Fv712mU", + vpsrld_3 = "rrmoy:660FVD2rM|rrioy:660Fv722mU", + vpsrlq_3 = "rrmoy:660FVD3rM|rrioy:660Fv732mU", + vptest_2 = "rmoy:660F38u17rM", + + -- AVX2 integer ops + vbroadcasti128_2 = "rx/yo:660F38u5ArM", + vinserti128_4 = "rrmi/yyo:660F3AV38rMU", 
+ vextracti128_3 = "mri/oy:660F3AuL39RmU", + vpblendd_4 = "rrmioy:660F3AV02rMU", + vpbroadcastb_2 = "rro:660F38u78rM|rx/ob:|rr/yo:|rx/yb:", + vpbroadcastw_2 = "rro:660F38u79rM|rx/ow:|rr/yo:|rx/yw:", + vpbroadcastd_2 = "rro:660F38u58rM|rx/od:|rr/yo:|rx/yd:", + vpbroadcastq_2 = "rro:660F38u59rM|rx/oq:|rr/yo:|rx/yq:", + vpermd_3 = "rrmy:660F38V36rM", + vpermq_3 = "rmiy:660F3AuX00rMU", + -- *vpgather* (!vsib) + vperm2i128_4 = "rrmiy:660F3AV46rMU", + vpmaskmovd_3 = "rrxoy:660F38V8CrM|xrroy:660F38V8ERm", + vpmaskmovq_3 = "rrxoy:660F38VX8CrM|xrroy:660F38VX8ERm", + vpsllvd_3 = "rrmoy:660F38V47rM", + vpsllvq_3 = "rrmoy:660F38VX47rM", + vpsravd_3 = "rrmoy:660F38V46rM", + vpsrlvd_3 = "rrmoy:660F38V45rM", + vpsrlvq_3 = "rrmoy:660F38VX45rM", } ------------------------------------------------------------------------------ @@ -1462,28 +1723,58 @@ for cc,n in pairs{ b=0, e=1, be=2, u=3, nb=4, ne=5, nbe=6, nu=7 } do map_op["fcmov"..cc.."_2"] = format("Fff:%04XR", nc) -- P6+ end --- SSE FP arithmetic ops. +-- SSE / AVX FP arithmetic ops. for name,n in pairs{ sqrt = 1, add = 8, mul = 9, sub = 12, min = 13, div = 14, max = 15 } do map_op[name.."ps_2"] = format("rmo:0F5%XrM", n) map_op[name.."ss_2"] = format("rro:F30F5%XrM|rx/od:", n) map_op[name.."pd_2"] = format("rmo:660F5%XrM", n) map_op[name.."sd_2"] = format("rro:F20F5%XrM|rx/oq:", n) + if n ~= 1 then + map_op["v"..name.."ps_3"] = format("rrmoy:0FV5%XrM", n) + map_op["v"..name.."ss_3"] = format("rrro:F30FV5%XrM|rrx/ood:", n) + map_op["v"..name.."pd_3"] = format("rrmoy:660FV5%XrM", n) + map_op["v"..name.."sd_3"] = format("rrro:F20FV5%XrM|rrx/ooq:", n) + end +end + +-- SSE2 / AVX / AVX2 integer arithmetic ops (66 0F leaf). 
+for name,n in pairs{ + paddb = 0xFC, paddw = 0xFD, paddd = 0xFE, paddq = 0xD4, + paddsb = 0xEC, paddsw = 0xED, packssdw = 0x6B, + packsswb = 0x63, packuswb = 0x67, paddusb = 0xDC, + paddusw = 0xDD, pand = 0xDB, pandn = 0xDF, pavgb = 0xE0, + pavgw = 0xE3, pcmpeqb = 0x74, pcmpeqd = 0x76, + pcmpeqw = 0x75, pcmpgtb = 0x64, pcmpgtd = 0x66, + pcmpgtw = 0x65, pmaddwd = 0xF5, pmaxsw = 0xEE, + pmaxub = 0xDE, pminsw = 0xEA, pminub = 0xDA, + pmulhuw = 0xE4, pmulhw = 0xE5, pmullw = 0xD5, + pmuludq = 0xF4, por = 0xEB, psadbw = 0xF6, psubb = 0xF8, + psubw = 0xF9, psubd = 0xFA, psubq = 0xFB, psubsb = 0xE8, + psubsw = 0xE9, psubusb = 0xD8, psubusw = 0xD9, + punpckhbw = 0x68, punpckhwd = 0x69, punpckhdq = 0x6A, + punpckhqdq = 0x6D, punpcklbw = 0x60, punpcklwd = 0x61, + punpckldq = 0x62, punpcklqdq = 0x6C, pxor = 0xEF +} do + map_op[name.."_2"] = format("rmo:660F%02XrM", n) + map_op["v"..name.."_3"] = format("rrmoy:660FV%02XrM", n) end ------------------------------------------------------------------------------ +local map_vexarg = { u = false, v = 1, V = 2 } + -- Process pattern string. local function dopattern(pat, args, sz, op, needrex) - local digit, addin + local digit, addin, vex local opcode = 0 local szov = sz local narg = 1 local rex = 0 -- Limit number of section buffer positions used by a single dasm_put(). - -- A single opcode needs a maximum of 5 positions. - if secpos+5 > maxsecpos then wflush() end + -- A single opcode needs a maximum of 6 positions. + if secpos+6 > maxsecpos then wflush() end -- Process each character. for c in gmatch(pat.."|", ".") do @@ -1497,6 +1788,8 @@ local function dopattern(pat, args, sz, op, needrex) szov = nil elseif c == "X" then -- Force REX.W. rex = 8 + elseif c == "L" then -- Force VEX.L. + vex.l = true elseif c == "r" then -- Merge 1st operand regno. into opcode. 
addin = args[1]; opcode = opcode + (addin.reg % 8) if narg < 2 then narg = 2 end @@ -1520,21 +1813,42 @@ local function dopattern(pat, args, sz, op, needrex) if t.xreg and t.xreg > 7 then rex = rex + 2 end if s > 7 then rex = rex + 4 end if needrex then rex = rex + 16 end - wputop(szov, opcode, rex); opcode = nil + local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg) + opcode = nil local imark = sub(pat, -1) -- Force a mark (ugly). -- Put ModRM/SIB with regno/last digit as spare. - wputmrmsib(t, imark, s, addin and addin.vreg) + wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk) addin = nil + elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix + local b = band(opcode, 255); opcode = shr(opcode, 8) + local m = 1 + if b == 0x38 then m = 2 + elseif b == 0x3a then m = 3 end + if m ~= 1 then b = band(opcode, 255); opcode = shr(opcode, 8) end + if b ~= 0x0f then + werror("expected `0F', `0F38', or `0F3A' to precede `"..c.. + "' in pattern `"..pat.."' for `"..op.."'") + end + local v = map_vexarg[c] + if v then v = remove(args, v) end + b = band(opcode, 255) + local p = 0 + if b == 0x66 then p = 1 + elseif b == 0xf3 then p = 2 + elseif b == 0xf2 then p = 3 end + if p ~= 0 then opcode = shr(opcode, 8) end + if opcode ~= 0 then wputop(nil, opcode, 0); opcode = 0 end + vex = { m = m, p = p, v = v } else if opcode then -- Flush opcode. 
if szov == "q" and rex == 0 then rex = rex + 8 end if needrex then rex = rex + 16 end if addin and addin.reg == -1 then - wputop(szov, opcode - 7, rex) - waction("VREG", addin.vreg); wputxb(0) + local psz, sk = wputop(szov, opcode - 7, rex, vex, true) + wvreg("opcode", addin.vreg, psz, sk) else if addin and addin.reg > 7 then rex = rex + 1 end - wputop(szov, opcode, rex) + wputop(szov, opcode, rex, vex) end opcode = nil end @@ -1571,6 +1885,14 @@ local function dopattern(pat, args, sz, op, needrex) else wputlabel("REL_", imm, 2) end + elseif c == "s" then + local reg = a.reg + if reg < 0 then + wputb(0) + wvreg("imm.hi", a.vreg) + else + wputb(shl(reg, 4)) + end else werror("bad char `"..c.."' in pattern `"..pat.."' for `"..op.."'") end @@ -1647,11 +1969,14 @@ map_op[".template__"] = function(params, template, nparams) if pat == "" then pat = lastpat else lastpat = pat end if matchtm(tm, args) then local prefix = sub(szm, 1, 1) - if prefix == "/" then -- Match both operand sizes. - if args[1].opsize == sub(szm, 2, 2) and - args[2].opsize == sub(szm, 3, 3) then - dopattern(pat, args, sz, params.op, needrex) -- Process pattern. - return + if prefix == "/" then -- Exactly match leading operand sizes. + for i = #szm,1,-1 do + if i == 1 then + dopattern(pat, args, sz, params.op, needrex) -- Process pattern. + return + elseif args[i-1].opsize ~= sub(szm, i, i) then + break + end end else -- Match common operand size. 
local szp = sz @@ -1716,8 +2041,8 @@ if x64 then rex = a.reg > 7 and 9 or 8 end end - wputop(sz, opcode, rex) - if vreg then waction("VREG", vreg); wputxb(0) end + local psz, sk = wputop(sz, opcode, rex, nil, vreg) + wvreg("opcode", vreg, psz, sk) waction("IMM_D", format("(unsigned int)(%s)", op64)) waction("IMM_D", format("(unsigned int)((%s)>>32)", op64)) end diff --git a/lib/luajit/dynasm/dynasm.lua b/lib/luajit/dynasm/dynasm.lua index fffda7513c..145fb0cc6d 100644 --- a/lib/luajit/dynasm/dynasm.lua +++ b/lib/luajit/dynasm/dynasm.lua @@ -10,9 +10,9 @@ local _info = { name = "DynASM", description = "A dynamic assembler for code generation engines", - version = "1.3.0", - vernum = 10300, - release = "2011-05-05", + version = "1.4.0", + vernum = 10400, + release = "2015-10-18", author = "Mike Pall", url = "http://luajit.org/dynasm.html", license = "MIT", diff --git a/lib/luajit/src/Makefile b/lib/luajit/src/Makefile index 532da6e94d..9845f6a0e2 100644 --- a/lib/luajit/src/Makefile +++ b/lib/luajit/src/Makefile @@ -24,11 +24,13 @@ NODOTABIVER= 51 # removing the '#' in front of them. Make sure you force a full recompile # with "make clean", followed by "make" if you change any options. # +DEFAULT_CC = gcc +# # LuaJIT builds as a native 32 or 64 bit binary by default. -CC= gcc +CC= $(DEFAULT_CC) # # Use this if you want to force a 32 bit build on a 64 bit multilib OS. -#CC= gcc -m32 +#CC= $(DEFAULT_CC) -m32 # # Since the assembler part does NOT maintain a frame pointer, it's pointless # to slow down the C part by not omitting it. Debugging, tracebacks and @@ -147,6 +149,29 @@ XCFLAGS= # You probably don't need to change anything below this line! ############################################################################## +############################################################################## +# Host system detection. 
+############################################################################## + +ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM)) + HOST_SYS= Windows + HOST_RM= del +else + HOST_SYS:= $(shell uname -s) + ifneq (,$(findstring MINGW,$(HOST_SYS))) + HOST_SYS= Windows + HOST_MSYS= mingw + endif + ifneq (,$(findstring CYGWIN,$(HOST_SYS))) + HOST_SYS= Windows + HOST_MSYS= cygwin + endif + # Use Clang for OSX host. + ifeq (Darwin,$(HOST_SYS)) + DEFAULT_CC= clang + endif +endif + ############################################################################## # Flags and options for host and target. ############################################################################## @@ -268,24 +293,9 @@ ifneq (,$(LMULTILIB)) endif ############################################################################## -# System detection. +# Target system detection. ############################################################################## -ifeq (Windows,$(findstring Windows,$(OS))$(MSYSTEM)$(TERM)) - HOST_SYS= Windows - HOST_RM= del -else - HOST_SYS:= $(shell uname -s) - ifneq (,$(findstring MINGW,$(HOST_SYS))) - HOST_SYS= Windows - HOST_MSYS= mingw - endif - ifneq (,$(findstring CYGWIN,$(HOST_SYS))) - HOST_SYS= Windows - HOST_MSYS= cygwin - endif -endif - TARGET_SYS?= $(HOST_SYS) ifeq (Windows,$(TARGET_SYS)) TARGET_STRIP+= --strip-unneeded diff --git a/lib/luajit/src/host/buildvm_asm.c b/lib/luajit/src/host/buildvm_asm.c index 9b7ae53a26..9b1194259a 100644 --- a/lib/luajit/src/host/buildvm_asm.c +++ b/lib/luajit/src/host/buildvm_asm.c @@ -261,11 +261,20 @@ void emit_asm(BuildCtx *ctx) #if LJ_TARGET_ARM && defined(__GNUC__) && !LJ_NO_UNWIND /* This should really be moved into buildvm_arm.dasc. 
*/ +#if LJ_ARCH_HASFPU + fprintf(ctx->fp, + ".fnstart\n" + ".save {r5, r6, r7, r8, r9, r10, r11, lr}\n" + ".vsave {d8-d15}\n" + ".save {r4}\n" + ".pad #28\n"); +#else fprintf(ctx->fp, ".fnstart\n" ".save {r4, r5, r6, r7, r8, r9, r10, r11, lr}\n" ".pad #28\n"); #endif +#endif #if LJ_TARGET_MIPS fprintf(ctx->fp, ".set nomips16\n.abicalls\n.set noreorder\n.set nomacro\n"); #endif diff --git a/lib/luajit/src/jit/dis_x86.lua b/lib/luajit/src/jit/dis_x86.lua index 6bc38066fe..a7c05ed6d5 100644 --- a/lib/luajit/src/jit/dis_x86.lua +++ b/lib/luajit/src/jit/dis_x86.lua @@ -15,13 +15,12 @@ -- Intel and AMD manuals. The supported instruction set is quite extensive -- and reflects what a current generation Intel or AMD CPU implements in -- 32 bit and 64 bit mode. Yes, this includes MMX, SSE, SSE2, SSE3, SSSE3, --- SSE4.1, SSE4.2, SSE4a and even privileged and hypervisor (VMX/SVM) --- instructions. +-- SSE4.1, SSE4.2, SSE4a, AVX, AVX2 and even privileged and hypervisor +-- (VMX/SVM) instructions. -- -- Notes: -- * The (useless) a16 prefix, 3DNow and pre-586 opcodes are unsupported. -- * No attempt at optimization has been made -- it's fast enough for my needs. --- * The public API may change when more architectures are added. 
------------------------------------------------------------------------------ local type = type @@ -78,7 +77,7 @@ local map_opc1_32 = { "movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi","movBRi", "movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI","movVRI", --Cx -"shift!Bmu","shift!Vmu","retBw","ret","$lesVrm","$ldsVrm","movBmi","movVmi", +"shift!Bmu","shift!Vmu","retBw","ret","vex*3$lesVrm","vex*2$ldsVrm","movBmi","movVmi", "enterBwu","leave","retfBw","retf","int3","intBu","into","iretVS", --Dx "shift!Bm1","shift!Vm1","shift!Bmc","shift!Vmc","aamBu","aadBu","salc","xlatb", @@ -103,7 +102,7 @@ local map_opc1_64 = setmetatable({ [0x44]="rex*r", [0x45]="rex*rb", [0x46]="rex*rx", [0x47]="rex*rxb", [0x48]="rex*w", [0x49]="rex*wb", [0x4a]="rex*wx", [0x4b]="rex*wxb", [0x4c]="rex*wr", [0x4d]="rex*wrb", [0x4e]="rex*wrx", [0x4f]="rex*wrxb", - [0x82]=false, [0x9a]=false, [0xc4]=false, [0xc5]=false, [0xce]=false, + [0x82]=false, [0x9a]=false, [0xc4]="vex*3", [0xc5]="vex*2", [0xce]=false, [0xd4]=false, [0xd5]=false, [0xd6]=false, [0xea]=false, }, { __index = map_opc1_32 }) @@ -114,12 +113,12 @@ local map_opc2 = { [0]="sldt!Dmp","sgdt!Ump","larVrm","lslVrm",nil,"syscall","clts","sysret", "invd","wbinvd",nil,"ud1",nil,"$prefetch!Bm","femms","3dnowMrmu", --1x -"movupsXrm|movssXrm|movupdXrm|movsdXrm", -"movupsXmr|movssXmr|movupdXmr|movsdXmr", +"movupsXrm|movssXrvm|movupdXrm|movsdXrvm", +"movupsXmr|movssXmvr|movupdXmr|movsdXmvr", "movhlpsXrm$movlpsXrm|movsldupXrm|movlpdXrm|movddupXrm", "movlpsXmr||movlpdXmr", -"unpcklpsXrm||unpcklpdXrm", -"unpckhpsXrm||unpckhpdXrm", +"unpcklpsXrvm||unpcklpdXrvm", +"unpckhpsXrvm||unpckhpdXrvm", "movlhpsXrm$movhpsXrm|movshdupXrm|movhpdXrm", "movhpsXmr||movhpdXmr", "$prefetcht!Bm","hintnopVm","hintnopVm","hintnopVm", @@ -128,7 +127,7 @@ local map_opc2 = { "movUmx$","movUmy$","movUxm$","movUym$","movUmz$",nil,"movUzm$",nil, "movapsXrm||movapdXrm", "movapsXmr||movapdXmr", -"cvtpi2psXrMm|cvtsi2ssXrVmt|cvtpi2pdXrMm|cvtsi2sdXrVmt", 
+"cvtpi2psXrMm|cvtsi2ssXrvVmt|cvtpi2pdXrMm|cvtsi2sdXrvVmt", "movntpsXmr|movntssXmr|movntpdXmr|movntsdXmr", "cvttps2piMrXm|cvttss2siVrXm|cvttpd2piMrXm|cvttsd2siVrXm", "cvtps2piMrXm|cvtss2siVrXm|cvtpd2piMrXm|cvtsd2siVrXm", @@ -144,27 +143,27 @@ local map_opc2 = { "cmovlVrm","cmovgeVrm","cmovleVrm","cmovgVrm", --5x "movmskpsVrXm$||movmskpdVrXm$","sqrtpsXrm|sqrtssXrm|sqrtpdXrm|sqrtsdXrm", -"rsqrtpsXrm|rsqrtssXrm","rcppsXrm|rcpssXrm", -"andpsXrm||andpdXrm","andnpsXrm||andnpdXrm", -"orpsXrm||orpdXrm","xorpsXrm||xorpdXrm", -"addpsXrm|addssXrm|addpdXrm|addsdXrm","mulpsXrm|mulssXrm|mulpdXrm|mulsdXrm", -"cvtps2pdXrm|cvtss2sdXrm|cvtpd2psXrm|cvtsd2ssXrm", +"rsqrtpsXrm|rsqrtssXrvm","rcppsXrm|rcpssXrvm", +"andpsXrvm||andpdXrvm","andnpsXrvm||andnpdXrvm", +"orpsXrvm||orpdXrvm","xorpsXrvm||xorpdXrvm", +"addpsXrvm|addssXrvm|addpdXrvm|addsdXrvm","mulpsXrvm|mulssXrvm|mulpdXrvm|mulsdXrvm", +"cvtps2pdXrm|cvtss2sdXrvm|cvtpd2psXrm|cvtsd2ssXrvm", "cvtdq2psXrm|cvttps2dqXrm|cvtps2dqXrm", -"subpsXrm|subssXrm|subpdXrm|subsdXrm","minpsXrm|minssXrm|minpdXrm|minsdXrm", -"divpsXrm|divssXrm|divpdXrm|divsdXrm","maxpsXrm|maxssXrm|maxpdXrm|maxsdXrm", +"subpsXrvm|subssXrvm|subpdXrvm|subsdXrvm","minpsXrvm|minssXrvm|minpdXrvm|minsdXrvm", +"divpsXrvm|divssXrvm|divpdXrvm|divsdXrvm","maxpsXrvm|maxssXrvm|maxpdXrvm|maxsdXrvm", --6x -"punpcklbwPrm","punpcklwdPrm","punpckldqPrm","packsswbPrm", -"pcmpgtbPrm","pcmpgtwPrm","pcmpgtdPrm","packuswbPrm", -"punpckhbwPrm","punpckhwdPrm","punpckhdqPrm","packssdwPrm", -"||punpcklqdqXrm","||punpckhqdqXrm", +"punpcklbwPrvm","punpcklwdPrvm","punpckldqPrvm","packsswbPrvm", +"pcmpgtbPrvm","pcmpgtwPrvm","pcmpgtdPrvm","packuswbPrvm", +"punpckhbwPrvm","punpckhwdPrvm","punpckhdqPrvm","packssdwPrvm", +"||punpcklqdqXrvm","||punpckhqdqXrvm", "movPrVSm","movqMrm|movdquXrm|movdqaXrm", --7x -"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pmu", -"pshiftd!Pmu","pshiftq!Mmu||pshiftdq!Xmu", -"pcmpeqbPrm","pcmpeqwPrm","pcmpeqdPrm","emms|", 
+"pshufwMrmu|pshufhwXrmu|pshufdXrmu|pshuflwXrmu","pshiftw!Pvmu", +"pshiftd!Pvmu","pshiftq!Mvmu||pshiftdq!Xvmu", +"pcmpeqbPrvm","pcmpeqwPrvm","pcmpeqdPrvm","emms*|", "vmreadUmr||extrqXmuu$|insertqXrmuu$","vmwriteUrm||extrqXrm$|insertqXrm$", nil,nil, -"||haddpdXrm|haddpsXrm","||hsubpdXrm|hsubpsXrm", +"||haddpdXrvm|haddpsXrvm","||hsubpdXrvm|hsubpsXrvm", "movVSmMr|movqXrm|movVSmXr","movqMmr|movdquXmr|movdqaXmr", --8x "joVj","jnoVj","jbVj","jnbVj","jzVj","jnzVj","jbeVj","jaVj", @@ -182,27 +181,27 @@ nil,nil, "bsfVrm","bsrVrm|lzcntVrm|bsrWrm","movsxVrBmt","movsxVrWmt", --Cx "xaddBmr","xaddVmr", -"cmppsXrmu|cmpssXrmu|cmppdXrmu|cmpsdXrmu","$movntiVmr|", -"pinsrwPrWmu","pextrwDrPmu", -"shufpsXrmu||shufpdXrmu","$cmpxchg!Qmp", +"cmppsXrvmu|cmpssXrvmu|cmppdXrvmu|cmpsdXrvmu","$movntiVmr|", +"pinsrwPrvWmu","pextrwDrPmu", +"shufpsXrvmu||shufpdXrvmu","$cmpxchg!Qmp", "bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR","bswapVR", --Dx -"||addsubpdXrm|addsubpsXrm","psrlwPrm","psrldPrm","psrlqPrm", -"paddqPrm","pmullwPrm", +"||addsubpdXrvm|addsubpsXrvm","psrlwPrvm","psrldPrvm","psrlqPrvm", +"paddqPrvm","pmullwPrvm", "|movq2dqXrMm|movqXmr|movdq2qMrXm$","pmovmskbVrMm||pmovmskbVrXm", -"psubusbPrm","psubuswPrm","pminubPrm","pandPrm", -"paddusbPrm","padduswPrm","pmaxubPrm","pandnPrm", +"psubusbPrvm","psubuswPrvm","pminubPrvm","pandPrvm", +"paddusbPrvm","padduswPrvm","pmaxubPrvm","pandnPrvm", --Ex -"pavgbPrm","psrawPrm","psradPrm","pavgwPrm", -"pmulhuwPrm","pmulhwPrm", +"pavgbPrvm","psrawPrvm","psradPrvm","pavgwPrvm", +"pmulhuwPrvm","pmulhwPrvm", "|cvtdq2pdXrm|cvttpd2dqXrm|cvtpd2dqXrm","$movntqMmr||$movntdqXmr", -"psubsbPrm","psubswPrm","pminswPrm","porPrm", -"paddsbPrm","paddswPrm","pmaxswPrm","pxorPrm", +"psubsbPrvm","psubswPrvm","pminswPrvm","porPrvm", +"paddsbPrvm","paddswPrvm","pmaxswPrvm","pxorPrvm", --Fx -"|||lddquXrm","psllwPrm","pslldPrm","psllqPrm", -"pmuludqPrm","pmaddwdPrm","psadbwPrm","maskmovqMrm||maskmovdquXrm$", -"psubbPrm","psubwPrm","psubdPrm","psubqPrm", 
-"paddbPrm","paddwPrm","padddPrm","ud", +"|||lddquXrm","psllwPrvm","pslldPrvm","psllqPrvm", +"pmuludqPrvm","pmaddwdPrvm","psadbwPrvm","maskmovqMrm||maskmovdquXrm$", +"psubbPrvm","psubwPrvm","psubdPrvm","psubqPrvm", +"paddbPrvm","paddwPrvm","padddPrvm","ud", } assert(map_opc2[255] == "ud") @@ -210,49 +209,70 @@ assert(map_opc2[255] == "ud") local map_opc3 = { ["38"] = { -- [66] 0f 38 xx --0x -[0]="pshufbPrm","phaddwPrm","phadddPrm","phaddswPrm", -"pmaddubswPrm","phsubwPrm","phsubdPrm","phsubswPrm", -"psignbPrm","psignwPrm","psigndPrm","pmulhrswPrm", -nil,nil,nil,nil, +[0]="pshufbPrvm","phaddwPrvm","phadddPrvm","phaddswPrvm", +"pmaddubswPrvm","phsubwPrvm","phsubdPrvm","phsubswPrvm", +"psignbPrvm","psignwPrvm","psigndPrvm","pmulhrswPrvm", +"||permilpsXrvm","||permilpdXrvm",nil,nil, --1x "||pblendvbXrma",nil,nil,nil, -"||blendvpsXrma","||blendvpdXrma",nil,"||ptestXrm", -nil,nil,nil,nil, +"||blendvpsXrma","||blendvpdXrma","||permpsXrvm","||ptestXrm", +"||broadcastssXrm","||broadcastsdXrm","||broadcastf128XrlXm",nil, "pabsbPrm","pabswPrm","pabsdPrm",nil, --2x "||pmovsxbwXrm","||pmovsxbdXrm","||pmovsxbqXrm","||pmovsxwdXrm", "||pmovsxwqXrm","||pmovsxdqXrm",nil,nil, -"||pmuldqXrm","||pcmpeqqXrm","||$movntdqaXrm","||packusdwXrm", -nil,nil,nil,nil, +"||pmuldqXrvm","||pcmpeqqXrvm","||$movntdqaXrm","||packusdwXrvm", +"||maskmovpsXrvm","||maskmovpdXrvm","||maskmovpsXmvr","||maskmovpdXmvr", --3x "||pmovzxbwXrm","||pmovzxbdXrm","||pmovzxbqXrm","||pmovzxwdXrm", -"||pmovzxwqXrm","||pmovzxdqXrm",nil,"||pcmpgtqXrm", -"||pminsbXrm","||pminsdXrm","||pminuwXrm","||pminudXrm", -"||pmaxsbXrm","||pmaxsdXrm","||pmaxuwXrm","||pmaxudXrm", +"||pmovzxwqXrm","||pmovzxdqXrm","||permdXrvm","||pcmpgtqXrvm", +"||pminsbXrvm","||pminsdXrvm","||pminuwXrvm","||pminudXrvm", +"||pmaxsbXrvm","||pmaxsdXrvm","||pmaxuwXrvm","||pmaxudXrvm", --4x -"||pmulddXrm","||phminposuwXrm", +"||pmulddXrvm","||phminposuwXrm",nil,nil, +nil,"||psrlvVSXrvm","||psravdXrvm","||psllvVSXrvm", +--5x +[0x58] = 
"||pbroadcastdXrlXm",[0x59] = "||pbroadcastqXrlXm", +[0x5a] = "||broadcasti128XrlXm", +--7x +[0x78] = "||pbroadcastbXrlXm",[0x79] = "||pbroadcastwXrlXm", +--8x +[0x8c] = "||pmaskmovXrvVSm", +[0x8e] = "||pmaskmovVSmXvr", +--Dx +[0xdc] = "||aesencXrvm", [0xdd] = "||aesenclastXrvm", +[0xde] = "||aesdecXrvm", [0xdf] = "||aesdeclastXrvm", --Fx [0xf0] = "|||crc32TrBmt",[0xf1] = "|||crc32TrVmt", }, ["3a"] = { -- [66] 0f 3a xx --0x -[0x00]=nil,nil,nil,nil,nil,nil,nil,nil, -"||roundpsXrmu","||roundpdXrmu","||roundssXrmu","||roundsdXrmu", -"||blendpsXrmu","||blendpdXrmu","||pblendwXrmu","palignrPrmu", +[0x00]="||permqXrmu","||permpdXrmu","||pblenddXrvmu",nil, +"||permilpsXrmu","||permilpdXrmu","||perm2f128Xrvmu",nil, +"||roundpsXrmu","||roundpdXrmu","||roundssXrvmu","||roundsdXrvmu", +"||blendpsXrvmu","||blendpdXrvmu","||pblendwXrvmu","palignrPrvmu", --1x nil,nil,nil,nil, "||pextrbVmXru","||pextrwVmXru","||pextrVmSXru","||extractpsVmXru", -nil,nil,nil,nil,nil,nil,nil,nil, +"||insertf128XrvlXmu","||extractf128XlXmYru",nil,nil, +nil,nil,nil,nil, --2x -"||pinsrbXrVmu","||insertpsXrmu","||pinsrXrVmuS",nil, +"||pinsrbXrvVmu","||insertpsXrvmu","||pinsrXrvVmuS",nil, +--3x +[0x38] = "||inserti128Xrvmu",[0x39] = "||extracti128XlXmYru", --4x -[0x40] = "||dppsXrmu", -[0x41] = "||dppdXrmu", -[0x42] = "||mpsadbwXrmu", +[0x40] = "||dppsXrvmu", +[0x41] = "||dppdXrvmu", +[0x42] = "||mpsadbwXrvmu", +[0x44] = "||pclmulqdqXrvmu", +[0x46] = "||perm2i128Xrvmu", +[0x4a] = "||blendvpsXrvmb",[0x4b] = "||blendvpdXrvmb", +[0x4c] = "||pblendvbXrvmb", --6x [0x60] = "||pcmpestrmXrmu",[0x61] = "||pcmpestriXrmu", [0x62] = "||pcmpistrmXrmu",[0x63] = "||pcmpistriXrmu", +[0xdf] = "||aeskeygenassistXrmu", }, } @@ -356,17 +376,19 @@ local map_regs = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }, -- No x64 ext! 
X = { "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" }, + Y = { "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", + "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" }, } local map_segregs = { "es", "cs", "ss", "ds", "fs", "gs", "segr6", "segr7" } -- Maps for size names. local map_sz2n = { - B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16, + B = 1, W = 2, D = 4, Q = 8, M = 8, X = 16, Y = 32, } local map_sz2prefix = { B = "byte", W = "word", D = "dword", Q = "qword", - M = "qword", X = "xword", + M = "qword", X = "xword", Y = "yword", F = "dword", G = "qword", -- No need for sizes/register names for these two. } @@ -389,10 +411,13 @@ local function putop(ctx, text, operands) if ctx.rep then text = ctx.rep.." "..text; ctx.rep = false end if ctx.rex then local t = (ctx.rexw and "w" or "")..(ctx.rexr and "r" or "").. - (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "") - if t ~= "" then text = "rex."..t.." "..text end + (ctx.rexx and "x" or "")..(ctx.rexb and "b" or "").. + (ctx.vexl and "l" or "") + if ctx.vexv and ctx.vexv ~= 0 then t = t.."v"..ctx.vexv end + if t ~= "" then text = ctx.rex.."."..t.." 
"..text + elseif ctx.rex == "vex" then text = "v"..text end ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false - ctx.rex = false + ctx.rex = false; ctx.vexl = false; ctx.vexv = false end if ctx.seg then local text2, n = gsub(text, "%[", "["..ctx.seg..":") @@ -407,6 +432,7 @@ local function putop(ctx, text, operands) end ctx.out(format("%08x %s%s\n", ctx.addr+ctx.start, hex, text)) ctx.mrm = false + ctx.vexv = false ctx.start = pos ctx.imm = nil end @@ -415,7 +441,7 @@ end local function clearprefixes(ctx) ctx.o16 = false; ctx.seg = false; ctx.lock = false; ctx.rep = false ctx.rexw = false; ctx.rexr = false; ctx.rexx = false; ctx.rexb = false - ctx.rex = false; ctx.a32 = false + ctx.rex = false; ctx.a32 = false; ctx.vexl = false end -- Fallback for incomplete opcodes at the end. @@ -452,9 +478,9 @@ end -- Process pattern string and generate the operands. local function putpat(ctx, name, pat) local operands, regs, sz, mode, sp, rm, sc, rx, sdisp - local code, pos, stop = ctx.code, ctx.pos, ctx.stop + local code, pos, stop, vexl = ctx.code, ctx.pos, ctx.stop, ctx.vexl - -- Chars used: 1DFGIMPQRSTUVWXacdfgijmoprstuwxyz + -- Chars used: 1DFGIMPQRSTUVWXYabcdfgijlmoprstuvwxyz for p in gmatch(pat, ".") do local x = nil if p == "V" or p == "U" then @@ -469,11 +495,13 @@ local function putpat(ctx, name, pat) elseif p == "B" then sz = "B" regs = ctx.rex and map_regs.B64 or map_regs.B - elseif match(p, "[WDQMXFG]") then + elseif match(p, "[WDQMXYFG]") then sz = p + if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end regs = map_regs[sz] elseif p == "P" then sz = ctx.o16 and "X" or "M"; ctx.o16 = false + if sz == "X" and vexl then sz = "Y"; ctx.vexl = false end regs = map_regs[sz] elseif p == "S" then name = name..lower(sz) @@ -486,6 +514,10 @@ local function putpat(ctx, name, pat) local imm = getimm(ctx, pos, 1); if not imm then return end x = format("0x%02x", imm) pos = pos+1 + elseif p == "b" then + local imm = getimm(ctx, pos, 1); if not imm then 
return end + x = regs[imm/16+1] + pos = pos+1 elseif p == "w" then local imm = getimm(ctx, pos, 2); if not imm then return end x = format("0x%x", imm) @@ -618,8 +650,13 @@ local function putpat(ctx, name, pat) else x = "CR"..sp end + elseif p == "v" then + if ctx.vexv then + x = regs[ctx.vexv+1]; ctx.vexv = false + end elseif p == "y" then x = "DR"..sp elseif p == "z" then x = "TR"..sp + elseif p == "l" then vexl = false elseif p == "t" then else error("bad pattern `"..pat.."'") @@ -694,7 +731,7 @@ map_act = { B = putpat, W = putpat, D = putpat, Q = putpat, V = putpat, U = putpat, T = putpat, M = putpat, X = putpat, P = putpat, - F = putpat, G = putpat, + F = putpat, G = putpat, Y = putpat, -- Collect prefixes. [":"] = function(ctx, name, pat) @@ -755,15 +792,68 @@ map_act = { -- REX prefix. rex = function(ctx, name, pat) - if ctx.rex then return unknown(ctx) end -- Only 1 REX prefix allowed. + if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed. for p in gmatch(pat, ".") do ctx["rex"..p] = true end - ctx.rex = true + ctx.rex = "rex" + end, + + -- VEX prefix. + vex = function(ctx, name, pat) + if ctx.rex then return unknown(ctx) end -- Only 1 REX or VEX prefix allowed. 
+ ctx.rex = "vex" + local pos = ctx.pos + if ctx.mrm then + ctx.mrm = nil + pos = pos-1 + end + local b = byte(ctx.code, pos, pos) + if not b then return incomplete(ctx) end + pos = pos+1 + if b < 128 then ctx.rexr = true end + local m = 1 + if pat == "3" then + m = b%32; b = (b-m)/32 + local nb = b%2; b = (b-nb)/2 + if nb == 0 then ctx.rexb = true end + local nx = b%2; b = (b-nx)/2 + if nx == 0 then ctx.rexx = true end + b = byte(ctx.code, pos, pos) + if not b then return incomplete(ctx) end + pos = pos+1 + if b >= 128 then ctx.rexw = true end + end + ctx.pos = pos + local map + if m == 1 then map = map_opc2 + elseif m == 2 then map = map_opc3["38"] + elseif m == 3 then map = map_opc3["3a"] + else return unknown(ctx) end + local p = b%4; b = (b-p)/4 + if p == 1 then ctx.o16 = "o16" + elseif p == 2 then ctx.rep = "rep" + elseif p == 3 then ctx.rep = "repne" end + local l = b%2; b = (b-l)/2 + if l ~= 0 then ctx.vexl = true end + ctx.vexv = (-1-b)%16 + return dispatchmap(ctx, map) end, -- Special case for nop with REX prefix. nop = function(ctx, name, pat) return dispatch(ctx, ctx.rex and pat or "nop") end, + + -- Special case for 0F 77. + emms = function(ctx, name, pat) + if ctx.rex ~= "vex" then + return putop(ctx, "emms") + elseif ctx.vexl then + ctx.vexl = false + return putop(ctx, "zeroall") + else + return putop(ctx, "zeroupper") + end + end, } ------------------------------------------------------------------------------ diff --git a/lib/luajit/src/jit/dump.lua b/lib/luajit/src/jit/dump.lua index c52d0f217e..b1cdcfe294 100644 --- a/lib/luajit/src/jit/dump.lua +++ b/lib/luajit/src/jit/dump.lua @@ -75,9 +75,6 @@ local bcline, disass -- Active flag, output file handle and dump mode. local active, out, dumpmode --- Information about traces that is remembered for future reference. 
-local info = {} - ------------------------------------------------------------------------------ local symtabmt = { __index = false } @@ -553,7 +550,6 @@ local function dump_trace(what, tr, func, pc, otr, oex) if dumpmode.m then dump_mcode(tr) end end if what == "start" then - info[tr] = { func = func, pc = pc, otr = otr, oex = oex } if dumpmode.H then out:write('
    \n') end
         out:write("---- TRACE ", tr, " ", what)
         if otr then out:write(" ", otr, "/", oex) end
    @@ -575,6 +571,7 @@ local function dump_trace(what, tr, func, pc, otr, oex)
         end
         if dumpmode.H then out:write("
    \n\n") else out:write("\n") end else + if what == "flush" then symtab, nexitsym = {}, 0 end out:write("---- TRACE ", what, "\n\n") end out:flush() @@ -705,7 +702,6 @@ end return { on = dumpon, off = dumpoff, - start = dumpon, -- For -j command line option. - info = info + start = dumpon -- For -j command line option. } diff --git a/lib/luajit/src/jit/p.lua b/lib/luajit/src/jit/p.lua index d894bb7d2c..97d4ccdf87 100644 --- a/lib/luajit/src/jit/p.lua +++ b/lib/luajit/src/jit/p.lua @@ -36,7 +36,6 @@ -- G Produce raw output suitable for graphical tools (e.g. flame graphs). -- m Minimum sample percentage to be shown. Default: 3. -- i Sampling interval in milliseconds. Default: 10. --- S[] Events source if performace events are enabled -- ---------------------------------------------------------------------------- @@ -45,8 +44,6 @@ local jit = require("jit") assert(jit.version_num == 20100, "LuaJIT core/library version mismatch") local profile = require("jit.profile") local vmdef = require("jit.vmdef") -local jutil = require("jit.util") -local dump = require("jit.dump") local math = math local pairs, ipairs, tonumber, floor = pairs, ipairs, tonumber, math.floor local sort, format = table.sort, string.format @@ -77,38 +74,7 @@ local function prof_cb(th, samples, vmmode) -- Collect keys for sample. if prof_states then if prof_states == "v" then - if map_vmmode[vmmode] then - key_state = map_vmmode[vmmode] - else - -- Sampling a trace: make an understandable one-line description. - local tr = tonumber(vmmode) - local info = jutil.traceinfo(tr) - local extra = dump.info[tr] - -- Show the parent of this trace (if this is a side trace) - local parent = "" - if extra and extra.otr and extra.oex then - parent = "("..extra.otr.."/"..extra.oex..")" - end - -- Show what the end of the trace links to (e.g. 
loop or other trace) - local lnk = "" - local link, ltype = info.link, info.linktype - if link == tr or link == 0 then lnk = "->"..ltype - elseif ltype == "root" then lnk = "->"..link - else lnk = "->"..link.." "..ltype end - -- Show the current zone (if zone profiling is enabled) - local z = "" - if zone and zone:get() then - z = (" %-16s"):format(zone:get()) - end - -- Show the source location where the trace starts - local loc = "" - if extra and extra.func then - local fi = jutil.funcinfo(extra.func, extra.pc) - if fi.loc then loc = fi.loc end - end - local s = ("TRACE %3d %-8s %-10s%s %s"):format(vmmode, parent, lnk, z, loc) - key_state = map_vmmode[vmmode] or s - end + key_state = map_vmmode[vmmode] or vmmode else key_state = zone:get() or "(none)" end @@ -277,18 +243,15 @@ end -- Start profiling. local function prof_start(mode) local interval = "" - mode = mode:gsub("i%d+", function(s) interval = s; return "" end) + mode = mode:gsub("i%d*", function(s) interval = s; return "" end) prof_min = 3 mode = mode:gsub("m(%d+)", function(s) prof_min = tonumber(s); return "" end) prof_depth = 1 mode = mode:gsub("%-?%d+", function(s) prof_depth = tonumber(s); return "" end) - local flavour = "S[vanilla]" - mode = mode:gsub("S%[.+%]", function(s) flavour = s; return "" end) - local m = {} for c in mode:gmatch(".") do m[c] = c end - prof_states = m.v or m.z - if m.z == "z" then zone = require("jit.zone") end + prof_states = m.z or m.v + if prof_states == "z" then zone = require("jit.zone") end local scope = m.l or m.f or m.F or (prof_states and "" or "f") local flags = (m.p or "") prof_raw = m.r @@ -322,7 +285,7 @@ local function prof_start(mode) prof_count1 = {} prof_count2 = {} prof_samples = 0 - profile.start(scope:lower()..interval..flavour, prof_cb) + profile.start(scope:lower()..interval, prof_cb) prof_ud = newproxy(true) getmetatable(prof_ud).__gc = prof_finish end diff --git a/lib/luajit/src/lib_base.c b/lib/luajit/src/lib_base.c index 887fea7a58..ca268b1d07 
100644 --- a/lib/luajit/src/lib_base.c +++ b/lib/luajit/src/lib_base.c @@ -435,13 +435,13 @@ LJLIB_CF(gcinfo) LJLIB_CF(collectgarbage) { int opt = lj_lib_checkopt(L, 1, LUA_GCCOLLECT, /* ORDER LUA_GC* */ - "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul"); + "\4stop\7restart\7collect\5count\1\377\4step\10setpause\12setstepmul\1\377\11isrunning"); int32_t data = lj_lib_optint(L, 2, 0); if (opt == LUA_GCCOUNT) { setnumV(L->top, (lua_Number)G(L)->gc.total/1024.0); } else { int res = lua_gc(L, opt, data); - if (opt == LUA_GCSTEP) + if (opt == LUA_GCSTEP || opt == LUA_GCISRUNNING) setboolV(L->top, res); else setintV(L->top, res); diff --git a/lib/luajit/src/lib_ffi.c b/lib/luajit/src/lib_ffi.c index b2b2d37ff7..7be624b42d 100644 --- a/lib/luajit/src/lib_ffi.c +++ b/lib/luajit/src/lib_ffi.c @@ -505,10 +505,7 @@ LJLIB_CF(ffi_new) LJLIB_REC(.) } if (sz == CTSIZE_INVALID) lj_err_arg(L, 1, LJ_ERR_FFI_INVSIZE); - if (!(info & CTF_VLA) && ctype_align(info) <= CT_MEMALIGN) - cd = lj_cdata_new(cts, id, sz); - else - cd = lj_cdata_newv(L, id, sz, ctype_align(info)); + cd = lj_cdata_newx(cts, id, sz, info); setcdataV(L, o-1, cd); /* Anchor the uninitialized cdata. */ lj_cconv_ct_init(cts, ct, sz, cdataptr(cd), o, (MSize)(L->top - o)); /* Initialize cdata. */ diff --git a/lib/luajit/src/lib_jit.c b/lib/luajit/src/lib_jit.c index 2227d198c5..178ef249df 100644 --- a/lib/luajit/src/lib_jit.c +++ b/lib/luajit/src/lib_jit.c @@ -299,9 +299,6 @@ LJLIB_CF(jit_util_traceinfo) setintfield(L, t, "nk", REF_BIAS - (int32_t)T->nk); setintfield(L, t, "link", T->link); setintfield(L, t, "nexit", T->nsnap); - setintfield(L, t, "szmcode", T->szmcode); - setintfield(L, t, "mcode", (int32_t)(intptr_t)T->mcode); - setintfield(L, t, "mcloop", T->mcloop); setstrV(L, L->top++, lj_str_newz(L, jit_trlinkname[T->linktype])); lua_setfield(L, -2, "linktype"); /* There are many more fields. Add them only when needed. 
*/ @@ -558,10 +555,7 @@ static void jit_profile_callback(lua_State *L2, lua_State *L, int samples, setfuncV(L2, L2->top++, funcV(tv)); setthreadV(L2, L2->top++, L); setintV(L2->top++, samples); - if (vmstate >= 256) - setintV(L2->top++, vmstate-256); - else - setstrV(L2, L2->top++, lj_str_new(L2, &vmst, 1)); + setstrV(L2, L2->top++, lj_str_new(L2, &vmst, 1)); status = lua_pcall(L2, 3, 0, 0); /* callback(thread, samples, vmstate) */ if (status) { if (G(L2)->panic) G(L2)->panic(L2); diff --git a/lib/luajit/src/lib_os.c b/lib/luajit/src/lib_os.c index 7b5873a518..37d7d5be61 100644 --- a/lib/luajit/src/lib_os.c +++ b/lib/luajit/src/lib_os.c @@ -39,7 +39,7 @@ LJLIB_CF(os_execute) { -#if LJ_TARGET_CONSOLE +#if LJ_NO_SYSTEM #if LJ_52 errno = ENOSYS; return luaL_fileresult(L, 0, NULL); diff --git a/lib/luajit/src/lj.supp b/lib/luajit/src/lj.supp index 411f261700..acb9e789d0 100644 --- a/lib/luajit/src/lj.supp +++ b/lib/luajit/src/lj.supp @@ -24,3 +24,18 @@ Memcheck:Cond fun:lj_str_new } +{ + Optimized string compare + Memcheck:Addr4 + fun:lj_str_fastcmp +} +{ + Optimized string compare + Memcheck:Addr1 + fun:lj_str_fastcmp +} +{ + Optimized string compare + Memcheck:Cond + fun:lj_str_fastcmp +} diff --git a/lib/luajit/src/lj_alloc.c b/lib/luajit/src/lj_alloc.c index 0aad826d36..ddd50cae4f 100644 --- a/lib/luajit/src/lj_alloc.c +++ b/lib/luajit/src/lj_alloc.c @@ -196,7 +196,7 @@ static LJ_AINLINE void *CALL_MMAP(size_t size) return ptr; } -#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__) +#elif LJ_TARGET_OSX || LJ_TARGET_PS4 || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__sun__) || defined(__CYGWIN__) /* OSX and FreeBSD mmap() use a naive first-fit linear search. ** That's perfect for us. 
Except that -pagezero_size must be set for OSX, diff --git a/lib/luajit/src/lj_api.c b/lib/luajit/src/lj_api.c index 1f09284f99..042b0d9c8d 100644 --- a/lib/luajit/src/lj_api.c +++ b/lib/luajit/src/lj_api.c @@ -1188,6 +1188,9 @@ LUA_API int lua_gc(lua_State *L, int what, int data) res = (int)(g->gc.stepmul); g->gc.stepmul = (MSize)data; break; + case LUA_GCISRUNNING: + res = (g->gc.threshold != LJ_MAX_MEM); + break; default: res = -1; /* Invalid option. */ } diff --git a/lib/luajit/src/lj_arch.h b/lib/luajit/src/lj_arch.h index f1e7d7f45c..a114bdda53 100644 --- a/lib/luajit/src/lj_arch.h +++ b/lib/luajit/src/lj_arch.h @@ -155,7 +155,11 @@ #define LJ_ARCH_NAME "x64" #define LJ_ARCH_BITS 64 #define LJ_ARCH_ENDIAN LUAJIT_LE -#define LJ_ABI_WIN LJ_TARGET_WINDOWS +#if LJ_TARGET_WINDOWS || __CYGWIN__ +#define LJ_ABI_WIN 1 +#else +#define LJ_ABI_WIN 0 +#endif #define LJ_TARGET_X64 1 #define LJ_TARGET_X86ORX64 1 #define LJ_TARGET_EHRETREG 0 @@ -300,6 +304,13 @@ #define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. 
*/ #define LJ_ARCH_NUMMODE LJ_NUMMODE_SINGLE +#if !defined(LJ_ARCH_HASFPU) && defined(__mips_soft_float) +#define LJ_ARCH_HASFPU 0 +#endif +#if !defined(LJ_ABI_SOFTFP) && defined(__mips_soft_float) +#define LJ_ABI_SOFTFP 1 +#endif + #if _MIPS_ARCH_MIPS32R2 #define LJ_ARCH_VERSION 20 #else @@ -382,9 +393,6 @@ #error "No support for PPC/e500 anymore (use LuaJIT 2.0)" #endif #elif LJ_TARGET_MIPS -#if defined(__mips_soft_float) -#error "No support for MIPS CPUs without FPU" -#endif #if defined(_LP64) #error "No support for MIPS64" #endif @@ -494,6 +502,9 @@ #if defined(__symbian__) || LJ_TARGET_WINDOWS #define LUAJIT_NO_EXP2 #endif +#if LJ_TARGET_CONSOLE || (LJ_TARGET_IOS && __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0) +#define LJ_NO_SYSTEM 1 +#endif #if defined(LUAJIT_NO_UNWIND) || defined(__symbian__) || LJ_TARGET_IOS || LJ_TARGET_PS3 || LJ_TARGET_PS4 #define LJ_NO_UNWIND 1 diff --git a/lib/luajit/src/lj_ccall.c b/lib/luajit/src/lj_ccall.c index 5ab5b60daa..2dda540510 100644 --- a/lib/luajit/src/lj_ccall.c +++ b/lib/luajit/src/lj_ccall.c @@ -418,6 +418,18 @@ /* Complex values are returned in 1 or 2 FPRs. */ \ cc->retref = 0; +#if LJ_ABI_SOFTFP +#define CCALL_HANDLE_COMPLEXRET2 \ + if (ctr->size == 2*sizeof(float)) { /* Copy complex float from GPRs. */ \ + ((intptr_t *)dp)[0] = cc->gpr[0]; \ + ((intptr_t *)dp)[1] = cc->gpr[1]; \ + } else { /* Copy complex double from GPRs. */ \ + ((intptr_t *)dp)[0] = cc->gpr[0]; \ + ((intptr_t *)dp)[1] = cc->gpr[1]; \ + ((intptr_t *)dp)[2] = cc->gpr[2]; \ + ((intptr_t *)dp)[3] = cc->gpr[3]; \ + } +#else #define CCALL_HANDLE_COMPLEXRET2 \ if (ctr->size == 2*sizeof(float)) { /* Copy complex float from FPRs. */ \ ((float *)dp)[0] = cc->fpr[0].f; \ @@ -426,6 +438,7 @@ ((double *)dp)[0] = cc->fpr[0].d; \ ((double *)dp)[1] = cc->fpr[1].d; \ } +#endif #define CCALL_HANDLE_STRUCTARG \ /* Pass all structs by value in registers and/or on the stack. 
*/ @@ -433,6 +446,22 @@ #define CCALL_HANDLE_COMPLEXARG \ /* Pass complex by value in 2 or 4 GPRs. */ +#define CCALL_HANDLE_GPR \ + if ((d->info & CTF_ALIGN) > CTALIGN_PTR) \ + ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \ + if (ngpr < maxgpr) { \ + dp = &cc->gpr[ngpr]; \ + if (ngpr + n > maxgpr) { \ + nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ + if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ + ngpr = maxgpr; \ + } else { \ + ngpr += n; \ + } \ + goto done; \ + } + +#if !LJ_ABI_SOFTFP /* MIPS32 hard-float */ #define CCALL_HANDLE_REGARG \ if (isfp && nfpr < CCALL_NARG_FPR && !(ct->info & CTF_VARARG)) { \ /* Try to pass argument in FPRs. */ \ @@ -441,24 +470,18 @@ goto done; \ } else { /* Try to pass argument in GPRs. */ \ nfpr = CCALL_NARG_FPR; \ - if ((d->info & CTF_ALIGN) > CTALIGN_PTR) \ - ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \ - if (ngpr < maxgpr) { \ - dp = &cc->gpr[ngpr]; \ - if (ngpr + n > maxgpr) { \ - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ - ngpr = maxgpr; \ - } else { \ - ngpr += n; \ - } \ - goto done; \ - } \ + CCALL_HANDLE_GPR \ } +#else /* MIPS32 soft-float */ +#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR +#endif +#if !LJ_ABI_SOFTFP +/* On MIPS64 soft-float, position of float return values is endian-dependant. */ #define CCALL_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ sp = (uint8_t *)&cc->fpr[0].f; +#endif #else #error "Missing calling convention definitions for this architecture" diff --git a/lib/luajit/src/lj_ccall.h b/lib/luajit/src/lj_ccall.h index 91983feebd..8b0e796bfc 100644 --- a/lib/luajit/src/lj_ccall.h +++ b/lib/luajit/src/lj_ccall.h @@ -98,9 +98,9 @@ typedef double FPRArg; #elif LJ_TARGET_MIPS #define CCALL_NARG_GPR 4 -#define CCALL_NARG_FPR 2 +#define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 
0 : 2) #define CCALL_NRET_GPR 2 -#define CCALL_NRET_FPR 2 +#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 2) #define CCALL_SPS_EXTRA 7 #define CCALL_SPS_FREE 1 diff --git a/lib/luajit/src/lj_ccallback.c b/lib/luajit/src/lj_ccallback.c index 065c329fa7..539c9e3da4 100644 --- a/lib/luajit/src/lj_ccallback.c +++ b/lib/luajit/src/lj_ccallback.c @@ -427,6 +427,15 @@ void lj_ccallback_mcode_free(CTState *cts) #elif LJ_TARGET_MIPS +#define CALLBACK_HANDLE_GPR \ + if (n > 1) ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \ + if (ngpr + n <= maxgpr) { \ + sp = &cts->cb.gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } + +#if !LJ_ABI_SOFTFP /* MIPS32 hard-float */ #define CALLBACK_HANDLE_REGARG \ if (isfp && nfpr < CCALL_NARG_FPR) { /* Try to pass argument in FPRs. */ \ sp = (void *)((uint8_t *)&cts->cb.fpr[nfpr] + ((LJ_BE && n==1) ? 4 : 0)); \ @@ -434,13 +443,13 @@ void lj_ccallback_mcode_free(CTState *cts) goto done; \ } else { /* Try to pass argument in GPRs. */ \ nfpr = CCALL_NARG_FPR; \ - if (n > 1) ngpr = (ngpr + 1u) & ~1u; /* Align to regpair. */ \ - if (ngpr + n <= maxgpr) { \ - sp = &cts->cb.gpr[ngpr]; \ - ngpr += n; \ - goto done; \ - } \ + CALLBACK_HANDLE_GPR \ } +#else /* MIPS32 soft-float */ +#define CALLBACK_HANDLE_REGARG \ + CALLBACK_HANDLE_GPR \ + UNUSED(isfp); +#endif #define CALLBACK_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ diff --git a/lib/luajit/src/lj_cdata.c b/lib/luajit/src/lj_cdata.c index 5cd2c1140e..30d788e4c9 100644 --- a/lib/luajit/src/lj_cdata.c +++ b/lib/luajit/src/lj_cdata.c @@ -49,6 +49,15 @@ GCcdata *lj_cdata_newv(lua_State *L, CTypeID id, CTSize sz, CTSize align) return cd; } +/* Allocate arbitrary C data object. */ +GCcdata *lj_cdata_newx(CTState *cts, CTypeID id, CTSize sz, CTInfo info) +{ + if (!(info & CTF_VLA) && ctype_align(info) <= CT_MEMALIGN) + return lj_cdata_new(cts, id, sz); + else + return lj_cdata_newv(cts->L, id, sz, ctype_align(info)); +} + /* Free a C data object. 
*/ void LJ_FASTCALL lj_cdata_free(global_State *g, GCcdata *cd) { diff --git a/lib/luajit/src/lj_cdata.h b/lib/luajit/src/lj_cdata.h index c8975be1c9..0891c33c80 100644 --- a/lib/luajit/src/lj_cdata.h +++ b/lib/luajit/src/lj_cdata.h @@ -60,6 +60,8 @@ static LJ_AINLINE GCcdata *lj_cdata_new_(lua_State *L, CTypeID id, CTSize sz) LJ_FUNC GCcdata *lj_cdata_newref(CTState *cts, const void *pp, CTypeID id); LJ_FUNC GCcdata *lj_cdata_newv(lua_State *L, CTypeID id, CTSize sz, CTSize align); +LJ_FUNC GCcdata *lj_cdata_newx(CTState *cts, CTypeID id, CTSize sz, + CTInfo info); LJ_FUNC void LJ_FASTCALL lj_cdata_free(global_State *g, GCcdata *cd); LJ_FUNC void lj_cdata_setfin(lua_State *L, GCcdata *cd, GCobj *obj, diff --git a/lib/luajit/src/lj_ctype.c b/lib/luajit/src/lj_ctype.c index 2e23c994bb..eda070ce1e 100644 --- a/lib/luajit/src/lj_ctype.c +++ b/lib/luajit/src/lj_ctype.c @@ -38,6 +38,8 @@ _("uint64_t", UINT64) \ _("intptr_t", INT_PSZ) \ _("uintptr_t", UINT_PSZ) \ + /* From POSIX. */ \ + _("ssize_t", INT_PSZ) \ /* End of typedef list. */ /* Keywords (only the ones we actually care for). */ diff --git a/lib/luajit/src/lj_dispatch.h b/lib/luajit/src/lj_dispatch.h index 1e247e3828..73d00ec00c 100644 --- a/lib/luajit/src/lj_dispatch.h +++ b/lib/luajit/src/lj_dispatch.h @@ -14,6 +14,21 @@ #if LJ_TARGET_MIPS /* Need our own global offset table for the dreaded MIPS calling conventions. 
*/ +#if LJ_SOFTFP +extern double __adddf3(double a, double b); +extern double __subdf3(double a, double b); +extern double __muldf3(double a, double b); +extern double __divdf3(double a, double b); +extern void __ledf2(double a, double b); +extern double __floatsidf(int32_t a); +extern int32_t __fixdfsi(double a); + +#define SFGOTDEF(_) \ + _(lj_num2bit) _(sqrt) _(__adddf3) _(__subdf3) _(__muldf3) _(__divdf3) _(__ledf2) \ + _(__floatsidf) _(__fixdfsi) +#else +#define SFGOTDEF(_) +#endif #if LJ_HASJIT #define JITGOTDEF(_) _(lj_trace_exit) _(lj_trace_hot) #else @@ -39,7 +54,8 @@ _(lj_str_new) _(lj_tab_dup) _(lj_tab_get) _(lj_tab_getinth) _(lj_tab_len) \ _(lj_tab_new) _(lj_tab_newkey) _(lj_tab_next) _(lj_tab_reasize) \ _(lj_tab_setinth) _(lj_buf_putstr_reverse) _(lj_buf_putstr_lower) \ - _(lj_buf_putstr_upper) _(lj_buf_tostr) JITGOTDEF(_) FFIGOTDEF(_) + _(lj_buf_putstr_upper) _(lj_buf_tostr) \ + JITGOTDEF(_) FFIGOTDEF(_) SFGOTDEF(_) enum { #define GOTENUM(name) LJ_GOT_##name, diff --git a/lib/luajit/src/lj_err.c b/lib/luajit/src/lj_err.c index 2e20c2c0f8..d641735e9f 100644 --- a/lib/luajit/src/lj_err.c +++ b/lib/luajit/src/lj_err.c @@ -183,20 +183,13 @@ static void *err_unwind(lua_State *L, void *stopcf, int errcode) /* -- External frame unwinding -------------------------------------------- */ -#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_TARGET_WINDOWS +#if defined(__GNUC__) && !LJ_NO_UNWIND && !LJ_ABI_WIN /* ** We have to use our own definitions instead of the mandatory (!) unwind.h, ** since various OS, distros and compilers mess up the header installation. 
*/ -typedef struct _Unwind_Exception -{ - uint64_t exclass; - void (*excleanup)(int, struct _Unwind_Exception *); - uintptr_t p1, p2; -} __attribute__((__aligned__)) _Unwind_Exception; - typedef struct _Unwind_Context _Unwind_Context; #define _URC_OK 0 @@ -206,8 +199,20 @@ typedef struct _Unwind_Context _Unwind_Context; #define _URC_CONTINUE_UNWIND 8 #define _URC_FAILURE 9 +#define LJ_UEXCLASS 0x4c55414a49543200ULL /* LUAJIT2\0 */ +#define LJ_UEXCLASS_MAKE(c) (LJ_UEXCLASS | (uint64_t)(c)) +#define LJ_UEXCLASS_CHECK(cl) (((cl) ^ LJ_UEXCLASS) <= 0xff) +#define LJ_UEXCLASS_ERRCODE(cl) ((int)((cl) & 0xff)) + #if !LJ_TARGET_ARM +typedef struct _Unwind_Exception +{ + uint64_t exclass; + void (*excleanup)(int, struct _Unwind_Exception *); + uintptr_t p1, p2; +} __attribute__((__aligned__)) _Unwind_Exception; + extern uintptr_t _Unwind_GetCFA(_Unwind_Context *); extern void _Unwind_SetGR(_Unwind_Context *, int, uintptr_t); extern void _Unwind_SetIP(_Unwind_Context *, uintptr_t); @@ -219,11 +224,6 @@ extern int _Unwind_RaiseException(_Unwind_Exception *); #define _UA_HANDLER_FRAME 4 #define _UA_FORCE_UNWIND 8 -#define LJ_UEXCLASS 0x4c55414a49543200ULL /* LUAJIT2\0 */ -#define LJ_UEXCLASS_MAKE(c) (LJ_UEXCLASS | (uint64_t)(c)) -#define LJ_UEXCLASS_CHECK(cl) (((cl) ^ LJ_UEXCLASS) <= 0xff) -#define LJ_UEXCLASS_ERRCODE(cl) ((int)((cl) & 0xff)) - /* DWARF2 personality handler referenced from interpreter .eh_frame. 
*/ LJ_FUNCA int lj_err_unwind_dwarf(int version, int actions, uint64_t uexclass, _Unwind_Exception *uex, _Unwind_Context *ctx) @@ -302,10 +302,23 @@ static void err_raise_ext(int errcode) } #endif -#else +#else /* LJ_TARGET_ARM */ + +#define _US_VIRTUAL_UNWIND_FRAME 0 +#define _US_UNWIND_FRAME_STARTING 1 +#define _US_ACTION_MASK 3 +#define _US_FORCE_UNWIND 8 + +typedef struct _Unwind_Control_Block _Unwind_Control_Block; +typedef struct _Unwind_Context _Unwind_Context; -extern void _Unwind_DeleteException(void *); -extern int __gnu_unwind_frame (void *, _Unwind_Context *); +struct _Unwind_Control_Block { + uint64_t exclass; + uint32_t misc[20]; +}; + +extern int _Unwind_RaiseException(_Unwind_Control_Block *); +extern int __gnu_unwind_frame(_Unwind_Control_Block *, _Unwind_Context *); extern int _Unwind_VRS_Set(_Unwind_Context *, int, uint32_t, int, void *); extern int _Unwind_VRS_Get(_Unwind_Context *, int, uint32_t, int, void *); @@ -321,35 +334,58 @@ static inline void _Unwind_SetGR(_Unwind_Context *ctx, int r, uint32_t v) _Unwind_VRS_Set(ctx, 0, r, 0, &v); } -#define _US_VIRTUAL_UNWIND_FRAME 0 -#define _US_UNWIND_FRAME_STARTING 1 -#define _US_ACTION_MASK 3 -#define _US_FORCE_UNWIND 8 +extern void lj_vm_unwind_ext(void); /* ARM unwinder personality handler referenced from interpreter .ARM.extab. 
*/ -LJ_FUNCA int lj_err_unwind_arm(int state, void *ucb, _Unwind_Context *ctx) +LJ_FUNCA int lj_err_unwind_arm(int state, _Unwind_Control_Block *ucb, + _Unwind_Context *ctx) { void *cf = (void *)_Unwind_GetGR(ctx, 13); lua_State *L = cframe_L(cf); - if ((state & _US_ACTION_MASK) == _US_VIRTUAL_UNWIND_FRAME) { - setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP)); + int errcode; + + switch ((state & _US_ACTION_MASK)) { + case _US_VIRTUAL_UNWIND_FRAME: + if ((state & _US_FORCE_UNWIND)) break; return _URC_HANDLER_FOUND; - } - if ((state&(_US_ACTION_MASK|_US_FORCE_UNWIND)) == _US_UNWIND_FRAME_STARTING) { - _Unwind_DeleteException(ucb); - _Unwind_SetGR(ctx, 15, (uint32_t)(void *)lj_err_throw); - _Unwind_SetGR(ctx, 0, (uint32_t)L); - _Unwind_SetGR(ctx, 1, (uint32_t)LUA_ERRRUN); + case _US_UNWIND_FRAME_STARTING: + if (LJ_UEXCLASS_CHECK(ucb->exclass)) { + errcode = LJ_UEXCLASS_ERRCODE(ucb->exclass); + } else { + errcode = LUA_ERRRUN; + setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP)); + } + cf = err_unwind(L, cf, errcode); + if ((state & _US_FORCE_UNWIND) || cf == NULL) break; + _Unwind_SetGR(ctx, 15, (uint32_t)lj_vm_unwind_ext); + _Unwind_SetGR(ctx, 0, (uint32_t)ucb); + _Unwind_SetGR(ctx, 1, (uint32_t)errcode); + _Unwind_SetGR(ctx, 2, cframe_unwind_ff(cf) ? + (uint32_t)lj_vm_unwind_ff_eh : + (uint32_t)lj_vm_unwind_c_eh); return _URC_INSTALL_CONTEXT; + default: + return _URC_FAILURE; } if (__gnu_unwind_frame(ucb, ctx) != _URC_OK) return _URC_FAILURE; return _URC_CONTINUE_UNWIND; } +#if LJ_UNWIND_EXT +static __thread _Unwind_Control_Block static_uex; + +static void err_raise_ext(int errcode) +{ + memset(&static_uex, 0, sizeof(static_uex)); + static_uex.exclass = LJ_UEXCLASS_MAKE(errcode); + _Unwind_RaiseException(&static_uex); +} #endif -#elif LJ_TARGET_X64 && LJ_TARGET_WINDOWS +#endif /* LJ_TARGET_ARM */ + +#elif LJ_TARGET_X64 && LJ_ABI_WIN /* ** Someone in Redmond owes me several days of my life. 
A lot of this is @@ -414,7 +450,9 @@ LJ_FUNCA EXCEPTION_DISPOSITION lj_err_unwind_win64(EXCEPTION_RECORD *rec, if (cf2) { /* We catch it, so start unwinding the upper frames. */ if (rec->ExceptionCode == LJ_MSVC_EXCODE || rec->ExceptionCode == LJ_GCC_EXCODE) { +#if LJ_TARGET_WINDOWS __DestructExceptionObject(rec, 1); +#endif setstrV(L, L->top++, lj_err_str(L, LJ_ERR_ERRCPP)); } else if (!LJ_EXCODE_CHECK(rec->ExceptionCode)) { /* Don't catch access violations etc. */ diff --git a/lib/luajit/src/lj_ffrecord.c b/lib/luajit/src/lj_ffrecord.c index 6cc05a24f7..281f017856 100644 --- a/lib/luajit/src/lj_ffrecord.c +++ b/lib/luajit/src/lj_ffrecord.c @@ -435,11 +435,12 @@ static void LJ_FASTCALL recff_ipairs_aux(jit_State *J, RecordFFData *rd) static void LJ_FASTCALL recff_xpairs(jit_State *J, RecordFFData *rd) { - if (!(LJ_52 && recff_metacall(J, rd, MM_ipairs))) { - TRef tab = J->base[0]; - if (tref_istab(tab)) { + TRef tr = J->base[0]; + if (!((LJ_52 || (LJ_HASFFI && tref_iscdata(tr))) && + recff_metacall(J, rd, MM_pairs + rd->data))) { + if (tref_istab(tr)) { J->base[0] = lj_ir_kfunc(J, funcV(&J->fn->c.upvalue[0])); - J->base[1] = tab; + J->base[1] = tr; J->base[2] = rd->data ? lj_ir_kint(J, 0) : TREF_NIL; rd->nres = 3; } /* else: Interpreter will throw. */ diff --git a/lib/luajit/src/lj_frame.h b/lib/luajit/src/lj_frame.h index a86c36be7e..aa3ab20bbf 100644 --- a/lib/luajit/src/lj_frame.h +++ b/lib/luajit/src/lj_frame.h @@ -218,6 +218,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. */ #define CFRAME_SHIFT_MULTRES 3 #endif #elif LJ_TARGET_MIPS +#if LJ_ARCH_HASFPU #define CFRAME_OFS_ERRF 124 #define CFRAME_OFS_NRES 120 #define CFRAME_OFS_PREV 116 @@ -227,6 +228,16 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CALLBACK }; /* Special continuations. 
*/ #define CFRAME_SIZE 112 #define CFRAME_SHIFT_MULTRES 3 #else +#define CFRAME_OFS_ERRF 100 +#define CFRAME_OFS_NRES 96 +#define CFRAME_OFS_PREV 92 +#define CFRAME_OFS_L 88 +#define CFRAME_OFS_PC 44 +#define CFRAME_OFS_MULTRES 16 +#define CFRAME_SIZE 88 +#define CFRAME_SHIFT_MULTRES 3 +#endif +#else #error "Missing CFRAME_* definitions for this architecture" #endif diff --git a/lib/luajit/src/lj_ircall.h b/lib/luajit/src/lj_ircall.h index 84e41ecfcc..1f44b03d67 100644 --- a/lib/luajit/src/lj_ircall.h +++ b/lib/luajit/src/lj_ircall.h @@ -270,6 +270,22 @@ LJ_DATA const CCallInfo lj_ir_callinfo[IRCALL__MAX+1]; #define fp64_f2l __aeabi_f2lz #define fp64_f2ul __aeabi_f2ulz #endif +#elif LJ_TARGET_MIPS +#define softfp_add __adddf3 +#define softfp_sub __subdf3 +#define softfp_mul __muldf3 +#define softfp_div __divdf3 +#define softfp_cmp __ledf2 +#define softfp_i2d __floatsidf +#define softfp_d2i __fixdfsi +#define softfp_ui2d __floatunsidf +#define softfp_f2d __extendsfdf2 +#define softfp_d2ui __fixunsdfsi +#define softfp_d2f __truncdfsf2 +#define softfp_i2f __floatsisf +#define softfp_ui2f __floatunsisf +#define softfp_f2i __fixsfsi +#define softfp_f2ui __fixunssfsi #else #error "Missing soft-float definitions for target architecture" #endif diff --git a/lib/luajit/src/lj_opt_split.c b/lib/luajit/src/lj_opt_split.c index 81ded6c0a0..4652c73786 100644 --- a/lib/luajit/src/lj_opt_split.c +++ b/lib/luajit/src/lj_opt_split.c @@ -596,7 +596,8 @@ static void split_ir(jit_State *J) } #endif else if (st == IRT_I64 || st == IRT_U64) { /* 64/64 bit cast. */ - /* Drop cast, since assembler doesn't care. */ + /* Drop cast, since assembler doesn't care. But fwd both parts. */ + hi = hiref; goto fwdlo; } else if ((ir->op2 & IRCONV_SEXT)) { /* Sign-extend to 64 bit. 
*/ IRRef k31 = lj_ir_kint(J, 31); diff --git a/lib/luajit/src/lj_profile.c b/lib/luajit/src/lj_profile.c index f4d6fe18de..c7e53963b5 100644 --- a/lib/luajit/src/lj_profile.c +++ b/lib/luajit/src/lj_profile.c @@ -5,7 +5,6 @@ #define lj_profile_c #define LUA_CORE -#define _GNU_SOURCE 1 #include "lj_obj.h" @@ -30,17 +29,6 @@ #define profile_lock(ps) UNUSED(ps) #define profile_unlock(ps) UNUSED(ps) -#if 1 -#include -#include -#include -#include -#include -#include -#include -#endif - - #elif LJ_PROFILE_PTHREAD #include @@ -74,8 +62,6 @@ typedef struct ProfileState { SBuf sb; /* String buffer for stack dumps. */ int interval; /* Sample interval in milliseconds. */ int samples; /* Number of samples for next callback. */ - char *flavour; /* What generates profiling events. */ - int perf_event_fd; /* Performace event file descriptor */ int vmstate; /* VM state when profile timer triggered. */ #if LJ_PROFILE_SIGPROF struct sigaction oldsa; /* Previous SIGPROF state. */ @@ -169,7 +155,7 @@ static void profile_trigger(ProfileState *ps) mask = g->hookmask; if (!(mask & (HOOK_PROFILE|HOOK_VMEVENT))) { /* Set profile hook. */ int st = g->vmstate; - ps->vmstate = st >= 0 ? 256+st : + ps->vmstate = st >= 0 ? 'N' : st == ~LJ_VMST_INTERP ? 'I' : st == ~LJ_VMST_C ? 'C' : st == ~LJ_VMST_GC ? 
'G' : 'J'; @@ -190,178 +176,29 @@ static void profile_signal(int sig) profile_trigger(&profile_state); } - -static int perf_event_open(struct perf_event_attr *attr, - pid_t pid, int cpu, int group_fd, - unsigned long flags) -{ - return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags); -} - - -static void register_prof_events(ProfileState *ps) -{ - struct flavour_t { - char *name; uint32_t type; uint64_t config; - }; - - static struct flavour_t flavours[] = - { - { "sw-cpu-clock", - PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK }, - - { "sw-context-switches", - PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES }, - - { "sw-page-faults", - PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS }, - - { "sw-minor-page-faults", - PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN }, - - { "sw-major-page-faults", - PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ }, - - { "branch-instructions", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - - { "cpu-cycles", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES }, - - { "instructions", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS }, - - { "cache-references", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES }, - - { "cache-misses", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES }, - - { "branch-instructions", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - - { "branch-misses", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES }, - - { "bus-cycles", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES }, - - { "stalled-cycles-frontend", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - - { "stalled-cycles-backend", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, - - { "cpu-cycles", - PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES }, - - { 0, 0, 0 } - }; - - - struct perf_event_attr attr = { }; - - memset(&attr, 0, sizeof(struct perf_event_attr)); - - const struct flavour_t *f; - for (f = flavours; f->name != 0; f++) - { - if (strcmp (ps->flavour, f->name) == 0) - { 
- attr.type = f->type; - attr.config = f->config; - break; - } - } - - if (strcmp (ps->flavour, "?") == 0) - { - const struct flavour_t *f; - fprintf (stderr, "I know: "); - for (f = flavours; f->name != 0; f++) - fprintf (stderr, "%s ", f->name); - fprintf(stderr, "\n"); - } - else if (! f->name) - { - fprintf (stderr, "unknown profiling flavour `%s', S[?] to list\n", ps->flavour); - } - - attr.size = sizeof(struct perf_event_attr); - attr.sample_type = PERF_SAMPLE_IP; - /* attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; */ - attr.disabled=1; - attr.pinned=1; - attr.exclude_kernel=1; - attr.exclude_hv=1; - - attr.sample_period = ps->interval; - /* attr.watermark=0; */ - /* attr.wakeup_events=1; */ - - int fd = perf_event_open(&attr, 0, -1, -1, 0); - if (fd == -1) - { - printf ("! perf_event_open %m\n"); - } - - ps->perf_event_fd = fd; - - fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC); - fcntl(fd, F_SETSIG, SIGPROF); - fcntl(fd, F_SETOWN, getpid()); - - ioctl(fd, PERF_EVENT_IOC_RESET, 0); - - int err = ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); - if (err != 0) - printf ("! perf_events enable\n"); -} - - - /* Start profiling timer. 
*/ static void profile_timer_start(ProfileState *ps) { - struct sigaction sa = { - .sa_flags = SA_RESTART, - .sa_handler = profile_signal - }; - + int interval = ps->interval; + struct itimerval tm; + struct sigaction sa; + tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000; + tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000; + setitimer(ITIMER_PROF, &tm, NULL); + sa.sa_flags = SA_RESTART; + sa.sa_handler = profile_signal; sigemptyset(&sa.sa_mask); sigaction(SIGPROF, &sa, &ps->oldsa); - - if (strcmp(ps->flavour, "vanilla") == 0) - { - int interval = ps->interval; - struct itimerval tm; - tm.it_value.tv_sec = tm.it_interval.tv_sec = interval / 1000; - tm.it_value.tv_usec = tm.it_interval.tv_usec = (interval % 1000) * 1000; - setitimer(ITIMER_PROF, &tm, NULL); - } - else - { - register_prof_events(ps); - } } - - /* Stop profiling timer. */ static void profile_timer_stop(ProfileState *ps) { - if (ps->perf_event_fd) - { - ioctl(ps->perf_event_fd, PERF_EVENT_IOC_DISABLE, 0); - } - else - { - struct itimerval tm; - tm.it_value.tv_sec = tm.it_interval.tv_sec = 0; - tm.it_value.tv_usec = tm.it_interval.tv_usec = 0; - setitimer(ITIMER_PROF, &tm, NULL); - sigaction(SIGPROF, &ps->oldsa, NULL); - } + struct itimerval tm; + tm.it_value.tv_sec = tm.it_interval.tv_sec = 0; + tm.it_value.tv_usec = tm.it_interval.tv_usec = 0; + setitimer(ITIMER_PROF, &tm, NULL); + sigaction(SIGPROF, &ps->oldsa, NULL); } #elif LJ_PROFILE_PTHREAD @@ -463,8 +300,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode, { ProfileState *ps = &profile_state; int interval = LJ_PROFILE_INTERVAL_DEFAULT; - char *flavour; - while (*mode) { int m = *mode++; switch (m) { @@ -480,13 +315,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode, lj_trace_flushall(L); break; #endif - case 'S': - { - int k; - if (sscanf (mode, "[%m[^]]]%n", &flavour, &k) > 0) - mode += k; - } - default: /* Ignore unknown mode chars. 
*/ break; } @@ -500,7 +328,6 @@ LUA_API void luaJIT_profile_start(lua_State *L, const char *mode, ps->cb = cb; ps->data = data; ps->samples = 0; - ps->flavour = flavour; lj_buf_init(L, &ps->sb); profile_timer_start(ps); } diff --git a/lib/luajit/src/lj_snap.c b/lib/luajit/src/lj_snap.c index fa9abb7475..62515ed0f6 100644 --- a/lib/luajit/src/lj_snap.c +++ b/lib/luajit/src/lj_snap.c @@ -26,9 +26,6 @@ #include "lj_cdata.h" #endif -/* Some local macros to save typing. Undef'd at the end. */ -#define IR(ref) (&J->cur.ir[(ref)]) - /* Pass IR on to next optimization in chain (FOLD). */ #define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J)) @@ -73,7 +70,7 @@ static MSize snapshot_slots(jit_State *J, SnapEntry *map, BCReg nslots) IRRef ref = tref_ref(tr); if (ref) { SnapEntry sn = SNAP_TR(s, tr); - IRIns *ir = IR(ref); + IRIns *ir = &J->cur.ir[ref]; if (!(sn & (SNAP_CONT|SNAP_FRAME)) && ir->o == IR_SLOAD && ir->op1 == s && ref > retf) { /* No need to snapshot unmodified non-inherited slots. */ @@ -407,24 +404,24 @@ static TRef snap_pref(jit_State *J, GCtrace *T, SnapEntry *map, MSize nmax, } /* Check whether a sunk store corresponds to an allocation. Slow path. */ -static int snap_sunk_store2(jit_State *J, IRIns *ira, IRIns *irs) +static int snap_sunk_store2(GCtrace *T, IRIns *ira, IRIns *irs) { if (irs->o == IR_ASTORE || irs->o == IR_HSTORE || irs->o == IR_FSTORE || irs->o == IR_XSTORE) { - IRIns *irk = IR(irs->op1); + IRIns *irk = &T->ir[irs->op1]; if (irk->o == IR_AREF || irk->o == IR_HREFK) - irk = IR(irk->op1); - return (IR(irk->op1) == ira); + irk = &T->ir[irk->op1]; + return (&T->ir[irk->op1] == ira); } return 0; } /* Check whether a sunk store corresponds to an allocation. Fast path. */ -static LJ_AINLINE int snap_sunk_store(jit_State *J, IRIns *ira, IRIns *irs) +static LJ_AINLINE int snap_sunk_store(GCtrace *T, IRIns *ira, IRIns *irs) { if (irs->s != 255) return (ira + irs->s == irs); /* Fast check. 
*/ - return snap_sunk_store2(J, ira, irs); + return snap_sunk_store2(T, ira, irs); } /* Replay snapshot state to setup side trace. */ @@ -487,7 +484,7 @@ void lj_snap_replay(jit_State *J, GCtrace *T) } else { IRIns *irs; for (irs = ir+1; irs < irlast; irs++) - if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) { + if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) { if (snap_pref(J, T, map, nent, seen, irs->op2) == 0) snap_pref(J, T, map, nent, seen, T->ir[irs->op2].op1); else if ((LJ_SOFTFP || (LJ_32 && LJ_HASFFI)) && @@ -521,13 +518,13 @@ void lj_snap_replay(jit_State *J, GCtrace *T) op2 = emitir_raw(IRT(IR_HIOP, IRT_I64), op2, snap_pref(J, T, map, nent, seen, (ir+1)->op2)); } - J->slot[snap_slot(sn)] = emitir(ir->ot, op1, op2); + J->slot[snap_slot(sn)] = emitir(ir->ot & ~(IRT_MARK|IRT_ISPHI), op1, op2); } else { IRIns *irs; TRef tr = emitir(ir->ot, op1, op2); J->slot[snap_slot(sn)] = tr; for (irs = ir+1; irs < irlast; irs++) - if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) { + if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) { IRIns *irr = &T->ir[irs->op1]; TRef val, key = irr->op2, tmp = tr; if (irr->o != IR_FREF) { @@ -714,8 +711,9 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, if (ir->o == IR_CNEW || ir->o == IR_CNEWI) { CTState *cts = ctype_cts(J->L); CTypeID id = (CTypeID)T->ir[ir->op1].i; - CTSize sz = lj_ctype_size(cts, id); - GCcdata *cd = lj_cdata_new(cts, id, sz); + CTSize sz; + CTInfo info = lj_ctype_info(cts, id, &sz); + GCcdata *cd = lj_cdata_newx(cts, id, sz, info); setcdataV(J->L, o, cd); if (ir->o == IR_CNEWI) { uint8_t *p = (uint8_t *)cdataptr(cd); @@ -729,7 +727,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, } else { IRIns *irs, *irlast = &T->ir[T->snap[snapno].ref]; for (irs = ir+1; irs < irlast; irs++) - if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) { + if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) { IRIns *iro = &T->ir[T->ir[irs->op1].op2]; uint8_t *p = 
(uint8_t *)cd; CTSize szs; @@ -762,7 +760,7 @@ static void snap_unsink(jit_State *J, GCtrace *T, ExitState *ex, settabV(J->L, o, t); irlast = &T->ir[T->snap[snapno].ref]; for (irs = ir+1; irs < irlast; irs++) - if (irs->r == RID_SINK && snap_sunk_store(J, ir, irs)) { + if (irs->r == RID_SINK && snap_sunk_store(T, ir, irs)) { IRIns *irk = &T->ir[irs->op1]; TValue tmp, *val; lua_assert(irs->o == IR_ASTORE || irs->o == IR_HSTORE || @@ -863,7 +861,6 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr) return pc; } -#undef IR #undef emitir_raw #undef emitir diff --git a/lib/luajit/src/lj_vm.h b/lib/luajit/src/lj_vm.h index b31e22f70f..cb76d7a700 100644 --- a/lib/luajit/src/lj_vm.h +++ b/lib/luajit/src/lj_vm.h @@ -50,7 +50,7 @@ LJ_ASMF void lj_vm_exit_handler(void); LJ_ASMF void lj_vm_exit_interp(void); /* Internal math helper functions. */ -#if LJ_TARGET_PPC || LJ_TARGET_ARM64 +#if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP) #define lj_vm_floor floor #define lj_vm_ceil ceil #else diff --git a/lib/luajit/src/lua.h b/lib/luajit/src/lua.h index c83fd3bbe7..352d29f3cd 100644 --- a/lib/luajit/src/lua.h +++ b/lib/luajit/src/lua.h @@ -226,6 +226,7 @@ LUA_API int (lua_status) (lua_State *L); #define LUA_GCSTEP 5 #define LUA_GCSETPAUSE 6 #define LUA_GCSETSTEPMUL 7 +#define LUA_GCISRUNNING 9 LUA_API int (lua_gc) (lua_State *L, int what, int data); diff --git a/lib/luajit/src/vm_arm.dasc b/lib/luajit/src/vm_arm.dasc index af722f9eac..acc0853bb7 100644 --- a/lib/luajit/src/vm_arm.dasc +++ b/lib/luajit/src/vm_arm.dasc @@ -372,6 +372,17 @@ static void build_subroutines(BuildCtx *ctx) | str CARG1, [BASE, #-4] // Prepend false to error message. | st_vmstate CARG2 | b ->vm_returnc + | + |->vm_unwind_ext: // Complete external unwind. 
+#if !LJ_NO_UNWIND + | push {r0, r1, r2, lr} + | bl extern _Unwind_Complete + | ldr r0, [sp] + | bl extern _Unwind_DeleteException + | pop {r0, r1, r2, lr} + | mov r0, r1 + | bx r2 +#endif | |//----------------------------------------------------------------------- |//-- Grow stack for calls ----------------------------------------------- diff --git a/lib/luajit/src/vm_mips.dasc b/lib/luajit/src/vm_mips.dasc index 134ed569e8..0dba129316 100644 --- a/lib/luajit/src/vm_mips.dasc +++ b/lib/luajit/src/vm_mips.dasc @@ -1,6 +1,9 @@ |// Low-level VM code for MIPS CPUs. |// Bytecode interpreter, fast functions and helper functions. |// Copyright (C) 2005-2015 Mike Pall. See Copyright Notice in luajit.h +|// +|// MIPS soft-float support contributed by Djordje Kovacevic and +|// Stefan Pejic from RT-RK.com, sponsored by Cisco Systems, Inc. | |.arch mips |.section code_op, code_sub @@ -18,6 +21,12 @@ |// Fixed register assignments for the interpreter. |// Don't use: r0 = 0, r26/r27 = reserved, r28 = gp, r29 = sp, r31 = ra | +|.macro .FPU, a, b +|.if FPU +| a, b +|.endif +|.endmacro +| |// The following must be C callee-save (but BASE is often refetched). |.define BASE, r16 // Base of current Lua stack frame. |.define KBASE, r17 // Constants of current Lua function. @@ -31,7 +40,9 @@ | |// Constants for type-comparisons, stores and conversions. C callee-save. |.define TISNIL, r30 +|.if FPU |.define TOBIT, f30 // 2^52 + 2^51. +|.endif | |// The following temporaries are not saved across C calls, except for RA. |.define RA, r23 // Callee-save. @@ -46,6 +57,13 @@ |.define TMP2, r14 |.define TMP3, r15 | +|.if not FPU +|.define SFT1, r2 +|.define SFT2, r3 +|.define SFT3, r4 +|.define SFT4, r5 +|.endif +| |// Calling conventions. |.define CFUNCADDR, r25 |.define CARG1, r4 @@ -56,13 +74,16 @@ |.define CRET1, r2 |.define CRET2, r3 | +|.if FPU |.define FARG1, f12 |.define FARG2, f14 | |.define FRET1, f0 |.define FRET2, f2 +|.endif | |// Stack layout while in interpreter. 
Must match with lj_frame.h. +|.if FPU // MIPS32 hard-float. |.define CFRAME_SPACE, 112 // Delta for sp. | |.define SAVE_ERRF, 124(sp) // 32 bit C frame info. @@ -83,43 +104,76 @@ |.define ARG5_OFS, 16 |.define SAVE_MULTRES, ARG5 | +|//----------------------------------------------------------------------- +|.else // MIPS32 soft-float. +| +|.define CFRAME_SPACE, 88 // Delta for sp. +| +|.define SAVE_ERRF, 100(sp) // 32 bit C frame info. +|.define SAVE_NRES, 96(sp) +|.define SAVE_CFRAME, 92(sp) +|.define SAVE_L, 88(sp) +|//----- 8 byte aligned, ^^^^ 16 byte register save area, owned by interpreter. +|.define SAVE_GPR_, 48 // .. 48+10*4: 32 bit GPR saves. +|.define SAVE_PC, 44(sp) +|.define TEMP_SAVE_6, 40(sp) +|.define TEMP_SAVE_5, 36(sp) +|.define TEMP_SAVE_4, 32(sp) +|.define TEMP_SAVE_3, 28(sp) +|.define TEMP_SAVE_2, 24(sp) +|.define TEMP_SAVE_1, 20(sp) +|//----- 8 byte aligned, ^^^^ 24 byte register save area, owned by caller. +|.define ARG5, 16(sp) +|.define CSAVE_4, 12(sp) +|.define CSAVE_3, 8(sp) +|.define CSAVE_2, 4(sp) +|.define CSAVE_1, 0(sp) +|//----- 8 byte aligned, ^^^^ 16 byte register save area, owned by callee. 
+| +|.define ARG5_OFS, 16 +|.define SAVE_MULTRES, ARG5 +| +|.endif +| +|//----------------------------------------------------------------------- +| |.macro saveregs | addiu sp, sp, -CFRAME_SPACE | sw ra, SAVE_GPR_+9*4(sp) | sw r30, SAVE_GPR_+8*4(sp) -| sdc1 f30, SAVE_FPR_+5*8(sp) +| .FPU sdc1 f30, SAVE_FPR_+5*8(sp) | sw r23, SAVE_GPR_+7*4(sp) | sw r22, SAVE_GPR_+6*4(sp) -| sdc1 f28, SAVE_FPR_+4*8(sp) +| .FPU sdc1 f28, SAVE_FPR_+4*8(sp) | sw r21, SAVE_GPR_+5*4(sp) | sw r20, SAVE_GPR_+4*4(sp) -| sdc1 f26, SAVE_FPR_+3*8(sp) +| .FPU sdc1 f26, SAVE_FPR_+3*8(sp) | sw r19, SAVE_GPR_+3*4(sp) | sw r18, SAVE_GPR_+2*4(sp) -| sdc1 f24, SAVE_FPR_+2*8(sp) +| .FPU sdc1 f24, SAVE_FPR_+2*8(sp) | sw r17, SAVE_GPR_+1*4(sp) | sw r16, SAVE_GPR_+0*4(sp) -| sdc1 f22, SAVE_FPR_+1*8(sp) -| sdc1 f20, SAVE_FPR_+0*8(sp) +| .FPU sdc1 f22, SAVE_FPR_+1*8(sp) +| .FPU sdc1 f20, SAVE_FPR_+0*8(sp) |.endmacro | |.macro restoreregs_ret | lw ra, SAVE_GPR_+9*4(sp) | lw r30, SAVE_GPR_+8*4(sp) -| ldc1 f30, SAVE_FPR_+5*8(sp) +| .FPU ldc1 f30, SAVE_FPR_+5*8(sp) | lw r23, SAVE_GPR_+7*4(sp) | lw r22, SAVE_GPR_+6*4(sp) -| ldc1 f28, SAVE_FPR_+4*8(sp) +| .FPU ldc1 f28, SAVE_FPR_+4*8(sp) | lw r21, SAVE_GPR_+5*4(sp) | lw r20, SAVE_GPR_+4*4(sp) -| ldc1 f26, SAVE_FPR_+3*8(sp) +| .FPU ldc1 f26, SAVE_FPR_+3*8(sp) | lw r19, SAVE_GPR_+3*4(sp) | lw r18, SAVE_GPR_+2*4(sp) -| ldc1 f24, SAVE_FPR_+2*8(sp) +| .FPU ldc1 f24, SAVE_FPR_+2*8(sp) | lw r17, SAVE_GPR_+1*4(sp) | lw r16, SAVE_GPR_+0*4(sp) -| ldc1 f22, SAVE_FPR_+1*8(sp) -| ldc1 f20, SAVE_FPR_+0*8(sp) +| .FPU ldc1 f22, SAVE_FPR_+1*8(sp) +| .FPU ldc1 f20, SAVE_FPR_+0*8(sp) | jr ra | addiu sp, sp, CFRAME_SPACE |.endmacro @@ -270,6 +324,61 @@ |.macro call_extern; jalr CFUNCADDR; .endmacro |.macro jmp_extern; jr CFUNCADDR; .endmacro | +|// Converts int from given reg to double, result in CRET1 and CRET2 regs. +|.if not FPU +|.macro cvti2d, arg +| load_got __floatsidf +| call_extern +|. move CARG1, arg +|.endmacro +|.endif +| +|// Loads a double-word floating-point value. 
+|.macro load_double, fpr, gpr1, gpr2, src +|.if FPU +| ldc1 fpr, src +|.else +| lw gpr1, src +| lw gpr2, 4+src +|.endif +|.endmacro +| +|// Stores a double-word floating-point value. +|.macro store_double, fpr, gpr1, gpr2, dst +|.if FPU +| sdc1 fpr, dst +|.else +| sw gpr1, dst +| sw gpr2, 4+dst +|.endif +|.endmacro +| +|// Loads the first double-word floating-point argument. +|.macro load_farg1, src +| load_double FARG1, CARG1, CARG2, src +|.endmacro +| +|// Loads the second double-word floating-point argument. +|.macro load_farg2, src +| load_double FARG2, CARG3, CARG4, src +|.endmacro +| +|.macro load_double1, src +| load_double f0, SFT1, SFT2, src +|.endmacro +| +|.macro store_double1, dst +| store_double f0, SFT1, SFT2, dst +|.endmacro +| +|.macro load_double2, src +| load_double f2, SFT3, SFT4, src +|.endmacro +| +|.macro store_double2, dst +| store_double f2, SFT3, SFT4, dst +|.endmacro +| |.macro hotcheck, delta, target | srl TMP1, PC, 1 | andi TMP1, TMP1, 126 @@ -354,9 +463,9 @@ static void build_subroutines(BuildCtx *ctx) |. sll TMP2, TMP2, 3 |1: | addiu TMP1, TMP1, -8 - | ldc1 f0, 0(RA) + | load_double1 0(RA) | addiu RA, RA, 8 - | sdc1 f0, 0(BASE) + | store_double1 0(BASE) | bnez TMP1, <1 |. addiu BASE, BASE, 8 | @@ -425,15 +534,15 @@ static void build_subroutines(BuildCtx *ctx) | and sp, CARG1, AT |->vm_unwind_ff_eh: // Landing pad for external unwinder. | lw L, SAVE_L - | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | li TISNIL, LJ_TNIL | lw BASE, L->base | lw DISPATCH, L->glref // Setup pointer to dispatch table. - | mtc1 TMP3, TOBIT + | .FPU mtc1 TMP3, TOBIT | li TMP1, LJ_TFALSE | li_vmstate INTERP | lw PC, FRAME_PC(BASE) // Fetch PC of previous frame. - | cvt.d.s TOBIT, TOBIT + | .FPU cvt.d.s TOBIT, TOBIT | addiu RA, BASE, -8 // Results start at BASE-8. | addiu DISPATCH, DISPATCH, GG_G2DISP | sw TMP1, HI(RA) // Prepend false to error message. 
@@ -498,11 +607,11 @@ static void build_subroutines(BuildCtx *ctx) | lw BASE, L->base | lw TMP1, L->top | lw PC, FRAME_PC(BASE) - | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | subu RD, TMP1, BASE - | mtc1 TMP3, TOBIT + | .FPU mtc1 TMP3, TOBIT | sb r0, L->status - | cvt.d.s TOBIT, TOBIT + | .FPU cvt.d.s TOBIT, TOBIT | li_vmstate INTERP | addiu RD, RD, 8 | st_vmstate @@ -540,13 +649,13 @@ static void build_subroutines(BuildCtx *ctx) |3: // Entry point for vm_cpcall/vm_resume (BASE = base, PC = ftype). | sw L, DISPATCH_GL(cur_L)(DISPATCH) | lw TMP2, L->base // TMP2 = old base (used in vmeta_call). - | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | lw TMP1, L->top - | mtc1 TMP3, TOBIT + | .FPU mtc1 TMP3, TOBIT | addu PC, PC, BASE | subu NARGS8:RC, TMP1, BASE | subu PC, PC, TMP2 // PC = frame delta + frame type - | cvt.d.s TOBIT, TOBIT + | .FPU cvt.d.s TOBIT, TOBIT | li_vmstate INTERP | li TISNIL, LJ_TNIL | st_vmstate @@ -628,7 +737,7 @@ static void build_subroutines(BuildCtx *ctx) |->cont_cat: // RA = resultptr, RB = meta base | lw INS, -4(PC) | addiu CARG2, RB, -16 - | ldc1 f0, 0(RA) + | load_double1 0(RA) | decode_RB8a MULTRES, INS | decode_RA8a RA, INS | decode_RB8b MULTRES @@ -636,11 +745,21 @@ static void build_subroutines(BuildCtx *ctx) | addu TMP1, BASE, MULTRES | sw BASE, L->base | subu CARG3, CARG2, TMP1 + |.if FPU | bne TMP1, CARG2, ->BC_CAT_Z |. sdc1 f0, 0(CARG2) | addu RA, BASE, RA | b ->cont_nop |. sdc1 f0, 0(RA) + |.else + | sw SFT1, 0(CARG2) + | bne TMP1, CARG2, ->BC_CAT_Z + |. sw SFT2, 4(CARG2) + | addu RA, BASE, RA + | sw SFT1, 0(RA) + | b ->cont_nop + |. sw SFT2, 4(RA) + |.endif | |//-- Table indexing metamethods ----------------------------------------- | @@ -663,10 +782,19 @@ static void build_subroutines(BuildCtx *ctx) |. 
sw TMP1, HI(CARG3) | |->vmeta_tgetb: // TMP0 = index + |.if FPU | mtc1 TMP0, f0 | cvt.d.w f0, f0 | addiu CARG3, DISPATCH, DISPATCH_GL(tmptv) | sdc1 f0, 0(CARG3) + |.else + | sw CARG2, TEMP_SAVE_1 //needed to be saved because it's used later in lj_meta_tget + | cvti2d TMP0 + | addiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | sw CRET1, 0(CARG3) + | sw CRET2, 4(CARG3) + | lw CARG2, TEMP_SAVE_1 + |.endif | |->vmeta_tgetv: |1: @@ -678,9 +806,9 @@ static void build_subroutines(BuildCtx *ctx) | // Returns TValue * (finished) or NULL (metamethod). | beqz CRET1, >3 |. addiu TMP1, BASE, -FRAME_CONT - | ldc1 f0, 0(CRET1) + | load_double2 0(CRET1) | ins_next1 - | sdc1 f0, 0(RA) + | store_double2 0(RA) | ins_next2 | |3: // Call __index metamethod. @@ -699,8 +827,14 @@ static void build_subroutines(BuildCtx *ctx) | // Returns cTValue * or NULL. | beqz CRET1, >1 |. nop + |.if FPU | b ->BC_TGETR_Z |. ldc1 f0, 0(CRET1) + |.else + | lw SFT1, 0(CRET1) + | b ->BC_TGETR_Z + |. lw SFT2, 4(CRET1) + |.endif | |//----------------------------------------------------------------------- | @@ -723,10 +857,19 @@ static void build_subroutines(BuildCtx *ctx) |. sw TMP1, HI(CARG3) | |->vmeta_tsetb: // TMP0 = index + |.if FPU | mtc1 TMP0, f0 | cvt.d.w f0, f0 | addiu CARG3, DISPATCH, DISPATCH_GL(tmptv) | sdc1 f0, 0(CARG3) + |.else + | sw CARG2, TEMP_SAVE_1 + | cvti2d TMP0 + | addiu CARG3, DISPATCH, DISPATCH_GL(tmptv) + | sw CRET1, 0(CARG3) + | sw CRET2, 4(CARG3) + | lw CARG2, TEMP_SAVE_1 + |.endif | |->vmeta_tsetv: |1: @@ -736,11 +879,17 @@ static void build_subroutines(BuildCtx *ctx) | call_intern lj_meta_tset // (lua_State *L, TValue *o, TValue *k) |. move CARG1, L | // Returns TValue * (finished) or NULL (metamethod). + |.if FPU | beqz CRET1, >3 - |. ldc1 f0, 0(RA) + |. ldc1 f2, 0(RA) + |.else + | lw SFT3, 0(RA) + | beqz CRET1, >3 + |. lw SFT4, 4(RA) + |.endif | // NOBARRIER: lj_meta_tset ensures the table is not black. 
| ins_next1 - | sdc1 f0, 0(CRET1) + | store_double2 0(CRET1) | ins_next2 | |3: // Call __newindex metamethod. @@ -750,7 +899,7 @@ static void build_subroutines(BuildCtx *ctx) | sw PC, -16+HI(BASE) // [cont|PC] | subu PC, BASE, TMP1 | lw LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. - | sdc1 f0, 16(BASE) // Copy value to third argument. + | store_double2 16(BASE) // Copy value to third argument. | b ->vm_call_dispatch_f |. li NARGS8:RC, 24 // 3 args for func(t, k, v) | @@ -793,11 +942,17 @@ static void build_subroutines(BuildCtx *ctx) | |->cont_ra: // RA = resultptr | lbu TMP1, -4+OFS_RA(PC) - | ldc1 f0, 0(RA) + | load_double1 0(RA) | sll TMP1, TMP1, 3 | addu TMP1, BASE, TMP1 + |.if FPU | b ->cont_nop |. sdc1 f0, 0(TMP1) + |.else + | sw SFT1, 0(TMP1) + | b ->cont_nop + |. sw SFT2, 4(TMP1) + |.endif | |->cont_condt: // RA = resultptr | lw TMP0, HI(RA) @@ -852,7 +1007,22 @@ static void build_subroutines(BuildCtx *ctx) |//-- Arithmetic metamethods --------------------------------------------- | |->vmeta_unm: - | move CARG4, CARG3 + | b ->vmeta_arith + |. move CARG4, CARG3 + | + |->vmeta_arith_vn: + | addu CARG3, BASE, RB + | b ->vmeta_arith + |. addu CARG4, KBASE, RC + | + |->vmeta_arith_nv: + | addu CARG4, BASE, RB + | b ->vmeta_arith + |. addu CARG3, KBASE, RC + | + |->vmeta_arith_vv: + | addu CARG3, BASE, RB + | addu CARG4, BASE, RC | |->vmeta_arith: | load_got lj_meta_arith @@ -985,9 +1155,9 @@ static void build_subroutines(BuildCtx *ctx) |.macro .ffunc_n, name // Caveat: has delay slot! |->ff_ .. name: | lw CARG3, HI(BASE) + | load_farg1 0(BASE) | beqz NARGS8:RC, ->fff_fallback - |. ldc1 FARG1, 0(BASE) - | sltiu AT, CARG3, LJ_TISNUM + |. sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback |.endmacro | @@ -997,10 +1167,10 @@ static void build_subroutines(BuildCtx *ctx) | lw CARG3, HI(BASE) | bnez AT, ->fff_fallback |. 
lw CARG4, 8+HI(BASE) - | ldc1 FARG1, 0(BASE) - | ldc1 FARG2, 8(BASE) | sltiu TMP0, CARG3, LJ_TISNUM | sltiu TMP1, CARG4, LJ_TISNUM + | load_farg1 0(BASE) + | load_farg2 8(BASE) | and TMP0, TMP0, TMP1 | beqz TMP0, ->fff_fallback |.endmacro @@ -1027,8 +1197,8 @@ static void build_subroutines(BuildCtx *ctx) | beq BASE, TMP2, ->fff_res // Done if exactly 1 argument. |. sw CARG1, LO(RA) |1: - | ldc1 f0, 0(TMP1) - | sdc1 f0, -8(TMP1) + | load_double1 0(TMP1) + | store_double1 -8(TMP1) | bne TMP1, TMP2, <1 |. addiu TMP1, TMP1, 8 | b ->fff_res @@ -1043,8 +1213,14 @@ static void build_subroutines(BuildCtx *ctx) | not TMP1, TMP1 | sll TMP1, TMP1, 3 | addu TMP1, CFUNC:RB, TMP1 + |.if HFABI | b ->fff_resn |. ldc1 FRET1, CFUNC:TMP1->upvalue + |.else + | lw CRET1, CFUNC:TMP1->upvalue[0].u32.hi + | b ->fff_resn + |. lw CRET2, CFUNC:TMP1->upvalue[0].u32.lo + |.endif | |//-- Base library: getters and setters --------------------------------- | @@ -1125,8 +1301,14 @@ static void build_subroutines(BuildCtx *ctx) | call_intern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key) |. move CARG1, L | // Returns cTValue *. + |.if HFABI | b ->fff_resn |. ldc1 FRET1, 0(CRET1) + |.else + | lw CRET2, 4(CRET1) + | b ->fff_resn + |. lw CRET1, 0(CRET1) + |.endif | |//-- Base library: conversions ------------------------------------------ | @@ -1136,8 +1318,14 @@ static void build_subroutines(BuildCtx *ctx) | xori AT, NARGS8:RC, 8 | sltiu CARG1, CARG1, LJ_TISNUM | movn CARG1, r0, AT + |.if HFABI | beqz CARG1, ->fff_fallback // Exactly one number argument. |. ldc1 FRET1, 0(BASE) + |.else + | lw CRET1, 0(BASE) + | beqz CARG1, ->fff_fallback // Exactly one number argument. + |. lw CRET2, 4(BASE) + |.endif | b ->fff_resn |. nop | @@ -1185,13 +1373,13 @@ static void build_subroutines(BuildCtx *ctx) | // Returns 0 at end of traversal. | beqz CRET1, ->fff_restv // End of traversal: return nil. |. li CARG3, LJ_TNIL - | ldc1 f0, 8(BASE) // Copy key and value to results. 
+ | load_double1 8(BASE) | addiu RA, BASE, -8 - | ldc1 f2, 16(BASE) - | li RD, (2+1)*8 - | sdc1 f0, 0(RA) + | load_double2 16(BASE) + | store_double1 0(RA) + | store_double2 8(RA) | b ->fff_res - |. sdc1 f2, 8(RA) + |. li RD, (2+1)*8 | |.ffunc_1 pairs | li AT, LJ_TTAB @@ -1199,16 +1387,32 @@ static void build_subroutines(BuildCtx *ctx) |. lw PC, FRAME_PC(BASE) #if LJ_52 | lw TAB:TMP2, TAB:CARG1->metatable + |.if FPU | ldc1 f0, CFUNC:RB->upvalue[0] + |.else + | lw SFT1, CFUNC:RB->upvalue[0].u32.hi + | lw SFT2, CFUNC:RB->upvalue[0].u32.lo + |.endif | bnez TAB:TMP2, ->fff_fallback #else + |.if FPU | ldc1 f0, CFUNC:RB->upvalue[0] + |.else + | lw SFT1, CFUNC:RB->upvalue[0].u32.hi + | lw SFT2, CFUNC:RB->upvalue[0].u32.lo + |.endif #endif |. addiu RA, BASE, -8 | sw TISNIL, 8+HI(BASE) | li RD, (3+1)*8 + |.if FPU | b ->fff_res |. sdc1 f0, 0(RA) + |.else + | sw SFT1, 0(RA) + | b ->fff_res + |. sw SFT2, 4(RA) + |.endif | |.ffunc ipairs_aux | sltiu AT, NARGS8:RC, 16 @@ -1216,35 +1420,55 @@ static void build_subroutines(BuildCtx *ctx) | lw TAB:CARG1, LO(BASE) | lw CARG4, 8+HI(BASE) | bnez AT, ->fff_fallback - |. ldc1 FARG2, 8(BASE) - | addiu CARG3, CARG3, -LJ_TTAB + |. addiu CARG3, CARG3, -LJ_TTAB | sltiu AT, CARG4, LJ_TISNUM | li TMP0, 1 | movn AT, r0, CARG3 - | mtc1 TMP0, FARG1 | beqz AT, ->fff_fallback |. lw PC, FRAME_PC(BASE) + |.if FPU + | ldc1 FARG2, 8(BASE) + | mtc1 TMP0, FARG1 | trunc.w.d FRET1, FARG2 | cvt.d.w FARG1, FARG1 - | lw TMP0, TAB:CARG1->asize - | lw TMP1, TAB:CARG1->array | mfc1 TMP2, FRET1 - | addiu RA, BASE, -8 | add.d FARG2, FARG2, FARG1 + |.else + | sw CARG1, TEMP_SAVE_1 + | cvti2d TMP0 + | sw CRET1, TEMP_SAVE_2 // Store result CRET1/CRET2=1 (double). + | sw CRET2, TEMP_SAVE_3 + | lw CARG2, 8+4(BASE) + | load_got __fixdfsi + | call_extern + |. lw CARG1, 8(BASE) + | sw CRET1, TEMP_SAVE_4 + | load_got __adddf3 + | lw CARG2, TEMP_SAVE_3 + | lw CARG3, 8(BASE) + | lw CARG4, 8+4(BASE) + | call_extern + |. 
lw CARG1, TEMP_SAVE_2 + | lw TMP2, TEMP_SAVE_4 + | lw CARG1, TEMP_SAVE_1 + |.endif + | lw TMP0, TAB:CARG1->asize + | lw TMP1, TAB:CARG1->array | addiu TMP2, TMP2, 1 | sltu AT, TMP2, TMP0 + | beqz AT, >2 // Not in array part? + |. addiu RA, BASE, -8 + | store_double FARG2, CRET1, CRET2, 0(RA) | sll TMP3, TMP2, 3 | addu TMP3, TMP1, TMP3 - | beqz AT, >2 // Not in array part? - |. sdc1 FARG2, 0(RA) | lw TMP2, HI(TMP3) - | ldc1 f0, 0(TMP3) + | load_double1 0(TMP3) |1: | beq TMP2, TISNIL, ->fff_res // End of iteration, return 0 results. |. li RD, (0+1)*8 - | li RD, (2+1)*8 + | store_double1 8(RA) | b ->fff_res - |. sdc1 f0, 8(RA) + |. li RD, (2+1)*8 |2: // Check for empty hash part first. Otherwise call C function. | lw TMP0, TAB:CARG1->hmask | load_got lj_tab_getinth @@ -1256,8 +1480,14 @@ static void build_subroutines(BuildCtx *ctx) | beqz CRET1, ->fff_res |. li RD, (0+1)*8 | lw TMP2, HI(CRET1) + |.if FPU | b <1 |. ldc1 f0, 0(CRET1) + |.else + | lw SFT2, 4(CRET1) + | b <1 + |. lw SFT1, 0(CRET1) + |.endif | |.ffunc_1 ipairs | li AT, LJ_TTAB @@ -1265,17 +1495,33 @@ static void build_subroutines(BuildCtx *ctx) |. lw PC, FRAME_PC(BASE) #if LJ_52 | lw TAB:TMP2, TAB:CARG1->metatable + |.if FPU | ldc1 f0, CFUNC:RB->upvalue[0] + |.else + | lw SFT1, CFUNC:RB->upvalue[0].u32.hi + | lw SFT2, CFUNC:RB->upvalue[0].u32.lo + |.endif | bnez TAB:TMP2, ->fff_fallback #else + |.if FPU | ldc1 f0, CFUNC:RB->upvalue[0] + |.else + | lw SFT1, CFUNC:RB->upvalue[0].u32.hi + | lw SFT2, CFUNC:RB->upvalue[0].u32.lo + |.endif #endif |. addiu RA, BASE, -8 | sw r0, 8+HI(BASE) | sw r0, 8+LO(BASE) | li RD, (3+1)*8 + |.if FPU | b ->fff_res |. sdc1 f0, 0(RA) + |.else + | sw SFT1, 0(RA) + | b ->fff_res + |. sw SFT2, 4(RA) + |.endif | |//-- Base library: catch errors ---------------------------------------- | @@ -1295,8 +1541,12 @@ static void build_subroutines(BuildCtx *ctx) | sltiu AT, NARGS8:RC, 16 | lw CARG4, 8+HI(BASE) | bnez AT, ->fff_fallback + |.if FPU |. 
ldc1 FARG2, 8(BASE) - | ldc1 FARG1, 0(BASE) + |.else + |. lw CARG3, 8+LO(BASE) + |.endif + | load_double FARG1, CARG1, CARG2, 0(BASE) | lbu TMP1, DISPATCH_GL(hookmask)(DISPATCH) | li AT, LJ_TFUNC | move TMP2, BASE @@ -1304,9 +1554,14 @@ static void build_subroutines(BuildCtx *ctx) | addiu BASE, BASE, 16 | // Remember active hook before pcall. | srl TMP3, TMP3, HOOK_ACTIVE_SHIFT + |.if FPU | sdc1 FARG2, 0(TMP2) // Swap function and traceback. + |.else + | sw CARG3, LO(TMP2) + | sw CARG4, HI(TMP2) + |.endif | andi TMP3, TMP3, 1 - | sdc1 FARG1, 8(TMP2) + | store_double FARG1, CARG1, CARG2, 8(TMP2) | addiu PC, TMP3, 16+FRAME_PCALL | b ->vm_call_dispatch |. addiu NARGS8:RC, NARGS8:RC, -16 @@ -1350,11 +1605,11 @@ static void build_subroutines(BuildCtx *ctx) | move CARG3, CARG2 | sw BASE, L->top |2: // Move args to coroutine. - | ldc1 f0, 0(BASE) + | load_double1 0(BASE) | sltu AT, BASE, TMP1 | beqz AT, >3 |. addiu BASE, BASE, 8 - | sdc1 f0, 0(CARG3) + | store_double1 0(CARG3) | b <2 |. addiu CARG3, CARG3, 8 |3: @@ -1380,10 +1635,10 @@ static void build_subroutines(BuildCtx *ctx) | sw TMP2, L:RA->top // Clear coroutine stack. | move TMP1, BASE |5: // Move results from coroutine. - | ldc1 f0, 0(TMP2) + | load_double1 0(TMP2) | addiu TMP2, TMP2, 8 | sltu AT, TMP2, TMP3 - | sdc1 f0, 0(TMP1) + | store_double1 0(TMP1) | bnez AT, <5 |. addiu TMP1, TMP1, 8 |6: @@ -1408,12 +1663,12 @@ static void build_subroutines(BuildCtx *ctx) |.if resume | addiu TMP3, TMP3, -8 | li TMP1, LJ_TFALSE - | ldc1 f0, 0(TMP3) + | load_double1 0(TMP3) | sw TMP3, L:RA->top // Remove error from coroutine stack. | li RD, (2+1)*8 | sw TMP1, -8+HI(BASE) // Prepend false to results. | addiu RA, BASE, -8 - | sdc1 f0, 0(BASE) // Copy error message. + | store_double1 0(BASE) // Copy error message. | b <7 |. 
andi TMP0, PC, FRAME_TYPE |.else @@ -1449,13 +1704,33 @@ static void build_subroutines(BuildCtx *ctx) | |//-- Math library ------------------------------------------------------- | - |.ffunc_n math_abs + |.ffunc_1 math_abs + | load_farg1 0(BASE) + | sltiu AT, CARG3, LJ_TISNUM + | beqz AT, ->fff_fallback + |. nop + |.if FPU |. abs.d FRET1, FARG1 + |.else + |. lui TMP1, 0x8000 + | and AT, CARG1, TMP1 + | move CRET2, CARG2 + | beqz AT, ->fff_resn + |. move CRET1, CARG1 + | xor CRET1, CARG1, TMP1 + |.endif + | |->fff_resn: | lw PC, FRAME_PC(BASE) | addiu RA, BASE, -8 + |.if HFABI | b ->fff_res1 |. sdc1 FRET1, -8(BASE) + |.else + | sw CRET1, -8(BASE) + | b ->fff_res1 + |. sw CRET2, -8+4(BASE) + |.endif | |->fff_restv: | // CARG3/CARG1 = TValue result. @@ -1498,8 +1773,14 @@ static void build_subroutines(BuildCtx *ctx) | sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback |. nop + |.if HFABI | call_extern |. ldc1 FARG1, 0(BASE) + |.else + | lw CARG1, 0(BASE) + | call_extern + |. lw CARG2, 4(BASE) + |.endif | b ->fff_resn |. nop |.endmacro @@ -1526,15 +1807,20 @@ static void build_subroutines(BuildCtx *ctx) | math_round ceil | |.ffunc math_log - | lw CARG3, HI(BASE) | li AT, 8 | bne NARGS8:RC, AT, ->fff_fallback // Exactly 1 argument. - |. load_got log + |. lw CARG3, HI(BASE) | sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback - |. nop + |. load_got log + |.if HFABI | call_extern |. ldc1 FARG1, 0(BASE) + |.else + | lw CARG1, 0(BASE) + | call_extern + |. lw CARG2, 4(BASE) + |.endif | b ->fff_resn |. nop | @@ -1553,17 +1839,40 @@ static void build_subroutines(BuildCtx *ctx) | math_extern2 atan2 | math_extern2 fmod | + |.if FPU |.ffunc_n math_sqrt |. sqrt.d FRET1, FARG1 | b ->fff_resn |. 
nop + |.else + | math_extern sqrt + |.endif | - |.ffunc_nn math_ldexp + |.ffunc_2 math_ldexp + | sltiu TMP0, CARG3, LJ_TISNUM + | sltiu TMP1, CARG4, LJ_TISNUM + | load_farg1 0(BASE) + | load_farg2 8(BASE) + | and TMP0, TMP0, TMP1 + | beqz TMP0, ->fff_fallback + |.if FPU + | load_got ldexp | trunc.w.d FARG2, FARG2 + | call_extern + |. mfc1 CARG3, FARG2 + |.else + | sw CARG1, TEMP_SAVE_1 + | sw CARG2, TEMP_SAVE_2 + | load_got __fixdfsi + | move CARG1, CARG3 + | call_extern + |. move CARG2, CARG4 + | lw CARG1, TEMP_SAVE_1 | load_got ldexp - | mfc1 CARG3, FARG2 + | lw CARG2, TEMP_SAVE_2 | call_extern - |. nop + |. move CARG3, CRET1 + |.endif | b ->fff_resn |. nop | @@ -1574,10 +1883,14 @@ static void build_subroutines(BuildCtx *ctx) |. addiu CARG3, DISPATCH, DISPATCH_GL(tmptv) | lw TMP1, DISPATCH_GL(tmptv)(DISPATCH) | addiu RA, BASE, -8 + | store_double FRET1, CRET1, CRET2, 0(RA) + |.if FPU | mtc1 TMP1, FARG2 - | sdc1 FRET1, 0(RA) | cvt.d.w FARG2, FARG2 - | sdc1 FARG2, 8(RA) + |.else + | cvti2d TMP1 + |.endif + | store_double FARG2, CRET1, CRET2, 8(RA) | b ->fff_res |. li RD, (2+1)*8 | @@ -1587,7 +1900,12 @@ static void build_subroutines(BuildCtx *ctx) | call_extern |. addiu CARG3, BASE, -8 | addiu RA, BASE, -8 + |.if HFABI | sdc1 FRET1, 0(BASE) + |.else + | sw CRET1, 0(BASE) + | sw CRET2, 4(BASE) + |.endif | b ->fff_res |. li RD, (2+1)*8 | @@ -1595,25 +1913,73 @@ static void build_subroutines(BuildCtx *ctx) |->ff_ .. name: | lw CARG3, HI(BASE) | beqz NARGS8:RC, ->fff_fallback - |. ldc1 FRET1, 0(BASE) - | sltiu AT, CARG3, LJ_TISNUM + |. sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback |. addu TMP2, BASE, NARGS8:RC | addiu TMP1, BASE, 8 + |.if HFABI + | ldc1 FRET1, 0(BASE) | beq TMP1, TMP2, ->fff_resn + |.else + | lw CRET1, 0(BASE) + | lw CRET2, 4(BASE) + | beq TMP1, TMP2, ->fff_resn + |.endif |1: |. 
lw CARG3, HI(TMP1) + |.if HFABI | ldc1 FARG1, 0(TMP1) - | addiu TMP1, TMP1, 8 + |.else + | lw CARG1, 0(TMP1) + | lw CARG2, 4(TMP1) + |.endif | sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback + |. addiu TMP1, TMP1, 8 + |.if FPU |.if ismax - |. c.olt.d FARG1, FRET1 + | c.olt.d FARG1, FRET1 |.else - |. c.olt.d FRET1, FARG1 + | c.olt.d FRET1, FARG1 |.endif | bne TMP1, TMP2, <1 |. movf.d FRET1, FARG1 + |.else + | load_got __ledf2 + | sw TMP1, TEMP_SAVE_1 + | sw TMP2, TEMP_SAVE_2 + | sw CARG1, TEMP_SAVE_3 + | sw CARG2, TEMP_SAVE_4 + | sw CRET1, TEMP_SAVE_5 + | sw CRET2, TEMP_SAVE_6 + | move CARG3, CRET1 + | call_extern + |. move CARG4, CRET2 + | lw CARG4, TEMP_SAVE_6 + | lw CARG3, TEMP_SAVE_5 + | lw CARG2, TEMP_SAVE_4 + | lw CARG1, TEMP_SAVE_3 + | lw TMP2, TEMP_SAVE_2 + | lw TMP1, TEMP_SAVE_1 + |.if ismax + | beqz CRET1, >2 // farg1==fret1 + |. li TMP3, 1 + | beq CRET1, TMP3, >2 // farg1>fret1 + |. nop + |.else + | blez CRET1, >2 + |. nop + |.endif + | move CRET1, CARG3 // Keep the value. + | b >3 + |. move CRET2, CARG4 + |2: + | move CRET1, CARG1 // Set new value. + | move CRET2, CARG2 + |3: + | bne TMP1, TMP2, <1 + |. nop + |.endif | b ->fff_resn |. nop |.endmacro @@ -1632,32 +1998,52 @@ static void build_subroutines(BuildCtx *ctx) | bnez AT, ->fff_fallback // Need exactly 1 string argument. |. nop | lw TMP0, STR:CARG1->len - | lbu TMP1, STR:CARG1[1] // Access is always ok (NUL at end). | addiu RA, BASE, -8 | sltu RD, r0, TMP0 - | mtc1 TMP1, f0 + | lw PC, FRAME_PC(BASE) | addiu RD, RD, 1 + | lbu TMP1, STR:CARG1[1] // Access is always ok (NUL at end). + |.if FPU + | mtc1 TMP1, f0 | cvt.d.w f0, f0 - | lw PC, FRAME_PC(BASE) - | sll RD, RD, 3 // RD = ((str->len != 0)+1)*8 + | sdc1 f0, 0(RA) + |.else + | sw RD, TEMP_SAVE_1 + | cvti2d TMP1 + | sw CRET1, 0(RA) + | sw CRET2, 4(RA) + | lw RD, TEMP_SAVE_1 + |.endif | b ->fff_res - |. sdc1 f0, 0(RA) + |. sll RD, RD, 3 // RD = ((str->len != 0)+1)*8 | |.ffunc string_char // Only handle the 1-arg case here. 
| ffgccheck | lw CARG3, HI(BASE) - | ldc1 FARG1, 0(BASE) | li AT, 8 | bne NARGS8:RC, AT, ->fff_fallback // Exactly 1 argument. |. sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback |. li CARG3, 1 - | trunc.w.d FARG1, FARG1 - | addiu CARG2, sp, ARG5_OFS | sltiu AT, TMP0, 256 - | mfc1 TMP0, FARG1 | beqz AT, ->fff_fallback - |. sw TMP0, ARG5 + | load_farg1 0(BASE) + |.if FPU + | trunc.w.d FARG1, FARG1 + | mfc1 TMP0, FARG1 + |.else + | load_got __fixdfsi + | sw RB, TEMP_SAVE_1 + | sw RC, TEMP_SAVE_2 + | call_extern + |. sw CARG3, TEMP_SAVE_3 + | lw CARG3, TEMP_SAVE_3 + | lw RC, TEMP_SAVE_2 + | lw RB, TEMP_SAVE_1 + | move TMP0, CRET1 + |.endif + | addiu CARG2, sp, ARG5_OFS + | sw TMP0, ARG5 |->fff_newstr: | load_got lj_str_new | sw BASE, L->base @@ -1674,27 +2060,52 @@ static void build_subroutines(BuildCtx *ctx) |.ffunc string_sub | ffgccheck | addiu AT, NARGS8:RC, -16 + |.if FPU + | ldc1 f0, 16(BASE) + | trunc.w.d f0, f0 + |.else + | lw CARG1, 16(BASE) + | load_got __fixdfsi + | sw AT, TEMP_SAVE_1 + | call_extern + |. lw CARG2, 16+4(BASE) + | lw AT, TEMP_SAVE_1 + |.endif | lw CARG3, 16+HI(BASE) - | ldc1 f0, 16(BASE) | lw TMP0, HI(BASE) | lw STR:CARG1, LO(BASE) | bltz AT, ->fff_fallback - | lw CARG2, 8+HI(BASE) - | ldc1 f2, 8(BASE) + |. lw CARG2, 8+HI(BASE) | beqz AT, >1 |. li CARG4, -1 - | trunc.w.d f0, f0 | sltiu AT, CARG3, LJ_TISNUM | beqz AT, ->fff_fallback + |.if FPU |. mfc1 CARG4, f0 + |.else + |. move CARG4, CRET1 + |.endif |1: | sltiu AT, CARG2, LJ_TISNUM | beqz AT, ->fff_fallback |. li AT, LJ_TSTR - | trunc.w.d f2, f2 | bne TMP0, AT, ->fff_fallback - |. lw CARG2, STR:CARG1->len + |.if FPU + |. ldc1 f2, 8(BASE) + | trunc.w.d f2, f2 | mfc1 CARG3, f2 + |.else + |. sw CARG1, TEMP_SAVE_1 + | sw CARG4, TEMP_SAVE_2 + | lw CARG2, 8+4(BASE) + | load_got __fixdfsi + | call_extern + |. 
lw CARG1, 8(BASE) + | lw CARG1, TEMP_SAVE_1 + | lw CARG4, TEMP_SAVE_2 + | move CARG3, CRET1 + |.endif + | lw CARG2, STR:CARG1->len | // STR:CARG1 = str, CARG2 = str->len, CARG3 = start, CARG4 = end | slt AT, CARG4, r0 | addiu TMP0, CARG2, 1 @@ -1749,10 +2160,58 @@ static void build_subroutines(BuildCtx *ctx) | |//-- Bit library -------------------------------------------------------- | + |.if not FPU + |// FP number to bit conversion for soft-float. + |->vm_tobit: + | sll TMP0, CARG1, 1 + | lui TMP3, 0x0020 + | addu TMP0, TMP0, TMP3 + | slt TMP3, TMP0, r0 + | movz CARG2, r0, TMP3 + | beqz TMP3, >2 + |. li CARG4, 0x3e0 + | not CARG4, CARG4 + | sra TMP0, TMP0, 21 + | subu TMP0, CARG4, TMP0 + | slt TMP3, TMP0, r0 + | bnez TMP3, >1 + |. sll CARG4, CARG1, 11 + | lui TMP3, 0x8000 + | or CARG4, CARG4, TMP3 + | srl TMP3, CARG2, 21 + | or CARG4, CARG4, TMP3 + | slt TMP3, CARG1, r0 + | beqz TMP3, >2 + |. srlv CARG2, CARG4, TMP0 + | subu CARG2, r0, CARG2 + |2: + | jr ra + |. move CRET1, CARG2 + |1: + | addiu TMP0, TMP0, 21 + | srlv CARG4, CARG2, TMP0 + | li TMP3, 20 + | subu TMP0, TMP3, TMP0 + | sll CARG2, CARG1, 12 + | sllv TMP3, CARG2, TMP0 + | or CARG2, CARG4, TMP3 + | slt TMP3, CARG1, r0 + | beqz TMP3, <2 + |. nop + | jr ra + |. subu CRET1, r0, CARG2 + |.endif + | |.macro .ffunc_bit, name | .ffunc_n bit_..name + |.if FPU |. add.d FARG1, FARG1, TOBIT | mfc1 CRET1, FARG1 + |.else + |. nop + | bal ->vm_tobit + |. nop + |.endif |.endmacro | |.macro .ffunc_bit_op, name, ins @@ -1760,14 +2219,27 @@ static void build_subroutines(BuildCtx *ctx) | addiu TMP1, BASE, 8 | addu TMP2, BASE, NARGS8:RC |1: + | move CRET2, CRET1 | lw CARG4, HI(TMP1) + |.if FPU | beq TMP1, TMP2, ->fff_resi |. ldc1 FARG1, 0(TMP1) + |.else + | lw CARG1, 0(TMP1) + | beq TMP1, TMP2, ->fff_resi + |. lw CARG2, 4(TMP1) + |.endif | sltiu AT, CARG4, LJ_TISNUM | beqz AT, ->fff_fallback - | add.d FARG1, FARG1, TOBIT - | mfc1 CARG2, FARG1 - | ins CRET1, CRET1, CARG2 + |.if FPU + |. 
add.d FARG1, FARG1, TOBIT + | mfc1 CRET1, FARG1 + |.else + |. nop + | bal ->vm_tobit + |. nop + |.endif + | ins CRET1, CRET2, CRET1 | b <1 |. addiu TMP1, TMP1, 8 |.endmacro @@ -1794,10 +2266,22 @@ static void build_subroutines(BuildCtx *ctx) | |.macro .ffunc_bit_sh, name, ins, shmod | .ffunc_nn bit_..name + |.if FPU |. add.d FARG1, FARG1, TOBIT | add.d FARG2, FARG2, TOBIT | mfc1 CARG1, FARG1 | mfc1 CARG2, FARG2 + |.else + |. sw CARG4, TEMP_SAVE_1 + | bal ->vm_tobit + |. nop + | move CRET2, CRET1 + | lw CARG2, TEMP_SAVE_1 + | bal ->vm_tobit + |. move CARG1, CARG3 + | move CARG2, CRET1 + | move CARG1, CRET2 + |.endif |.if shmod == 1 | li AT, 32 | subu TMP0, AT, CARG2 @@ -1822,9 +2306,19 @@ static void build_subroutines(BuildCtx *ctx) | |.ffunc_bit tobit |->fff_resi: + | lw PC, FRAME_PC(BASE) + | addiu RA, BASE, -8 + |.if HFABI | mtc1 CRET1, FRET1 - | b ->fff_resn - |. cvt.d.w FRET1, FRET1 + | cvt.d.w FRET1, FRET1 + | b ->fff_res1 + |. sdc1 FRET1, -8(BASE) + |.else // Result already in CRET1. + | cvti2d CRET1 + | sw CRET1, -8(BASE) + | b ->fff_res1 + |. sw CRET2, -8+4(BASE) + |.endif | |//----------------------------------------------------------------------- | @@ -2082,14 +2576,23 @@ static void build_subroutines(BuildCtx *ctx) |//----------------------------------------------------------------------- | |.macro savex_, a, b + |.if FPU | sdc1 f..a, 16+a*8(sp) | sw r..a, 16+32*8+a*4(sp) | sw r..b, 16+32*8+b*4(sp) + |.else + | sw r..a, 16+a*4(sp) + | sw r..b, 16+b*4(sp) + |.endif |.endmacro | |->vm_exit_handler: |.if JIT + |.if FPU | addiu sp, sp, -(16+32*8+32*4) + |.else + | addiu sp, sp, -(16+32*4) + |.endif | savex_ 0, 1 | savex_ 2, 3 | savex_ 4, 5 @@ -2104,17 +2607,25 @@ static void build_subroutines(BuildCtx *ctx) | savex_ 22, 23 | savex_ 24, 25 | savex_ 26, 27 + |.if FPU | sdc1 f28, 16+28*8(sp) - | sw r28, 16+32*8+28*4(sp) | sdc1 f30, 16+30*8(sp) + | sw r28, 16+32*8+28*4(sp) | sw r30, 16+32*8+30*4(sp) | sw r0, 16+32*8+31*4(sp) // Clear RID_TMP. 
+ | addiu TMP2, sp, 16+32*8+32*4 // Recompute original value of sp. + | sw TMP2, 16+32*8+29*4(sp) // Store sp in RID_SP + |.else + | sw r28, 16+28*4(sp) + | sw r30, 16+30*4(sp) + | sw r0, 16+31*4(sp) // Clear RID_TMP. + | addiu TMP2, sp, 16+32*4 // Recompute original value of sp. + | sw TMP2, 16+29*4(sp) // Store sp in RID_SP + |.endif | li_vmstate EXIT - | addiu TMP2, sp, 16+32*8+32*4 // Recompute original value of sp. | addiu DISPATCH, JGL, -GG_DISP2G-32768 | lw TMP1, 0(TMP2) // Load exit number. | st_vmstate - | sw TMP2, 16+32*8+29*4(sp) // Store sp in RID_SP. | lw L, DISPATCH_GL(cur_L)(DISPATCH) | lw BASE, DISPATCH_GL(jit_base)(DISPATCH) | load_got lj_trace_exit @@ -2144,15 +2655,15 @@ static void build_subroutines(BuildCtx *ctx) |1: | bltz CRET1, >9 // Check for error from exit. |. lw LFUNC:RB, FRAME_FUNC(BASE) - | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | sll MULTRES, CRET1, 3 | li TISNIL, LJ_TNIL | sw MULTRES, SAVE_MULTRES - | mtc1 TMP3, TOBIT + | .FPU mtc1 TMP3, TOBIT | lw TMP1, LFUNC:RB->pc | sw r0, DISPATCH_GL(jit_base)(DISPATCH) | lw KBASE, PC2PROTO(k)(TMP1) - | cvt.d.s TOBIT, TOBIT + | .FPU cvt.d.s TOBIT, TOBIT | // Modified copy of ins_next which handles function header dispatch, too. | lw INS, 0(PC) | addiu PC, PC, 4 @@ -2160,7 +2671,7 @@ static void build_subroutines(BuildCtx *ctx) | sw TISNIL, DISPATCH_GL(vmstate)(DISPATCH) | decode_OP4a TMP1, INS | decode_OP4b TMP1 - | sltiu TMP2, TMP1, BC_FUNCF*4 // Function header? + | sltiu TMP2, TMP1, BC_FUNCF*4 | addu TMP0, DISPATCH, TMP1 | decode_RD8a RD, INS | lw AT, 0(TMP0) @@ -2202,7 +2713,7 @@ static void build_subroutines(BuildCtx *ctx) |//----------------------------------------------------------------------- | |// Modifies AT, TMP0, FRET1, FRET2, f4. Keeps all others incl. FARG1. - |.macro vm_round, func + |.macro vm_round_hf, func | lui TMP0, 0x4330 // Hiword of 2^52 (double). 
| mtc1 r0, f4 | mtc1 TMP0, f5 @@ -2244,6 +2755,25 @@ static void build_subroutines(BuildCtx *ctx) |. mov.d FRET1, FARG1 |.endmacro | + |.macro vm_round_sf, func + | addiu sp, sp, -8 + | load_got func + | sw ra, 0(sp) + | call_extern + |. nop + | lw ra, 0(sp) + | jr ra + |. addiu sp, sp, 8 + |.endmacro + | + |.macro vm_round, func + |.if FPU + | vm_round_hf, func + |.else + | vm_round_sf, func + |.endif + |.endmacro + | |->vm_floor: | vm_round floor |->vm_ceil: @@ -2272,10 +2802,10 @@ static void build_subroutines(BuildCtx *ctx) | sw r1, CTSTATE->cb.slot | sw CARG1, CTSTATE->cb.gpr[0] | sw CARG2, CTSTATE->cb.gpr[1] - | sdc1 FARG1, CTSTATE->cb.fpr[0] + | .FPU sdc1 FARG1, CTSTATE->cb.fpr[0] | sw CARG3, CTSTATE->cb.gpr[2] | sw CARG4, CTSTATE->cb.gpr[3] - | sdc1 FARG2, CTSTATE->cb.fpr[1] + | .FPU sdc1 FARG2, CTSTATE->cb.fpr[1] | addiu TMP0, sp, CFRAME_SPACE+16 | sw TMP0, CTSTATE->cb.stack | sw r0, SAVE_PC // Any value outside of bytecode is ok. @@ -2286,14 +2816,14 @@ static void build_subroutines(BuildCtx *ctx) | lw BASE, L:CRET1->base | lw RC, L:CRET1->top | move L, CRET1 - | lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lui TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | lw LFUNC:RB, FRAME_FUNC(BASE) - | mtc1 TMP3, TOBIT + | .FPU mtc1 TMP3, TOBIT | li_vmstate INTERP | li TISNIL, LJ_TNIL | subu RC, RC, BASE | st_vmstate - | cvt.d.s TOBIT, TOBIT + | .FPU cvt.d.s TOBIT, TOBIT | ins_callt |.endif | @@ -2307,11 +2837,11 @@ static void build_subroutines(BuildCtx *ctx) | move CARG2, RA | call_intern lj_ccallback_leave // (CTState *cts, TValue *o) |. move CARG1, CTSTATE + | .FPU ldc1 FRET1, CTSTATE->cb.fpr[0] | lw CRET1, CTSTATE->cb.gpr[0] - | ldc1 FRET1, CTSTATE->cb.fpr[0] - | lw CRET2, CTSTATE->cb.gpr[1] + | .FPU ldc1 FRET2, CTSTATE->cb.fpr[1] | b ->vm_leave_unw - |. ldc1 FRET2, CTSTATE->cb.fpr[1] + |. lw CRET2, CTSTATE->cb.gpr[1] |.endif | |->vm_ffi_call: // Call C function via FFI. 
@@ -2343,8 +2873,8 @@ static void build_subroutines(BuildCtx *ctx) | lw CARG2, CCSTATE->gpr[1] | lw CARG3, CCSTATE->gpr[2] | lw CARG4, CCSTATE->gpr[3] - | ldc1 FARG1, CCSTATE->fpr[0] - | ldc1 FARG2, CCSTATE->fpr[1] + | .FPU ldc1 FARG1, CCSTATE->fpr[0] + | .FPU ldc1 FARG2, CCSTATE->fpr[1] | jalr CFUNCADDR |. lw CARG1, CCSTATE->gpr[0] // Do this last, since CCSTATE is CARG1. | lw CCSTATE:TMP1, -12(r16) @@ -2352,8 +2882,10 @@ static void build_subroutines(BuildCtx *ctx) | lw ra, -4(r16) | sw CRET1, CCSTATE:TMP1->gpr[0] | sw CRET2, CCSTATE:TMP1->gpr[1] - | sdc1 FRET1, CCSTATE:TMP1->fpr[0] - | sdc1 FRET2, CCSTATE:TMP1->fpr[1] + | .FPU sdc1 FRET1, CCSTATE:TMP1->fpr[0] + | .FPU sdc1 FRET2, CCSTATE:TMP1->fpr[1] + | sw CARG1, CCSTATE:TMP1->gpr[2] // MIPS32 soft-float. + | sw CARG2, CCSTATE:TMP1->gpr[3] // Complex doubles are returned in v0, v1, a0, a1. | move sp, r16 | jr ra |. move r16, TMP2 @@ -2381,8 +2913,6 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu CARG3, BASE, RD | lw TMP0, HI(CARG2) | lw TMP1, HI(CARG3) - | ldc1 f0, 0(CARG2) - | ldc1 f2, 0(CARG3) | sltiu TMP0, TMP0, LJ_TISNUM | sltiu TMP1, TMP1, LJ_TISNUM | lhu TMP2, OFS_RD(PC) @@ -2390,8 +2920,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addiu PC, PC, 4 | beqz TMP0, ->vmeta_comp |. lui TMP1, (-(BCBIAS_J*4 >> 16) & 65535) + | load_double f0, CARG1, CARG2, 0(CARG2) + |.if FPU + | ldc1 f2, 0(CARG3) + |.else + | lw CARG4, 4(CARG3) + | lw CARG3, 0(CARG3) + |.endif | decode_RD4b TMP2 | addu TMP2, TMP2, TMP1 + |.if FPU if (op == BC_ISLT || op == BC_ISGE) { | c.olt.d f0, f2 } else { @@ -2402,8 +2940,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) } else { | movt TMP2, r0 } - | addu PC, PC, TMP2 + |.else + | load_got __ledf2 + | sw RD, TEMP_SAVE_1 + | sw TMP1, TEMP_SAVE_2 + | call_extern //CRET1 = f0<=f2 + |. 
sw TMP2, TEMP_SAVE_3 + | lw TMP2, TEMP_SAVE_3 + | lw TMP1, TEMP_SAVE_2 + if (op == BC_ISLT) { + | bltz CRET1, >1 + } else if (op == BC_ISLE) { + | blez CRET1, >1 + } else if (op == BC_ISGT) { + | bgtz CRET1, >1 + } else { + | bgez CRET1, >1 + } + |. lw RD, TEMP_SAVE_1 + | move TMP2, r0 |1: + |.endif + | addu PC, PC, TMP2 | ins_next break; @@ -2413,24 +2971,43 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu RA, BASE, RA | addiu PC, PC, 4 | lw TMP0, HI(RA) - | ldc1 f0, 0(RA) | addu RD, BASE, RD | lhu TMP2, -4+OFS_RD(PC) - | lw TMP1, HI(RD) - | ldc1 f2, 0(RD) | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) + | lw TMP1, HI(RD) + | decode_RD4b TMP2 | sltiu AT, TMP0, LJ_TISNUM | sltiu CARG1, TMP1, LJ_TISNUM - | decode_RD4b TMP2 + | load_double f2, CARG3, CARG4, 0(RD) + | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) | and AT, AT, CARG1 + | load_double f0, CARG1, CARG2, 0(RA) | beqz AT, >5 |. addu TMP2, TMP2, TMP3 + |.if FPU | c.eq.d f0, f2 if (vk) { | movf TMP2, r0 } else { | movt TMP2, r0 } + |.else + | load_got __ledf2 + | sw RD, TEMP_SAVE_1 + | call_extern + |. sw TMP2, TEMP_SAVE_2 + | lw RD, TEMP_SAVE_1 + | lw TMP2, TEMP_SAVE_2 + if (vk) { + | beqz CRET1, >4 + |. nop + } else { + | bnez CRET1, >4 + |. nop + } + | move TMP2, r0 + |4: + |.endif |1: | addu PC, PC, TMP2 | ins_next @@ -2507,10 +3084,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu RA, BASE, RA | addiu PC, PC, 4 | lw TMP0, HI(RA) - | ldc1 f0, 0(RA) + | load_double f0, CARG1, CARG2, 0(RA) | addu RD, KBASE, RD | lhu TMP2, -4+OFS_RD(PC) - | ldc1 f2, 0(RD) + | load_double f2, CARG3, CARG4, 0(RD) | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) | sltiu AT, TMP0, LJ_TISNUM | decode_RD4b TMP2 @@ -2520,6 +3097,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | beqz AT, >1 |.endif |. 
addu TMP2, TMP2, TMP3 + |.if FPU | c.eq.d f0, f2 if (vk) { | movf TMP2, r0 @@ -2530,6 +3108,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |1: | addu PC, PC, TMP2 } + |.else + | load_got __ledf2 + | sw RD, TEMP_SAVE_1 + | call_extern + |. sw TMP2, TEMP_SAVE_2 + | lw RD, TEMP_SAVE_1 + | lw TMP2, TEMP_SAVE_2 + if (vk) { + | beqz CRET1, >4 + |. nop + | move TMP2, r0 + |4: + | addu PC, PC, TMP2 + |1: + } else { + | bnez CRET1, >1 + |. nop + | move TMP2, r0 + |1: + | addu PC, PC, TMP2 + } + |.endif | ins_next |.if FFI |5: @@ -2588,7 +3188,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu PC, PC, TMP2 } else { | sltiu TMP0, TMP0, LJ_TISTRUECOND - | ldc1 f0, 0(RD) + | load_double1 0(RD) if (op == BC_ISTC) { | beqz TMP0, >1 } else { @@ -2598,7 +3198,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | decode_RD4b TMP2 | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) | addu TMP2, TMP2, TMP3 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | addu PC, PC, TMP2 |1: } @@ -2631,9 +3231,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | // RA = dst*8, RD = src*8 | addu RD, BASE, RD | addu RA, BASE, RA - | ldc1 f0, 0(RD) + | load_double1 0(RD) | ins_next1 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 break; case BC_NOT: @@ -2653,12 +3253,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu CARG3, BASE, RD | addu RA, BASE, RA | lw TMP0, HI(CARG3) - | ldc1 f0, 0(CARG3) | sltiu AT, TMP0, LJ_TISNUM + | load_double f0, CARG1, CARG2, 0(CARG3) + |.if FPU | beqz AT, ->vmeta_unm |. neg.d f0, f0 + |.else + | lui TMP1, 0x8000 + | xor CRET1, TMP1, CARG1 + | beqz AT, ->vmeta_unm + |. move CRET2, CARG2 + |.endif | ins_next1 - | sdc1 f0, 0(RA) + | store_double f0, CRET1, CRET2, 0(RA) | ins_next2 break; case BC_LEN: @@ -2672,10 +3279,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |. 
li AT, LJ_TTAB | lw CRET1, STR:CARG1->len |1: + |.if FPU | mtc1 CRET1, f0 | cvt.d.w f0, f0 + |.else + | cvti2d CRET1 + |.endif | ins_next1 - | sdc1 f0, 0(RA) + | store_double f0, CRET1, CRET2, 0(RA) | ins_next2 |2: | bne TMP0, AT, ->vmeta_len @@ -2717,72 +3328,142 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu CARG3, BASE, RB | addu CARG4, KBASE, RC | lw TMP1, HI(CARG3) - | ldc1 f20, 0(CARG3) - | ldc1 f22, 0(CARG4) - | sltiu AT, TMP1, LJ_TISNUM + | sltiu AT, TMP1, LJ_TISNUM + | load_double f20, CARG1, CARG2, 0(CARG3) + | load_double f22, CARG3, CARG4, 0(CARG4) + |.if FPU + | beqz AT, ->vmeta_arith + |.else + | beqz AT, ->vmeta_arith_vn + |.endif + |. addu RA, BASE, RA || break; ||case 1: | addu CARG4, BASE, RB | addu CARG3, KBASE, RC | lw TMP1, HI(CARG4) - | ldc1 f22, 0(CARG4) - | ldc1 f20, 0(CARG3) - | sltiu AT, TMP1, LJ_TISNUM + | sltiu AT, TMP1, LJ_TISNUM + | load_double f20, CARG1, CARG2, 0(CARG3) + | load_double f22, CARG3, CARG4, 0(CARG4) + |.if FPU + | beqz AT, ->vmeta_arith + |.else + | beqz AT, ->vmeta_arith_nv + |.endif + |. addu RA, BASE, RA || break; ||default: | addu CARG3, BASE, RB | addu CARG4, BASE, RC | lw TMP1, HI(CARG3) | lw TMP2, HI(CARG4) - | ldc1 f20, 0(CARG3) - | ldc1 f22, 0(CARG4) - | sltiu AT, TMP1, LJ_TISNUM - | sltiu TMP0, TMP2, LJ_TISNUM - | and AT, AT, TMP0 + | sltiu AT, TMP1, LJ_TISNUM + | sltiu TMP0, TMP2, LJ_TISNUM + | and AT, AT, TMP0 + | load_double f20, CARG1, CARG2, 0(CARG3) + | load_double f22, CARG3, CARG4, 0(CARG4) + |.if FPU + | beqz AT, ->vmeta_arith + |.else + | beqz AT, ->vmeta_arith_vv + |.endif + |. addu RA, BASE, RA || break; ||} - | beqz AT, ->vmeta_arith - |. addu RA, BASE, RA |.endmacro | + |.macro ins_arithfallback + ||switch (vk) { + ||case 0: + | b ->vmeta_arith_vn + |. nop + || break; + ||case 1: + | b ->vmeta_arith_nv + |. nop + || break; + ||default: + | b ->vmeta_arith_vv + |. 
nop + || break; + ||} + |.endmacro + | + |.if FPU |.macro fpmod, a, b, c |->BC_MODVN_Z: - | bal ->vm_floor // floor(b/c) + | bal ->vm_floor // floor(b/c) |. div.d FARG1, b, c | mul.d a, FRET1, c - | sub.d a, b, a // b - floor(b/c)*c + | sub.d a, b, a // b - floor(b/c)*c |.endmacro + |.else | - |.macro ins_arith, ins + |.macro sfpmod + |->BC_MODVN_Z: + | load_got __divdf3 + | sw CARG1, TEMP_SAVE_1 + | sw CARG2, TEMP_SAVE_2 + | sw CARG3, TEMP_SAVE_3 + | call_extern + |. sw CARG4, TEMP_SAVE_4 + | move CARG1, CRET1 + | bal ->vm_floor + |. move CARG2, CRET2 + | load_got __muldf3 + | move CARG1, CRET1 + | move CARG2, CRET2 + | lw CARG3, TEMP_SAVE_3 + | call_extern + |. lw CARG4, TEMP_SAVE_4 + | load_got __subdf3 + | lw CARG1, TEMP_SAVE_1 + | lw CARG2, TEMP_SAVE_2 + | move CARG3, CRET1 + | call_extern + |. move CARG4, CRET2 + |.endmacro + |.endif + | + |.macro ins_arith, intins, fpins, fpcall | ins_arithpre - |.if "ins" == "fpmod_" - | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. + |.if "fpins" == "fpmod_" + | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. |. nop |.else - | ins f0, f20, f22 + |.if FPU + | fpins f0, f20, f22 + |.else + |.if "fpcall" == "sfpmod" + | sfpmod + |.else + | load_got fpcall + | call_extern + |. 
nop + |.endif + |.endif | ins_next1 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 |.endif |.endmacro case BC_ADDVN: case BC_ADDNV: case BC_ADDVV: - | ins_arith add.d + | ins_arith addu, add.d, __adddf3 break; case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: - | ins_arith sub.d + | ins_arith subu, sub.d, __subdf3 break; case BC_MULVN: case BC_MULNV: case BC_MULVV: - | ins_arith mul.d + | ins_arith mult, mul.d, __muldf3 break; case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: - | ins_arith div.d + | ins_arith div, div.d, __divdf3 break; case BC_MODVN: - | ins_arith fpmod - break; + | ins_arith modi, fpmod, sfpmod case BC_MODNV: case BC_MODVV: - | ins_arith fpmod_ + | ins_arith modi, fpmod_, sfpmod break; case BC_POW: | decode_RB8a RB, INS @@ -2792,18 +3473,23 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu CARG4, BASE, RC | lw TMP1, HI(CARG3) | lw TMP2, HI(CARG4) - | ldc1 FARG1, 0(CARG3) - | ldc1 FARG2, 0(CARG4) | sltiu AT, TMP1, LJ_TISNUM | sltiu TMP0, TMP2, LJ_TISNUM | and AT, AT, TMP0 | load_got pow | beqz AT, ->vmeta_arith |. addu RA, BASE, RA + | load_farg1 0(CARG3) + | load_farg2 0(CARG4) | call_extern |. nop | ins_next1 + |.if HFABI | sdc1 FRET1, 0(RA) + |.else + | sw CRET1, 0(RA) + | sw CRET2, 4(RA) + |.endif | ins_next2 break; @@ -2826,10 +3512,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | bnez CRET1, ->vmeta_binop |. lw BASE, L->base | addu RB, BASE, MULTRES - | ldc1 f0, 0(RB) + | load_double1 0(RB) | addu RA, BASE, RA | ins_next1 - | sdc1 f0, 0(RA) // Copy result from RB to RA. 
+ | store_double1 0(RA) | ins_next2 break; @@ -2864,20 +3550,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) case BC_KSHORT: | // RA = dst*8, RD = int16_literal*8 | sra RD, INS, 16 - | mtc1 RD, f0 | addu RA, BASE, RA + |.if FPU + | mtc1 RD, f0 | cvt.d.w f0, f0 + |.else + | cvti2d RD + |.endif | ins_next1 - | sdc1 f0, 0(RA) + | store_double f0, CRET1, CRET2, 0(RA) | ins_next2 break; case BC_KNUM: | // RA = dst*8, RD = num_const*8 | addu RD, KBASE, RD | addu RA, BASE, RA - | ldc1 f0, 0(RD) + | load_double1 0(RD) | ins_next1 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 break; case BC_KPRI: @@ -2913,9 +3603,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lw UPVAL:RB, LFUNC:RD->uvptr | ins_next1 | lw TMP1, UPVAL:RB->v - | ldc1 f0, 0(TMP1) + | load_double1 0(TMP1) | addu RA, BASE, RA - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 break; case BC_USETV: @@ -2924,14 +3614,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | srl RA, RA, 1 | addu RD, BASE, RD | addu RA, RA, LFUNC:RB - | ldc1 f0, 0(RD) + | load_double1 0(RD) | lw UPVAL:RB, LFUNC:RA->uvptr | lbu TMP3, UPVAL:RB->marked | lw CARG2, UPVAL:RB->v | andi TMP3, TMP3, LJ_GC_BLACK // isblack(uv) | lbu TMP0, UPVAL:RB->closed | lw TMP2, HI(RD) - | sdc1 f0, 0(CARG2) + | store_double1 0(CARG2) | li AT, LJ_GC_BLACK|1 | or TMP3, TMP3, TMP0 | beq TMP3, AT, >2 // Upvalue is closed and black? @@ -2991,11 +3681,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | srl RA, RA, 1 | addu RD, KBASE, RD | addu RA, RA, LFUNC:RB - | ldc1 f0, 0(RD) + | load_double1 0(RD) | lw UPVAL:RB, LFUNC:RA->uvptr | ins_next1 | lw TMP1, UPVAL:RB->v - | sdc1 f0, 0(TMP1) + | store_double1 0(TMP1) | ins_next2 break; case BC_USETP: @@ -3126,13 +3816,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lw TMP2, HI(CARG3) | lw TAB:RB, LO(CARG2) | li AT, LJ_TTAB - | ldc1 f0, 0(CARG3) | bne TMP1, AT, ->vmeta_tgetv |. 
addu RA, BASE, RA | sltiu AT, TMP2, LJ_TISNUM | beqz AT, >5 |. li AT, LJ_TSTR - | + |.if FPU + | ldc1 f0, 0(CARG3) | // Convert number key to integer, check for integerness and range. | cvt.w.d f2, f0 | lw TMP0, TAB:RB->asize @@ -3148,9 +3838,51 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lw TMP0, HI(TMP2) | beq TMP0, TISNIL, >2 |. ldc1 f0, 0(TMP2) + |.else + | sw RB, TEMP_SAVE_1 + | sw CARG2, TEMP_SAVE_3 + | load_got __fixdfsi + | lw CARG1, 0(CARG3) + | lw CARG2, 4(CARG3) + | call_extern // cvt.w.d f2, f0 + |. sw RC, TEMP_SAVE_2 + | sw CRET1, TEMP_SAVE_4 + | cvti2d CRET1 // cvt.d.w f4, f2 + | load_got __ledf2 + | lw RC, TEMP_SAVE_2 + | addu CARG3, BASE, RC + | lw CARG1, 0(CARG3) + | lw CARG2, 4(CARG3) + | move CARG3, CRET1 + | move CARG4, CRET2 + | call_extern // c.eq.d f0, f4 + |. nop + | lw CARG3, TEMP_SAVE_3 + | lw RC, TEMP_SAVE_2 + | lw RB, TEMP_SAVE_1 + | lw TMP0, TAB:RB->asize + | lw TMP1, TAB:RB->array + | lw TMP2, TEMP_SAVE_4 + | lw CARG2, TEMP_SAVE_3 // Restore old CARG2 and CARG3. + | addu CARG3, BASE, RC + | bnez CRET1, >3 + |. sltu AT, TMP2, TMP0 + | b >4 + |. nop + |3: + | move AT, r0 + |4: + | sll TMP2, TMP2, 3 + | beqz AT, ->vmeta_tgetv // Integer key and in array part? + |. addu TMP2, TMP1, TMP2 + | lw TMP0, HI(TMP2) + | lw SFT2, 4(TMP2) + | beq TMP0, TISNIL, >2 + |. lw SFT1, 0(TMP2) + |.endif |1: | ins_next1 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 | |2: // Check for __index if table value is nil. @@ -3246,10 +3978,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |. addu RC, TMP2, RC | lw TMP1, HI(RC) | beq TMP1, TISNIL, >5 - |. ldc1 f0, 0(RC) + |. nop |1: + | load_double1 0(RC) | ins_next1 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 | |5: // Check for __index if table value is nil. 
@@ -3271,20 +4004,28 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu CARG2, BASE, RB | addu CARG3, BASE, RC | lw TAB:CARG1, LO(CARG2) + | lw TMP0, TAB:CARG1->asize + | lw TMP1, TAB:CARG1->array + |.if FPU | ldc1 f0, 0(CARG3) | trunc.w.d f2, f0 - | lw TMP0, TAB:CARG1->asize | mfc1 CARG2, f2 - | lw TMP1, TAB:CARG1->array + |.else + | load_got __fixdfsi + | lw CARG1, 0(CARG3) + | call_extern + |. lw CARG2, 4(CARG3) + | move CARG2, CRET1 + |.endif | sltu AT, CARG2, TMP0 | sll TMP2, CARG2, 3 | beqz AT, ->vmeta_tgetr // In array part? |. addu TMP2, TMP1, TMP2 - | ldc1 f0, 0(TMP2) + | load_double1 0(TMP2) |->BC_TGETR_Z: | addu RA, BASE, RA | ins_next1 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | ins_next2 break; @@ -3299,13 +4040,13 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lw TMP2, HI(CARG3) | lw TAB:RB, LO(CARG2) | li AT, LJ_TTAB - | ldc1 f0, 0(CARG3) | bne TMP1, AT, ->vmeta_tsetv |. addu RA, BASE, RA | sltiu AT, TMP2, LJ_TISNUM | beqz AT, >5 |. li AT, LJ_TSTR - | + |.if FPU + | ldc1 f0, 0(CARG3) | // Convert number key to integer, check for integerness and range. | cvt.w.d f2, f0 | lw TMP0, TAB:RB->asize @@ -3326,6 +4067,52 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | andi AT, TMP3, LJ_GC_BLACK // isblack(table) | bnez AT, >7 |. sdc1 f0, 0(TMP1) + |.else + | sw RB, TEMP_SAVE_1 + | sw RC, TEMP_SAVE_2 + | sw CARG2, TEMP_SAVE_3 + | load_got __fixdfsi + | lw CARG1, 0(CARG3) + | call_extern // cvt.w.d f2, f0 + |. lw CARG2, 4(CARG3) + | sw CRET1, TEMP_SAVE_4 + | cvti2d CRET1 // cvt.d.w f4, f2 + | load_got __ledf2 + | lw RC, TEMP_SAVE_2 + | addu CARG3, BASE, RC + | lw CARG1, 0(CARG3) + | lw CARG2, 4(CARG3) + | move CARG3, CRET1 + | call_extern // c.eq.d f0, f4 + |. move CARG4, CRET2 + | lw RC, TEMP_SAVE_2 + | lw RB, TEMP_SAVE_1 + | lw TMP0, TAB:RB->asize + | lw TMP1, TAB:RB->array + | lw TMP2, TEMP_SAVE_4 + | lw CARG2, TEMP_SAVE_3 // Restore old CARG2 and CARG3. + | addu CARG3, BASE, RC + | bnez CRET1, >4 // NaN? + |. 
sltu AT, TMP2, TMP0 + | b >6 + |. nop + |4: + | move AT, r0 + |6: + | sll TMP2, TMP2, 3 + | beqz AT, ->vmeta_tsetv // Integer key and in array part? + |. addu TMP1, TMP1, TMP2 + | lbu TMP3, TAB:RB->marked + | lw TMP0, HI(TMP1) + | lw SFT1, 0(RA) + | beq TMP0, TISNIL, >3 + |. lw SFT2, 4(RA) + |1: + | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + | sw SFT1, 0(TMP1) + | bnez AT, >7 + |. sw SFT2, 4(TMP1) + |.endif |2: | ins_next | @@ -3374,7 +4161,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | sll TMP1, TMP1, 3 | subu TMP1, TMP0, TMP1 | addu NODE:TMP2, NODE:TMP2, TMP1 // node = tab->node + (idx*32-idx*8) - | ldc1 f20, 0(RA) + | load_double f20, SFT1, SFT2, 0(RA) |1: | lw CARG1, offsetof(Node, key)+HI(NODE:TMP2) | lw TMP0, offsetof(Node, key)+LO(NODE:TMP2) @@ -3388,8 +4175,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |. lw TAB:TMP0, TAB:RB->metatable |2: | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | bnez AT, >7 |. sdc1 f20, NODE:TMP2->val + |.else + | sw SFT1, NODE:TMP2->val.u32.hi + | bnez AT, >7 + |. sw SFT2, NODE:TMP2->val.u32.lo + |.endif |3: | ins_next | @@ -3417,6 +4210,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | beqz TMP0, ->vmeta_tsets // 'no __newindex' flag NOT set: check. |. li AT, LJ_TSTR |6: + |.if not FPU + | sw SFT1, TEMP_SAVE_1 + | sw SFT2, TEMP_SAVE_2 + |.endif | load_got lj_tab_newkey | sw STR:RC, LO(CARG3) | sw AT, HI(CARG3) @@ -3427,8 +4224,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |. move CARG1, L | // Returns TValue *. | lw BASE, L->base + |.if FPU | b <3 // No 2nd write barrier needed. |. sdc1 f20, 0(CRET1) + |.else + | lw SFT2, TEMP_SAVE_1 + | lw SFT3, TEMP_SAVE_2 + | sw SFT2, 0(CRET1) + | b <3 + |. sw SFT3, 4(CRET1) + |.endif | |7: // Possible table write barrier for the value. Skip valiswhite check. 
| barrierback TAB:RB, TMP3, TMP0, <3 @@ -3453,11 +4258,17 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lw TMP1, HI(RC) | lbu TMP3, TAB:RB->marked | beq TMP1, TISNIL, >5 - |. ldc1 f0, 0(RA) |1: - | andi AT, TMP3, LJ_GC_BLACK // isblack(table) + |. andi AT, TMP3, LJ_GC_BLACK // isblack(table) + | load_double1 0(RA) + |.if FPU | bnez AT, >7 |. sdc1 f0, 0(RC) + |.else + | sw SFT1, 0(RC) + | bnez AT, >7 + |. sw SFT2, 4(RC) + |.endif |2: | ins_next | @@ -3482,12 +4293,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | decode_RDtoRC8 RC, RD | addu CARG1, BASE, RB | addu CARG3, BASE, RC - | lw TAB:CARG2, LO(CARG1) + |.if FPU | ldc1 f0, 0(CARG3) | trunc.w.d f2, f0 + | mfc1 CARG3, f2 + |.else + | load_got __fixdfsi + | sw CARG1, TEMP_SAVE_1 + | lw CARG1, 0(CARG3) + | call_extern + |. lw CARG2, 4(CARG3) + | lw CARG1, TEMP_SAVE_1 + | move CARG3, CRET1 + |.endif + | lw TAB:CARG2, LO(CARG1) | lbu TMP3, TAB:CARG2->marked | lw TMP0, TAB:CARG2->asize - | mfc1 CARG3, f2 | lw TMP1, TAB:CARG2->array | andi AT, TMP3, LJ_GC_BLACK // isblack(table) | bnez AT, >7 @@ -3495,12 +4316,24 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |2: | sltu AT, CARG3, TMP0 | sll TMP2, CARG3, 3 + |.if FPU | beqz AT, ->vmeta_tsetr // In array part? |. ldc1 f20, 0(RA) | addu CRET1, TMP1, TMP2 |->BC_TSETR_Z: + |.else + | lw TMP0, 0(RA) + | lw TMP3, 4(RA) + | sw TMP0, TEMP_SAVE_1 + | beqz AT, ->vmeta_tsetr // In array part? + |. sw TMP3, TEMP_SAVE_2 + | addu CRET1, TMP1, TMP2 + |->BC_TSETR_Z: + | lw TMP0, TEMP_SAVE_1 + | lw TMP3, TEMP_SAVE_2 + |.endif | ins_next1 - | sdc1 f20, 0(CRET1) + | store_double f20, TMP0, TMP3, 0(CRET1) | ins_next2 | |7: // Possible table write barrier for the value. Skip valiswhite check. @@ -3529,10 +4362,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | addu TMP1, TMP1, CARG1 | andi TMP0, TMP3, LJ_GC_BLACK // isblack(table) |3: // Copy result slots to table. 
- | ldc1 f0, 0(RA) + | load_double1 0(RA) | addiu RA, RA, 8 | sltu AT, RA, TMP2 - | sdc1 f0, 0(TMP1) + | store_double1 0(TMP1) | bnez AT, <3 |. addiu TMP1, TMP1, 8 | bnez TMP0, >7 @@ -3607,10 +4440,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | beqz NARGS8:RC, >3 |. move TMP3, NARGS8:RC |2: - | ldc1 f0, 0(RA) + | load_double1 0(RA) | addiu RA, RA, 8 | addiu TMP3, TMP3, -8 - | sdc1 f0, 0(TMP2) + | store_double1 0(TMP2) | bnez TMP3, <2 |. addiu TMP2, TMP2, 8 |3: @@ -3647,12 +4480,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | li AT, LJ_TFUNC | lw TMP1, -24+HI(BASE) | lw LFUNC:RB, -24+LO(BASE) - | ldc1 f2, -8(BASE) - | ldc1 f0, -16(BASE) + | load_double1 -8(BASE) + | load_double2 -16(BASE) | sw TMP1, HI(BASE) // Copy callable. | sw LFUNC:RB, LO(BASE) - | sdc1 f2, 16(BASE) // Copy control var. - | sdc1 f0, 8(BASE) // Copy state. + | store_double1 16(BASE) // Copy control var. + | store_double2 8(BASE) // Copy state. | addiu BASE, BASE, 8 | bne TMP1, AT, ->vmeta_call |. li NARGS8:RC, 16 // Iterators get 2 arguments. @@ -3676,19 +4509,29 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |. sll TMP3, RC, 3 | addu TMP3, TMP1, TMP3 | lw TMP2, HI(TMP3) - | ldc1 f0, 0(TMP3) + | load_double1 0(TMP3) + |.if FPU | mtc1 RC, f2 + |.else + | move CARG1, RC + |.endif | lhu RD, -4+OFS_RD(PC) | beq TMP2, TISNIL, <1 // Skip holes in array part. |. addiu RC, RC, 1 + | store_double1 8(RA) + |.if FPU | cvt.d.w f2, f2 + |.else + | load_got __floatsidf + | call_extern + |. nop + |.endif | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) - | sdc1 f0, 8(RA) + | store_double f2, CRET1, CRET2, 0(RA) | decode_RD4b RD | addu RD, RD, TMP3 | sw RC, -8+LO(RA) // Update control var. 
| addu PC, PC, RD - | sdc1 f2, 0(RA) |3: | ins_next | @@ -3704,17 +4547,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | subu TMP3, TMP3, RB | addu NODE:TMP3, TMP3, TMP2 | lw RB, HI(NODE:TMP3) - | ldc1 f0, 0(NODE:TMP3) + | load_double1 0(NODE:TMP3) | lhu RD, -4+OFS_RD(PC) | beq RB, TISNIL, <6 // Skip holes in hash part. |. addiu RC, RC, 1 + |.if FPU | ldc1 f2, NODE:TMP3->key + |.else + | lw SFT3, NODE:TMP3->key.u32.hi + | lw SFT4, NODE:TMP3->key.u32.lo + |.endif | lui TMP3, (-(BCBIAS_J*4 >> 16) & 65535) - | sdc1 f0, 8(RA) + | store_double1 8(RA) | addu RC, RC, TMP0 | decode_RD4b RD | addu RD, RD, TMP3 - | sdc1 f2, 0(RA) + | store_double2 0(RA) | addu PC, PC, RD | b <3 |. sw RC, -8+LO(RA) // Update control var. @@ -3794,9 +4642,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | bnez AT, >7 |. addiu MULTRES, TMP1, 8 |6: - | ldc1 f0, 0(RC) + | load_double1 0(RC) | addiu RC, RC, 8 - | sdc1 f0, 0(RA) + | store_double1 0(RA) | sltu AT, RC, TMP3 | bnez AT, <6 // More vararg slots? |. addiu RA, RA, 8 @@ -3852,10 +4700,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | beqz RC, >3 |. subu BASE, TMP2, TMP0 |2: - | ldc1 f0, 0(RA) + | load_double1 0(RA) | addiu RA, RA, 8 | addiu RC, RC, -8 - | sdc1 f0, 0(TMP2) + | store_double1 0(TMP2) | bnez RC, <2 |. 
addiu TMP2, TMP2, 8 |3: @@ -3896,14 +4744,14 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | lw INS, -4(PC) | addiu TMP2, BASE, -8 if (op == BC_RET1) { - | ldc1 f0, 0(RA) + | load_double1 0(RA) } | decode_RB8a RB, INS | decode_RA8a RA, INS | decode_RB8b RB | decode_RA8b RA if (op == BC_RET1) { - | sdc1 f0, 0(TMP2) + | store_double1 0(TMP2) } | subu BASE, TMP2, RA |5: @@ -3928,6 +4776,45 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) /* -- Loops and branches ------------------------------------------------ */ + |.macro cmp_res, gt + |.if gt == 1 + |.if FPU + | movf TMP1, r0, 0 // f0>f2: TMP1=0 + | movf TMP2, r0, 1 // f2>f0: TMP2=0 + |.else + | li SFT2, 1 + | bne CRET1, SFT2, >1 + |. nop + | b >2 + |. move TMP1, r0 + |1: + | li SFT2, -1 + | bne CRET1, SFT2, >2 + |. nop + | move TMP2, r0 + |2: + |.endif + |.else + |.if FPU + | movt TMP1, r0, 0 // f0<=f2: TMP1=0 + | movt TMP2, r0, 1 // f2<=f0: TMP2=0 + |.else + | bltz CRET1, >3 // f02 // f0==f2: TMP1=TMP2=0 + |. li SFT2, 1 + | bne SFT2, CRET1, >4 // f0>f2: TMP2=0 + |. nop + | b >4 + |2: + |. move TMP2, r0 + |3: + | move TMP1, r0 + |4: + |.endif + |.endif + |.endmacro + case BC_FORL: |.if JIT | hotloop @@ -3946,12 +4833,26 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) vk = (op == BC_IFORL || op == BC_JFORL); | addu RA, BASE, RA if (vk) { + |.if FPU | ldc1 f0, FORL_IDX*8(RA) | ldc1 f4, FORL_STEP*8(RA) | ldc1 f2, FORL_STOP*8(RA) | lw TMP3, FORL_STEP*8+HI(RA) | add.d f0, f0, f4 | sdc1 f0, FORL_IDX*8(RA) + |.else + | load_got __adddf3 + | load_farg1 FORL_IDX*8(RA) + | load_farg2 FORL_STEP*8(RA) + | call_extern + |. 
sw RD, TEMP_SAVE_1 //save RD + | sw CRET1, FORL_IDX*8(RA) + | sw CRET2, FORL_IDX*8+4(RA) + | load_farg1 FORL_IDX*8(RA) + | load_farg2 FORL_STOP*8(RA) // f0 and f2 + | lw TMP3, FORL_STEP*8+HI(RA) + | lw RD, TEMP_SAVE_1 + |.endif } else { | lw TMP1, FORL_IDX*8+HI(RA) | lw TMP3, FORL_STEP*8+HI(RA) @@ -3961,25 +4862,41 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | sltiu TMP2, TMP2, LJ_TISNUM | and TMP1, TMP1, TMP0 | and TMP1, TMP1, TMP2 + |.if FPU | ldc1 f0, FORL_IDX*8(RA) | beqz TMP1, ->vmeta_for |. ldc1 f2, FORL_STOP*8(RA) + |.else + | beqz TMP1, ->vmeta_for + | load_farg1 FORL_IDX*8(RA) + | load_farg2 FORL_STOP*8(RA) + |.endif } if (op != BC_JFORL) { | srl RD, RD, 1 | lui TMP0, (-(BCBIAS_J*4 >> 16) & 65535) } + | store_double f0, CARG1, CARG2, FORL_EXT*8(RA) + |.if FPU | c.le.d 0, f0, f2 | c.le.d 1, f2, f0 - | sdc1 f0, FORL_EXT*8(RA) + |.else + | sw RD, TEMP_SAVE_1 + | load_got __ledf2 // f0<=f2 + | call_extern + |. sw TMP0, TEMP_SAVE_2 + | lw TMP0, TEMP_SAVE_2 + | lw RD, TEMP_SAVE_1 + | lw TMP3, FORL_STEP*8+HI(RA) // Restored step. + |.endif + | if (op == BC_JFORI) { | li TMP1, 1 | li TMP2, 1 | addu TMP0, RD, TMP0 | slt TMP3, TMP3, r0 - | movf TMP1, r0, 0 + | cmp_res 1 | addu PC, PC, TMP0 - | movf TMP2, r0, 1 | lhu RD, -4+OFS_RD(PC) | movn TMP1, TMP2, TMP3 | bnez TMP1, =>BC_JLOOP @@ -3988,8 +4905,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | li TMP1, 1 | li TMP2, 1 | slt TMP3, TMP3, r0 - | movf TMP1, r0, 0 - | movf TMP2, r0, 1 + | cmp_res 1 | movn TMP1, TMP2, TMP3 | bnez TMP1, =>BC_JLOOP |. 
nop @@ -3998,11 +4914,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | slt TMP3, TMP3, r0 | move TMP2, TMP1 if (op == BC_FORI) { - | movt TMP1, r0, 0 - | movt TMP2, r0, 1 + | cmp_res 0 } else { - | movf TMP1, r0, 0 - | movf TMP2, r0, 1 + | cmp_res 1 } | movn TMP1, TMP2, TMP3 | addu PC, PC, TMP1 @@ -4256,8 +5170,10 @@ static void emit_asm_debug(BuildCtx *ctx) fcofs, CFRAME_SIZE); for (i = 23; i >= 16; i--) fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+i, 26-i); +#if !LJ_SOFTFP for (i = 30; i >= 20; i -= 2) fprintf(ctx->fp, "\t.byte %d\n\t.uleb128 %d\n", 0x80+32+i, 42-i); +#endif fprintf(ctx->fp, "\t.align 2\n" ".LEFDE0:\n\n"); @@ -4275,6 +5191,7 @@ static void emit_asm_debug(BuildCtx *ctx) "\t.align 2\n" ".LEFDE1:\n\n", (int)ctx->codesz - fcofs); #endif +#if !LJ_NO_UNWIND fprintf(ctx->fp, "\t.section .eh_frame,\"aw\",@progbits\n"); fprintf(ctx->fp, "\t.globl lj_err_unwind_dwarf\n" @@ -4342,6 +5259,7 @@ static void emit_asm_debug(BuildCtx *ctx) "\t.byte 0xd\n\t.uleb128 0x10\n" "\t.align 2\n" ".LEFDE3:\n\n", (int)ctx->codesz - fcofs); +#endif #endif break; default: diff --git a/lib/luajit/src/vm_x64.dasc b/lib/luajit/src/vm_x64.dasc index e7e990ae27..bba89aaf1b 100644 --- a/lib/luajit/src/vm_x64.dasc +++ b/lib/luajit/src/vm_x64.dasc @@ -531,7 +531,7 @@ static void build_subroutines(BuildCtx *ctx) | jmp >2 | |->vm_growstack_v: // Grow stack for vararg Lua function. - | sub RD, 8 + | sub RD, 16 // LJ_FR2 | jmp >1 | |->vm_growstack_f: // Grow stack for fixarg Lua function. diff --git a/src/core/lib.c b/src/core/lib.c index e1703f71ee..84951da3bf 100644 --- a/src/core/lib.c +++ b/src/core/lib.c @@ -77,3 +77,15 @@ void nop() { } +/* Bitswap uint64_t. 
*/ +uint64_t bswap64 (uint64_t b) +{ + return ((((uint64_t) b & (uint64_t) 0x00000000000000ff) << 56) | + (((uint64_t) b & (uint64_t) 0x000000000000ff00) << 40) | + (((uint64_t) b & (uint64_t) 0x0000000000ff0000) << 24) | + (((uint64_t) b & (uint64_t) 0x00000000ff000000) << 8) | + (((uint64_t) b & (uint64_t) 0x000000ff00000000) >> 8) | + (((uint64_t) b & (uint64_t) 0x0000ff0000000000) >> 24) | + (((uint64_t) b & (uint64_t) 0x00ff000000000000) >> 40) | + (((uint64_t) b & (uint64_t) 0xff00000000000000) >> 56)); +} diff --git a/src/core/lib.h b/src/core/lib.h index 013bdcfecc..616d12a484 100644 --- a/src/core/lib.h +++ b/src/core/lib.h @@ -6,3 +6,4 @@ void full_memory_barrier(); void prefetch_for_read(const void *address); void prefetch_for_write(const void *address); unsigned int stat_mtime(const char *path); +uint64_t bswap64 (uint64_t b); diff --git a/src/core/lib.lua b/src/core/lib.lua index eccd097dbc..8ecf6f9df8 100644 --- a/src/core/lib.lua +++ b/src/core/lib.lua @@ -351,15 +351,19 @@ end -- avoid C function call overhead while using C.xxxx counterparts if ffi.abi("be") then -- nothing to do + function htonll(b) return b end function htonl(b) return b end function htons(b) return b end else + function htonll(b) return C.bswap64(b) end function htonl(b) return bswap(b) end function htons(b) return rshift(bswap(b), 16) end end +ntohll = htonll ntohl = htonl ntohs = htons + -- Manipulation of bit fields in uint{8,16,32)_t stored in network -- byte order. Using bit fields in C structs is compiler-dependent -- and a little awkward for handling endianness and fields that cross diff --git a/src/dasm.lua b/src/dasm.lua index 448eab9cf9..acf8587e5f 100644 --- a/src/dasm.lua +++ b/src/dasm.lua @@ -1,5 +1,5 @@ ---binding to the DynASM encoding engine. +--Binding to the DynASM encoding engine. --Written by Cosmin Apreutesei. Public Domain. 
local ffi = require'ffi' diff --git a/src/dasm_proto.h b/src/dasm_proto.h index 93ca06533c..d3ba39ab1e 100644 --- a/src/dasm_proto.h +++ b/src/dasm_proto.h @@ -13,6 +13,8 @@ #define DASM_IDENT "DynASM 1.4.0" #define DASM_VERSION 10400 /* 1.4.0 */ +#undef DASM_CHECKS + #ifndef Dst_DECL #define Dst_DECL dasm_State **Dst #endif @@ -76,7 +78,8 @@ DASM_FDEF int dasm_getpclabel(Dst_DECL, unsigned int pc); /* Optional sanity checker to call between isolated encoding steps. */ DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch); #else -#define dasm_checkstep(a, b) 0 +/*#define dasm_checkstep(a, b) 0*/ +DASM_FDEF int dasm_checkstep(Dst_DECL, int secmatch) {return 0;} #endif diff --git a/src/dasm_x64.lua b/src/dasm_x64.lua index 24efbae866..c22ddcfda8 100644 --- a/src/dasm_x64.lua +++ b/src/dasm_x64.lua @@ -9,10 +9,11 @@ ------------------------------------------------------------------------------ --unload dasm_x86 if it's already loaded. +if not package then package = {loaded = {}} end --for compat. with minilua local dasm_x86 = package.loaded.dasm_x86 package.loaded.dasm_x86 = nil -rawset(_G, 'x64', true) -- Using a global is an ugly, but effective solution. +x64 = true -- Using a global is an ugly, but effective solution. local dasm_x64 = require("dasm_x86") package.loaded.dasm_x86 = dasm_x86 --put it back diff --git a/src/dasm_x86.c b/src/dasm_x86.c index 85376ca7ca..59c3bb63c0 100644 --- a/src/dasm_x86.c +++ b/src/dasm_x86.c @@ -1,4 +1,10 @@ -#define DASM_CHECKS +/* + Encoding engine to use with dasm.lua. + + Compile with: + + gcc dasm_x86.c -DDASM_CHECKS -shared -s -o dasm_x86.so +*/ #include "dasm_extern.h" #include "dasm_proto.h" diff --git a/src/dasm_x86.h b/src/dasm_x86.h index 175febe0ca..be9c289f02 100644 --- a/src/dasm_x86.h +++ b/src/dasm_x86.h @@ -170,7 +170,7 @@ void dasm_put(Dst_DECL, int start, ...) 
dasm_State *D = Dst_REF; dasm_ActList p = D->actionlist + start; dasm_Section *sec = D->section; - int pos = sec->pos, ofs = sec->ofs, mrm = 4; + int pos = sec->pos, ofs = sec->ofs, mrm = -1; int *b; if (pos >= sec->epos) { @@ -193,7 +193,7 @@ void dasm_put(Dst_DECL, int start, ...) b[pos++] = n; switch (action) { case DASM_DISP: - if (n == 0) { if ((mrm&7) == 4) mrm = p[-2]; if ((mrm&7) != 5) break; } + if (n == 0) { if (mrm < 0) mrm = p[-2]; if ((mrm&7) != 5) break; } case DASM_IMM_DB: if (((n+128)&-256) == 0) goto ob; case DASM_REL_A: /* Assumes ptrdiff_t is int. !x64 */ case DASM_IMM_D: ofs += 4; break; @@ -203,10 +203,17 @@ void dasm_put(Dst_DECL, int start, ...) case DASM_IMM_W: CK((n&-65536) == 0, RANGE_I); ofs += 2; break; case DASM_SPACE: p++; ofs += n; break; case DASM_SETLABEL: b[pos-2] = -0x40000000; break; /* Neg. label ofs. */ - case DASM_VREG: CK((n&-8) == 0 && (n != 4 || (*p&1) == 0), RANGE_VREG); - if (*p++ == 1 && *p == DASM_DISP) mrm = n; continue; + case DASM_VREG: CK((n&-16) == 0 && (n != 4 || (*p>>5) != 2), RANGE_VREG); + if (*p < 0x40 && p[1] == DASM_DISP) mrm = n; + if (*p < 0x20 && (n&7) == 4) ofs++; + switch ((*p++ >> 3) & 3) { + case 3: n |= b[pos-3]; + case 2: n |= b[pos-2]; + case 1: if (n <= 7) { b[pos-1] |= 0x10; ofs--; } + } + continue; } - mrm = 4; + mrm = -1; } else { int *pl, n; switch (action) { @@ -393,7 +400,22 @@ int dasm_encode(Dst_DECL, void *buffer) case DASM_IMM_W: dasmw(n); break; case DASM_VREG: { int t = *p++; - if (t >= 5) n <<= 4; else if (t >= 2) n <<= 3; + unsigned char *ex = cp - (t&7); + if ((n & 8) && t < 0xa0) { + if (*ex & 0x80) ex[1] ^= 0x20 << (t>>6); else *ex ^= 1 << (t>>6); + n &= 7; + } else if (n & 0x10) { + if (*ex & 0x80) { + *ex = 0xc5; ex[1] = (ex[1] & 0x80) | ex[2]; ex += 2; + } + while (++ex < cp) ex[-1] = *ex; + if (mark) mark--; + cp--; + n &= 7; + } + if (t >= 0xc0) n <<= 4; + else if (t >= 0x40) n <<= 3; + else if (n == 4 && t < 0x20) { cp[-1] ^= n; *cp++ = 0x20; } cp[-1] ^= n; break; } diff 
--git a/src/dasm_x86.lua b/src/dasm_x86.lua index e7563d477f..0c11f020ec 100644 --- a/src/dasm_x86.lua +++ b/src/dasm_x86.lua @@ -44,7 +44,7 @@ local action_names = { -- int arg, 1 buffer pos: "DISP", "IMM_S", "IMM_B", "IMM_W", "IMM_D", "IMM_WB", "IMM_DB", -- action arg (1 byte), int arg, 1 buffer pos (reg/num): - "VREG", "SPACE", -- !x64: VREG support NYI. + "VREG", "SPACE", -- ptrdiff_t arg, 1 buffer pos (address): !x64 "SETLABEL", "REL_A", -- action arg (1 byte) or int arg, 2 buffer pos (link, offset): @@ -92,6 +92,21 @@ local function init_actionlist() secpos = 1 end +-- VREG kind encodings, pre-shifted by 5 bits. +local map_vreg = { + ["modrm.rm.m"] = 0x00, + ["modrm.rm.r"] = 0x20, + ["opcode"] = 0x20, + ["sib.base"] = 0x20, + ["sib.index"] = 0x40, + ["modrm.reg"] = 0x80, + ["vex.v"] = 0xa0, + ["imm.hi"] = 0xc0, +} + +-- Current number of VREG actions contributing to REX/VEX shrinkage. +local vreg_shrink_count = 0 + ------------------------------------------------------------------------------ -- Compute action numbers for action names. @@ -151,6 +166,21 @@ local function waction(action, a, num) if a or num then secpos = secpos + (num or 1) end end +-- Optionally add a VREG action. +local function wvreg(kind, vreg, psz, sk, defer) + if not vreg then return end + waction("VREG", vreg) + local b = assert(map_vreg[kind], "bad vreg kind `"..vreg.."'") + if b < (sk or 0) then + vreg_shrink_count = vreg_shrink_count + 1 + end + if not defer then + b = b + vreg_shrink_count * 8 + vreg_shrink_count = 0 + end + wputxb(b + (psz or 0)) +end + -- Add call to embedded DynASM C code. 
local function wcall(func, args) if luamode then @@ -390,6 +420,7 @@ mkrmap("w", "Rw", {"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"}) mkrmap("b", "Rb", {"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"}) map_reg_valid_index[map_archdef.esp] = false if x64 then map_reg_valid_index[map_archdef.rsp] = false end +if x64 then map_reg_needrex[map_archdef.Rb] = true end map_archdef["Ra"] = "@"..addrsize -- FP registers (internally tword sized, but use "f" as operand size). @@ -527,16 +558,24 @@ local function wputszarg(sz, n) end -- Put multi-byte opcode with operand-size dependent modifications. -local function wputop(sz, op, rex, vex) +local function wputop(sz, op, rex, vex, vregr, vregxb) + local psz, sk = 0, nil if vex then local tail if vex.m == 1 and band(rex, 11) == 0 then - wputb(0xc5) + if x64 and vregxb then + sk = map_vreg["modrm.reg"] + else + wputb(0xc5) tail = shl(bxor(band(rex, 4), 4), 5) - else + psz = 3 + end + end + if not tail then wputb(0xc4) wputb(shl(bxor(band(rex, 7), 7), 5) + vex.m) tail = shl(band(rex, 8), 4) + psz = 4 end local reg, vreg = 0, nil if vex.v then @@ -546,12 +585,18 @@ local function wputop(sz, op, rex, vex) end if sz == "y" or vex.l then tail = tail + 4 end wputb(tail + shl(bxor(reg, 15), 3) + vex.p) - if vreg then waction("VREG", vreg); wputxb(4) end + wvreg("vex.v", vreg) rex = 0 if op >= 256 then werror("bad vex opcode") end + else + if rex ~= 0 then + if not x64 then werror("bad operand size") end + elseif (vregr or vregxb) and x64 then + rex = 0x10 + sk = map_vreg["vex.v"] + end end local r - if rex ~= 0 and not x64 then werror("bad operand size") end if sz == "w" then wputb(102) end -- Needs >32 bit numbers, but only for crc32 eax, word [ebx] if op >= 4294967296 then r = op%4294967296 wputb((op-r)/4294967296) op = r end @@ -560,20 +605,20 @@ local function wputop(sz, op, rex, vex) if rex ~= 0 then local opc3 = band(op, 0xffff00) if opc3 == 0x0f3a00 or opc3 == 0x0f3800 then - wputb(64 + band(rex, 15)); rex = 0 + wputb(64 + 
band(rex, 15)); rex = 0; psz = 2 end end - wputb(shr(op, 16)); op = band(op, 0xffff) + wputb(shr(op, 16)); op = band(op, 0xffff); psz = psz + 1 end if op >= 256 then local b = shr(op, 8) - if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0 end - wputb(b) - op = band(op, 255) + if b == 15 and rex ~= 0 then wputb(64 + band(rex, 15)); rex = 0; psz = 2 end + wputb(b); op = band(op, 255); psz = psz + 1 end - if rex ~= 0 then wputb(64 + band(rex, 15)) end + if rex ~= 0 then wputb(64 + band(rex, 15)); psz = 2 end if sz == "b" then op = op - 1 end wputb(op) + return psz, sk end -- Put ModRM or SIB formatted byte. @@ -583,7 +628,7 @@ local function wputmodrm(m, s, rm, vs, vrm) end -- Put ModRM/SIB plus optional displacement. -local function wputmrmsib(t, imark, s, vsreg) +local function wputmrmsib(t, imark, s, vsreg, psz, sk) local vreg, vxreg local reg, xreg = t.reg, t.xreg if reg and reg < 0 then reg = 0; vreg = t.vreg end @@ -593,8 +638,8 @@ local function wputmrmsib(t, imark, s, vsreg) -- Register mode. if sub(t.mode, 1, 1) == "r" then wputmodrm(3, s, reg) - if vsreg then waction("VREG", vsreg); wputxb(2) end - if vreg then waction("VREG", vreg); wputxb(0) end + wvreg("modrm.reg", vsreg, psz+1, sk, vreg) + wvreg("modrm.rm.r", vreg, psz+1, sk) return end @@ -608,21 +653,22 @@ local function wputmrmsib(t, imark, s, vsreg) -- [xreg*xsc+disp] -> (0, s, esp) (xsc, xreg, ebp) wputmodrm(0, s, 4) if imark == "I" then waction("MARK") end - if vsreg then waction("VREG", vsreg); wputxb(2) end + wvreg("modrm.reg", vsreg, psz+1, sk, vxreg) wputmodrm(t.xsc, xreg, 5) - if vxreg then waction("VREG", vxreg); wputxb(3) end + wvreg("sib.index", vxreg, psz+2, sk) else -- Pure 32 bit displacement. 
if x64 and tdisp ~= "table" then wputmodrm(0, s, 4) -- [disp] -> (0, s, esp) (0, esp, ebp) + wvreg("modrm.reg", vsreg, psz+1, sk) if imark == "I" then waction("MARK") end wputmodrm(0, 4, 5) else riprel = x64 wputmodrm(0, s, 5) -- [disp|rip-label] -> (0, s, ebp) + wvreg("modrm.reg", vsreg, psz+1, sk) if imark == "I" then waction("MARK") end end - if vsreg then waction("VREG", vsreg); wputxb(2) end end if riprel then -- Emit rip-relative displacement. if match("UWSiI", imark) then @@ -650,16 +696,16 @@ local function wputmrmsib(t, imark, s, vsreg) if xreg or band(reg, 7) == 4 then wputmodrm(m or 2, s, 4) -- ModRM. if m == nil or imark == "I" then waction("MARK") end - if vsreg then waction("VREG", vsreg); wputxb(2) end + wvreg("modrm.reg", vsreg, psz+1, sk, vxreg or vreg) wputmodrm(t.xsc or 0, xreg or 4, reg) -- SIB. - if vxreg then waction("VREG", vxreg); wputxb(3) end - if vreg then waction("VREG", vreg); wputxb(1) end + wvreg("sib.index", vxreg, psz+2, sk, vreg) + wvreg("sib.base", vreg, psz+2, sk) else wputmodrm(m or 2, s, reg) -- ModRM. if (imark == "I" and (m == 1 or m == 2)) or (m == nil and (vsreg or vreg)) then waction("MARK") end - if vsreg then waction("VREG", vsreg); wputxb(2) end - if vreg then waction("VREG", vreg); wputxb(1) end + wvreg("modrm.reg", vsreg, psz+1, sk, vreg) + wvreg("modrm.rm.m", vreg, psz+1, sk) end -- Put displacement. 
@@ -1184,7 +1230,7 @@ local map_op = { shrd_3 = "mriqdw:0FACRmU|mrC/qq:0FADRm|mrC/dd:|mrC/ww:", rdtsc_0 = "0F31", -- P1+ - rdpmc_0 = "0F33", + rdpmc_0 = "0F33", -- P6+ cpuid_0 = "0FA2", -- P1+ -- floating point ops @@ -1327,46 +1373,14 @@ local map_op = { movups_2 = "rmo:0F10rM|mro:0F11Rm", orpd_2 = "rmo:660F56rM", orps_2 = "rmo:0F56rM", - packssdw_2 = "rmo:660F6BrM", - packsswb_2 = "rmo:660F63rM", - packuswb_2 = "rmo:660F67rM", - paddb_2 = "rmo:660FFCrM", - paddd_2 = "rmo:660FFErM", - paddq_2 = "rmo:660FD4rM", - paddsb_2 = "rmo:660FECrM", - paddsw_2 = "rmo:660FEDrM", - paddusb_2 = "rmo:660FDCrM", - paddusw_2 = "rmo:660FDDrM", - paddw_2 = "rmo:660FFDrM", - pand_2 = "rmo:660FDBrM", - pandn_2 = "rmo:660FDFrM", pause_0 = "F390", - pavgb_2 = "rmo:660FE0rM", - pavgw_2 = "rmo:660FE3rM", - pcmpeqb_2 = "rmo:660F74rM", - pcmpeqd_2 = "rmo:660F76rM", - pcmpeqw_2 = "rmo:660F75rM", - pcmpgtb_2 = "rmo:660F64rM", - pcmpgtd_2 = "rmo:660F66rM", - pcmpgtw_2 = "rmo:660F65rM", pextrw_3 = "rri/do:660FC5rMU|xri/wo:660F3A15nRmU", -- Mem op: SSE4.1 only. 
pinsrw_3 = "rri/od:660FC4rMU|rxi/ow:", - pmaddwd_2 = "rmo:660FF5rM", - pmaxsw_2 = "rmo:660FEErM", - pmaxub_2 = "rmo:660FDErM", - pminsw_2 = "rmo:660FEArM", - pminub_2 = "rmo:660FDArM", pmovmskb_2 = "rr/do:660FD7rM", - pmulhuw_2 = "rmo:660FE4rM", - pmulhw_2 = "rmo:660FE5rM", - pmullw_2 = "rmo:660FD5rM", - pmuludq_2 = "rmo:660FF4rM", - por_2 = "rmo:660FEBrM", prefetchnta_1 = "xb:n0F180m", prefetcht0_1 = "xb:n0F181m", prefetcht1_1 = "xb:n0F182m", prefetcht2_1 = "xb:n0F183m", - psadbw_2 = "rmo:660FF6rM", pshufd_3 = "rmio:660F70rMU", pshufhw_3 = "rmio:F30F70rMU", pshuflw_3 = "rmio:F20F70rMU", @@ -1380,23 +1394,6 @@ local map_op = { psrldq_2 = "rio:660F733mU", psrlq_2 = "rmo:660FD3rM|rio:660F732mU", psrlw_2 = "rmo:660FD1rM|rio:660F712mU", - psubb_2 = "rmo:660FF8rM", - psubd_2 = "rmo:660FFArM", - psubq_2 = "rmo:660FFBrM", - psubsb_2 = "rmo:660FE8rM", - psubsw_2 = "rmo:660FE9rM", - psubusb_2 = "rmo:660FD8rM", - psubusw_2 = "rmo:660FD9rM", - psubw_2 = "rmo:660FF9rM", - punpckhbw_2 = "rmo:660F68rM", - punpckhdq_2 = "rmo:660F6ArM", - punpckhqdq_2 = "rmo:660F6DrM", - punpckhwd_2 = "rmo:660F69rM", - punpcklbw_2 = "rmo:660F60rM", - punpckldq_2 = "rmo:660F62rM", - punpcklqdq_2 = "rmo:660F6CrM", - punpcklwd_2 = "rmo:660F61rM", - pxor_2 = "rmo:660FEFrM", rcpps_2 = "rmo:0F53rM", rcpss_2 = "rro:F30F53rM|rx/od:", rsqrtps_2 = "rmo:0F52rM", @@ -1640,6 +1637,12 @@ local map_op = { -- AVX, AVX2 integer ops -- In general, xmm requires AVX, ymm requires AVX2. 
+ vaesdec_3 = "rrmo:660F38VDErM", + vaesdeclast_3 = "rrmo:660F38VDFrM", + vaesenc_3 = "rrmo:660F38VDCrM", + vaesenclast_3 = "rrmo:660F38VDDrM", + vaesimc_2 = "rmo:660F38uDBrM", + vaeskeygenassist_3 = "rmio:660F3AuDFrMU", vlddqu_2 = "rxoy:F20FuF0rM", vmaskmovdqu_2 = "rro:660FuF7rM", vmovdqa_2 = "rmoy:660Fu6FrM|mroy:660Fu7FRm", @@ -1880,10 +1883,11 @@ local function dopattern(pat, args, sz, op, needrex) if t.xreg and t.xreg > 7 then rex = rex + 2 end if s > 7 then rex = rex + 4 end if needrex then rex = rex + 16 end - wputop(szov, opcode, rex, vex); opcode = nil + local psz, sk = wputop(szov, opcode, rex, vex, s < 0, t.vreg or t.vxreg) + opcode = nil local imark = sub(pat, -1) -- Force a mark (ugly). -- Put ModRM/SIB with regno/last digit as spare. - wputmrmsib(t, imark, s, addin and addin.vreg) + wputmrmsib(t, imark, s, addin and addin.vreg, psz, sk) addin = nil elseif map_vexarg[c] ~= nil then -- Encode using VEX prefix local b = band(opcode, 255); opcode = shr(opcode, 8) @@ -1910,8 +1914,8 @@ local function dopattern(pat, args, sz, op, needrex) if szov == "q" and rex == 0 then rex = rex + 8 end if needrex then rex = rex + 16 end if addin and addin.reg == -1 then - wputop(szov, opcode - 7, rex, vex) - waction("VREG", addin.vreg); wputxb(0) + local psz, sk = wputop(szov, opcode - 7, rex, vex, true) + wvreg("opcode", addin.vreg, psz, sk) else if addin and addin.reg > 7 then rex = rex + 1 end wputop(szov, opcode, rex, vex) @@ -1955,7 +1959,7 @@ local function dopattern(pat, args, sz, op, needrex) local reg = a.reg if reg < 0 then wputb(0) - waction("VREG", a.vreg); wputxb(5) + wvreg("imm.hi", a.vreg) else wputb(shl(reg, 4)) end @@ -2107,8 +2111,8 @@ if x64 then rex = a.reg > 7 and 9 or 8 end end - wputop(sz, opcode, rex) - if vreg then waction("VREG", vreg); wputxb(0) end + local psz, sk = wputop(sz, opcode, rex, nil, vreg) + wvreg("opcode", vreg, psz, sk) if luamode then waction("IMM_D", format("ffi.cast(\"uintptr_t\", %s) %% 2^32", op64)) waction("IMM_D", 
format("ffi.cast(\"uintptr_t\", %s) / 2^32", op64)) diff --git a/src/dynasm.lua b/src/dynasm.lua index 10d93c0f8f..586e2a13dd 100644 --- a/src/dynasm.lua +++ b/src/dynasm.lua @@ -1141,14 +1141,13 @@ local function setlang(infile) g_opt.comment = "--|" g_opt.endcomment = "" end + -- Set initial defines only available in Lua mode. + local ffi = require("ffi") + map_def.ARCH = ffi.arch --for `.arch ARCH` + map_def[upper(ffi.arch)] = 1 --for `.if X86 ...` + map_def.OS = ffi.os --for `.if OS == 'Windows'` + map_def[upper(ffi.os)] = 1 --for `.if WINDOWS ...` end - - -- Set initial defines only available in Lua mode. - local ffi = require'ffi' - map_def.ARCH = ffi.arch --for `.arch ARCH` - map_def[upper(ffi.arch)] = 1 --for `.if X86 ...` - map_def.OS = ffi.os --for `.if OS == 'Windows'` - map_def[upper(ffi.os)] = 1 --for `.if WINDOWS ...` end -- Parse arguments. diff --git a/src/lib/ipsec/.images/esp.png b/src/lib/ipsec/.images/esp.png new file mode 100644 index 0000000000..09c165442b Binary files /dev/null and b/src/lib/ipsec/.images/esp.png differ diff --git a/src/lib/ipsec/README.md b/src/lib/ipsec/README.md new file mode 100644 index 0000000000..a4a3f6eac0 --- /dev/null +++ b/src/lib/ipsec/README.md @@ -0,0 +1,45 @@ +### IPsec/ESP (lib.ipsec.esp) + +The `lib.ipsec.esp` module contains two classes `esp_v6_encrypt` and +`esp_v6_decrypt` which implement packet encryption and +decryption with IPsec ESP using the AES-GCM-128 cipher in IPv6 transport +mode. Packets are encrypted with the key and salt provided to the classes' +constructors. These classes do not implement any key exchange protocol. + +The encrypt class accepts IPv6 packets and inserts a new [ESP +header](https://en.wikipedia.org/wiki/IPsec#Encapsulating_Security_Payload) +between the outer IPv6 header and the inner protocol header (e.g. TCP, +UDP, L2TPv3) and also encrypts the contents of the inner protocol +header. 
The decrypt class does the reverse: it decrypts the inner +protocol header and removes the ESP protocol header. + +References: + +- [IPsec Wikipedia page](https://en.wikipedia.org/wiki/IPsec). +- [RFC 4106](https://tools.ietf.org/html/rfc4106) on using AES-GCM with IPsec ESP. +- [LISP Data-Plane Confidentiality](https://tools.ietf.org/html/draft-ietf-lisp-crypto-02) example of a software layer above these apps that includes key exchange. + +— Method **esp_v6_encrypt:new** *config* + +— Method **esp_v6_decrypt:new** *config* + +Returns a new encryption/decryption context respectively. *Config* must +be a table with the following keys: + +* `mode` - Encryption mode (string). The only accepted value is the + string `"aes-128-gcm"`. +* `keymat` - Hex string containing 16 bytes of key material as specified + in RFC 4106. +* `salt` - Hex string containing four bytes of salt as specified in + RFC 4106. + +— Method **esp_v6_encrypt:encapsulate** *packet* + +Returns a freshly allocated packet that is the encrypted and encapsulated +version of *packet*. + +— Method **esp_v6_decrypt:decapsulate** *packet* + +Returns a freshly allocated packet that is the decrypted and decapsulated +version of *packet* or `nil` if authentication failed. The contents of +*packet* are destroyed in the process. 
diff --git a/src/lib/ipsec/aes_128_gcm.lua b/src/lib/ipsec/aes_128_gcm.lua new file mode 100644 index 0000000000..e9677c4f04 --- /dev/null +++ b/src/lib/ipsec/aes_128_gcm.lua @@ -0,0 +1,104 @@ +module(..., package.seeall) +local ffi = require("ffi") +local C = ffi.C +local ASM = require("lib.ipsec.aes_128_gcm_avx") +local header = require("lib.protocol.header") +local lib = require("core.lib") +local ntohl, htonl, htonll = lib.ntohl, lib.htonl, lib.htonll + + +-- IV pseudo header + +local iv = subClass(header) + +-- Class variables +iv._name = "iv" +iv:init( + { + [1] = ffi.typeof[[ + struct { + uint8_t salt[4]; + uint64_t iv; + uint32_t padding; + } __attribute__((packed, aligned(16))) + ]] + }) + +-- Class methods + +function iv:new (salt) + local o = iv:superClass().new(self) + local h = o:header() + o:salt(salt) + h.padding = htonl(0x1) + return o +end + +-- Instance methods + +function iv:salt (salt) + local h = self:header() + if salt ~= nil then + ffi.copy(h.salt, salt, 4) + else + return h.salt + end +end + +function iv:iv (iv) + local h = self:header() + if iv ~= nil then + h.iv = htonll(iv) + else + return self:header_ptr()+4, 8 + end +end + + +-- AES-128-GCM wrapper + +local function u8_ptr (ptr) return ffi.cast("uint8_t *", ptr) end + +local aes_128_gcm = {} + +function aes_128_gcm:new (keymat, salt) + assert(keymat and #keymat == 32, "Need 16 bytes of key material.") + assert(salt and #salt == 8, "Need 4 bytes of salt.") + local o = {} + o.keymat = ffi.new("uint8_t[16]") + ffi.copy(o.keymat, lib.hexundump(keymat, 16), 16) + o.iv = iv:new(lib.hexundump(salt, 4)) + -- Compute subkey (H) + o.hash_subkey = ffi.new("uint8_t[?] 
__attribute__((aligned(16)))", 128) + o.gcm_data = ffi.new("gcm_data[1] __attribute__((aligned(16)))") + ASM.aes_keyexp_128_enc_avx(o.keymat, o.gcm_data[0].expanded_keys) + ASM.aesni_gcm_precomp_avx_gen4(o.gcm_data, o.hash_subkey) + o.blocksize = 128 + o.auth_size = 16 + o.auth_buf = ffi.new("uint8_t[?]", o.auth_size) + o.aad_size = 16 + return setmetatable(o, {__index=aes_128_gcm}) +end + +function aes_128_gcm:encrypt (out_ptr, payload, length, esp) + self.iv:iv(esp:seq_no()) + ASM.aesni_gcm_enc_avx_gen4(self.gcm_data, + out_ptr, + payload, length, + u8_ptr(self.iv:header_ptr()), + u8_ptr(esp:header_ptr()), esp:sizeof(), + payload + length, self.auth_size) +end + +function aes_128_gcm:decrypt (out_ptr, ciphertext, length, esp) + self.iv:iv(esp:seq_no()) + ASM.aesni_gcm_dec_avx_gen4(self.gcm_data, + out_ptr, + ciphertext, length, + u8_ptr(self.iv:header_ptr()), + u8_ptr(esp:header_ptr()), esp:sizeof(), + self.auth_buf, self.auth_size) + return C.memcmp(self.auth_buf, ciphertext + length, self.auth_size) == 0 +end + +return aes_128_gcm diff --git a/src/lib/ipsec/aes_128_gcm_avx.dasl b/src/lib/ipsec/aes_128_gcm_avx.dasl new file mode 100644 index 0000000000..bd341a7b63 --- /dev/null +++ b/src/lib/ipsec/aes_128_gcm_avx.dasl @@ -0,0 +1,498 @@ +-- Selected AES GCM routines, based heavily on the Intel IPsec code from: +-- https://github.com/lukego/intel-ipsec/blob/master/code/avx2/gcm_avx_gen4.asm +-- https://github.com/lukego/intel-ipsec/blob/master/code/gcm_defines.asm +-- https://github.com/lukego/intel-ipsec/blob/master/code/aes_keyexp_128.asm + +local dasm = require("dasm") +local ffi = require("ffi") + +ffi.cdef[[ +typedef struct gcm_data +{ + uint8_t expanded_keys[16*11]; + uint8_t shifted_hkey_1[16]; + uint8_t shifted_hkey_2[16]; + uint8_t shifted_hkey_3[16]; + uint8_t shifted_hkey_4[16]; + uint8_t shifted_hkey_5[16]; + uint8_t shifted_hkey_6[16]; + uint8_t shifted_hkey_7[16]; + uint8_t shifted_hkey_8[16]; + uint8_t shifted_hkey_1_k[16]; + uint8_t 
shifted_hkey_2_k[16]; + uint8_t shifted_hkey_3_k[16]; + uint8_t shifted_hkey_4_k[16]; + uint8_t shifted_hkey_5_k[16]; + uint8_t shifted_hkey_6_k[16]; + uint8_t shifted_hkey_7_k[16]; + uint8_t shifted_hkey_8_k[16]; +} gcm_data; +]] + +|.arch x64 +|.actionlist actions +|.globalnames globalnames + +|.define arg1, rdi +|.define arg2, rsi +|.define arg3, rdx +|.define arg4, rcx +|.define arg5, r8 +|.define arg6, r9 +|.define arg7, [r14 + 32 + 8*1] +|.define arg8, [r14 + 32 + 8*2] +|.define arg9, [r14 + 32 + 8*3] + +local function ghash_tail(Dst, gh, t1, t2, t3) + | vmovdqa xmm(t3), [->poly2] + | vpclmulqdq xmm(t2), xmm(t3), xmm(gh), 0x01; vpslldq xmm(t2), xmm(t2), 8; vpxor xmm(gh), xmm(gh), xmm(t2) + | vpclmulqdq xmm(t2), xmm(t3), xmm(gh), 0x00; vpsrldq xmm(t2), xmm(t2), 4 + | vpclmulqdq xmm(gh), xmm(t3), xmm(gh), 0x10; vpslldq xmm(gh), xmm(gh), 4; vpxor xmm(gh), xmm(gh), xmm(t2) + | vpxor xmm(gh), xmm(gh), xmm(t1) +end + +local function ghash_mul(Dst, gh, hk, t1, t2, t3) + | vpclmulqdq xmm(t1), xmm(gh), xmm(hk), 0x11 + | vpclmulqdq xmm(t2), xmm(gh), xmm(hk), 0x00 + | vpclmulqdq xmm(t3), xmm(gh), xmm(hk), 0x01 + | vpclmulqdq xmm(gh), xmm(gh), xmm(hk), 0x10 + | vpxor xmm(gh), xmm(gh), xmm(t3) + + | vpsrldq xmm(t3), xmm(gh), 8 + | vpslldq xmm(gh), xmm(gh), 8 + | vpxor xmm(t1), xmm(t1), xmm(t3) + | vpxor xmm(gh), xmm(gh), xmm(t2) + || ghash_tail(Dst, gh, t1, t2, t3) +end + +local function almost_encrypt_8(Dst, initial, ctr, t_key, operation, loop_idx, before_round) + local prev = ctr + for i = initial, 8 do + if loop_idx == "in_order" then + | vpaddd xmm(i), xmm(prev), [->one] + else + | vpaddd xmm(i), xmm(prev), [->onef] + end + prev = i + end + if prev ~= ctr then + | vmovdqa xmm(ctr), xmm(prev) + end + if loop_idx == "in_order" then + for i = initial, 8 do + | vpshufb xmm(i), xmm(i), [->shuf_mask] + end + end + + | vmovdqa xmm(t_key), [arg1+16*0] + for i = initial, 8 do + | vpxor xmm(i), xmm(i), xmm(t_key) + end + for j = 1, 9 do + before_round(j) + | vmovdqa 
xmm(t_key), [arg1+16*j] + for i = initial, 8 do + | vaesenc xmm(i), xmm(i), xmm(t_key) + end + end + before_round(10) +end + +local function encrypt_8(Dst, initial, t, ctr, t_key, operation) + almost_encrypt_8(Dst, initial, ctr, t_key, operation, "in_order", function() end) + + | vmovdqa xmm(t_key), [arg1+16*10] + for i = initial, 8 do + | vaesenclast xmm(i), xmm(i), xmm(t_key) + end + + for i = initial, 8 do + | vmovdqu xmm(t), [arg3 + r11 + 16*(i-initial)] + | vpxor xmm(i), xmm(i), xmm(t) + | vmovdqu [arg2 + r11 + 16*(i-initial)], xmm(i) + if operation == "dec" then + | vmovdqa xmm(i), xmm(t) + end + | vpshufb xmm(i), xmm(i), [->shuf_mask] + end + | add r11, (9-initial)*16 +end + +local function initial_blocks(Dst, num_initial_blocks, t, ctr, t_key, operation) + local i = 8 - num_initial_blocks + | mov r10, arg6 + | mov r12, arg7 + | mov r11, r12 + + | vpxor xmm(i), xmm(i), xmm(i) + |1: + | vmovd xmm(t[1]), dword [r10] + | vpslldq xmm(t[1]), xmm(t[1]), 12 + | vpsrldq xmm(i), xmm(i), 4 + | vpxor xmm(i), xmm(i), xmm(t[1]) + | add r10, 4 + | sub r12, 4 + | jg <1 + | cmp r11, 16 + | je >3 + | mov r12, 16 + |2: + | vpsrldq xmm(i), xmm(i), 4 + | sub r12, 4 + | cmp r12, r11 + | jg <2 + |3: + + | vpshufb xmm(i), xmm(i), [->shuf_mask] + | xor r11, r11 + | mov rax, arg5 + | vmovdqu xmm(ctr), [rax] + | vpshufb xmm(ctr), xmm(ctr), [->shuf_mask] + || encrypt_8(Dst, 9-num_initial_blocks, t[1], ctr, t_key, operation) + + local prev + | vmovdqu xmm(t[2]), [arg1 + 16*11] + for j = 8-num_initial_blocks, 8 do + if prev then + | vpxor xmm(j), xmm(j), xmm(prev) + end + ghash_mul(Dst, j, t[2], t[1], t[3], t[4]) + prev = j + end + + | vmovdqa [rsp], xmm8 + | vmovdqa xmm(t[3]), xmm8 + | cmp r13, 128 + | jl >9 + || encrypt_8(Dst, 1, t[1], ctr, t_key, operation) + | vpxor xmm1, xmm1, [rsp] + |9: +end + +local function mulqdqxor(Dst, out, qdq1, qdq2, qdqI, xor) + | vpclmulqdq xmm(xor or out), xmm(qdq1), xmm(qdq2), qdqI + if xor then + | vpxor xmm(out), xmm(out), xmm(xor) + end +end + 
+local function ghash_8_encrypt_8_parallel(Dst, t, ctr, loop_idx, operation) + | add r15b, 8 + | vmovdqa xmm(t[2]), xmm1 + for i = 2, 8 do + | vmovdqa [rsp + 16*(i-1)], xmm(i) + end + + almost_encrypt_8(Dst, 1, ctr, t[1], operation, loop_idx, function(round) + if round >= 3 then + | vmovdqa xmm(t[5]), [arg1 + 16*(21-round)] + local xor + if round > 3 then + | vmovdqa xmm(t[2]), [rsp + 16*(round-3)] + xor = t[3] + end + mulqdqxor(Dst, t[4], t[2], t[5], 0x11, xor) + mulqdqxor(Dst, t[7], t[2], t[5], 0x00, xor) + mulqdqxor(Dst, t[6], t[2], t[5], 0x01, xor) + mulqdqxor(Dst, t[6], t[2], t[5], 0x10, t[3]) + end + end) + + | vmovdqa xmm(t[5]), [arg1+16*10] + for j = 1, 8 do + local i = j - 1 + | vpxor xmm(t[2]), xmm(t[5]), [arg3 + r11 + 16*i] + if operation == "enc" then + | vaesenclast xmm(j), xmm(j), xmm(t[2]) + | vmovdqu [arg2 + r11 + 16*i], xmm(j) + else + | vaesenclast xmm(t[3]), xmm(j), xmm(t[2]) + | vmovdqu xmm(j), [arg3 + r11 + 16*i] + | vmovdqu [arg2 + r11 + 16*i], xmm(t[3]) + end + | vpshufb xmm(j), xmm(j), [->shuf_mask] + end + + | vpslldq xmm(t[3]), xmm(t[6]), 8 + | vpsrldq xmm(t[6]), xmm(t[6]), 8 + | vpxor xmm(t[7]), xmm(t[7]), xmm(t[3]) + | vpxor xmm(t[1]), xmm(t[4]), xmm(t[6]) + || ghash_tail(Dst, t[7], t[1], t[2], t[3]) + | vpxor xmm1, xmm1, xmm(t[7]) + | add r11, 128 + | sub r13, 128 +end + +local function ghash_last_8(Dst, t) + for i = 1, 8 do + | vmovdqa xmm(t[5]), [arg1 + 16*(19-i)] + | vpshufd xmm(t[2]), xmm(i), 0x4e + | vpshufd xmm(t[3]), xmm(t[5]), 0x4e + | vpxor xmm(t[2]), xmm(t[2]), xmm(i) + | vpxor xmm(t[3]), xmm(t[3]), xmm(t[5]) + mulqdqxor(Dst, t[6], i, t[5], 0x11, i ~= 1 and t[4]) + mulqdqxor(Dst, t[7], i, t[5], 0x00, i ~= 1 and t[4]) + mulqdqxor(Dst, 1, t[2], t[3], 0x00, i ~= 1 and t[4]) + end + | vpxor xmm1, xmm1, xmm(t[6]) + | vpxor xmm(t[2]), xmm1, xmm(t[7]) + + | vpslldq xmm(t[4]), xmm(t[2]), 8 + | vpsrldq xmm(t[2]), xmm(t[2]), 8 + | vpxor xmm(t[7]), xmm(t[7]), xmm(t[4]) + | vpxor xmm(t[6]), xmm(t[6]), xmm(t[2]) + || ghash_tail(Dst, t[7], 
t[6], t[2], t[3]) + | vmovdqa xmm14, xmm15 +end + +local function encrypt_single_block(Dst, x) + | vpxor xmm(x), xmm(x), [arg1+16*0] + for i = 1, 9 do + | vaesenc xmm(x), xmm(x), [arg1+16*i] + end + | vaesenclast xmm(x), xmm(x), [arg1+16*10] +end + +local function prologue(Dst) + for i = 12, 15 do + | push Rq(i) + end + | mov r14, rsp + | sub rsp, 16*8 + | and rsp, -64 +end + +local function epilogue(Dst) + | mov rsp, r14 + for i = 15, 12, -1 do + | pop Rq(i) + end + | ret +end + +local function gcm_enc_dec(Dst, operation, pc) + prologue(Dst) + + | mov r13, arg4 + | and r13, -16 + | mov r12, r13 + | shr r12, 4 + | and r12, 7 + | jz =>pc+0 + for i = 7, 2, -1 do + | cmp r12, i + | je =>pc+i + end + | jmp =>pc+1 + for i = 7, 0, -1 do + |=>pc+i: + || initial_blocks(Dst, i, {12, 13, 14, 15}, 9, 0, operation) + if i ~= 0 then + | sub r13, 16*i + | jmp >8 + end + end + + |8: + | cmp r13, 0 + | je >1 + | sub r13, 128 + | je >2 + | vmovd r15d, xmm9 + | and r15d, 255 + | vpshufb xmm9, xmm9, [->shuf_mask] + |3: + | cmp r15b, 255-8 + | jg >4 + || ghash_8_encrypt_8_parallel(Dst, {0, 10, 11, 12, 13, 14, 15}, 9, "out_order", operation) + | jne <3 + | vpshufb xmm9, xmm9, [->shuf_mask] + | jmp >2 + |4: + | vpshufb xmm9, xmm9, [->shuf_mask] + || ghash_8_encrypt_8_parallel(Dst, {0, 10, 11, 12, 13, 14, 15}, 9, "in_order", operation) + | vpshufb xmm9, xmm9, [->shuf_mask] + | jne <3 + | vpshufb xmm9, xmm9, [->shuf_mask] + |2: + || ghash_last_8(Dst, {0, 10, 11, 12, 13, 14, 15}) + |1: + + | mov r13, arg4 + | and r13, 15 + | je >1 + + | vpaddd xmm9, xmm9, [->one] + | vpshufb xmm9, xmm9, [->shuf_mask] + || encrypt_single_block(Dst, 9) + + | sub r11, 16 + | add r11, r13 + | vmovdqu xmm1, [arg3 + r11] + | lea r12, [->all_f] + | sub r12, r13 + | vmovdqu xmm2, [r12] + | vpshufb xmm1, xmm1, xmm2 + + if operation == "dec" then + | vmovdqa xmm2, xmm1 + end + | vpxor xmm9, xmm9, xmm1 + | vmovdqu xmm1, [r12 + 16] + | vpand xmm9, xmm9, xmm1 + if operation == "dec" then + | vpand xmm2, xmm2, xmm1 + 
else + | vmovdqa xmm2, xmm9 + end + | vpshufb xmm2, xmm2, [->shuf_mask] + | vpxor xmm14, xmm14, xmm2 + || ghash_mul(Dst, 14, 13, 0, 10, 11) + | sub r11, r13 + | add r11, 16 + + | vmovd rax, xmm9 + | cmp r13, 8 + | jle >2 + | mov [arg2 + r11], rax + | add r11, 8 + | vpsrldq xmm9, xmm9, 8 + | vmovd rax, xmm9 + | sub r13, 8 + |2: + | mov byte [arg2 + r11], al + | add r11, 1 + | shr rax, 8 + | sub r13, 1 + | jne <2 + + |1: + | mov r12, arg7 + | shl r12, 3 + | vmovd xmm15, r12d + + | shl arg4, 3 + | vmovd xmm1, arg4 + | vpslldq xmm15, xmm15, 8 + | vpxor xmm15, xmm15, xmm1 + + | vpxor xmm14, xmm14, xmm15 + || ghash_mul(Dst, 14, 13, 0, 10, 11) + | vpshufb xmm14, xmm14, [->shuf_mask] + | mov rax, arg5 + | vmovdqu xmm9, [rax] + || encrypt_single_block(Dst, 9) + | vpxor xmm9, xmm9, xmm14 + + | mov r10, arg8 + | mov r11, arg9 + | cmp r11, 16 + | je >3 + | cmp r11, 12 + | je >2 + | vmovd rax, xmm9 + | mov [r10], rax + | jmp >4 + |2: + | vmovd rax, xmm9 + | mov [r10], rax + | vpsrldq xmm9, xmm9, 8 + | vmovd eax, xmm9 + | mov [r10 + 8], eax + | jmp >4 + |3: + | vmovdqu [r10], xmm9 + |4: + + epilogue(Dst) +end + +local function precompute(Dst) + prologue(Dst) + + | vmovdqu xmm6, [arg2] + | vpshufb xmm6, xmm6, [->shuf_mask] + | vmovdqa xmm2, xmm6 + | vpsllq xmm6, xmm6, 1 + | vpsrlq xmm2, xmm2, 63 + | vmovdqa xmm1, xmm2 + | vpslldq xmm2, xmm2, 8 + | vpsrldq xmm1, xmm1, 8 + | vpor xmm6, xmm6, xmm2 + | vpshufd xmm2, xmm1, 0x24 + | vpcmpeqd xmm2, xmm2, [->two_one] + | vpand xmm2, xmm2, [->poly] + | vpxor xmm6, xmm6, xmm2 + | vmovdqa [arg1 + 16*11], xmm6 + + | vmovdqa xmm4, xmm6 + for i = 2, 8 do + || ghash_mul(Dst, 4, 6, 0, 1, 2) + | vmovdqa [arg1 + 16*(10+i)], xmm4 + end + + epilogue(Dst) +end + +local function keyexp(Dst) + | vmovdqu xmm1, [arg1] + | vmovdqa [arg2], xmm1 + | vpxor xmm3, xmm3, xmm3 + for i = 1, 10 do + | vaeskeygenassist xmm2, xmm1, i < 9 and 2^(i-1) or 27*(i-8) + | vpshufd xmm2, xmm2, 0xff + | vshufps xmm3, xmm3, xmm1, 0x10 + | vpxor xmm1, xmm1, xmm3 + | vshufps 
xmm3, xmm3, xmm1, 0x8c + | vpxor xmm1, xmm1, xmm3 + | vpxor xmm1, xmm1, xmm2 + | vmovdqa [arg2 + 16*i], xmm1 + end + | ret +end + +local function generator(Dst) + Dst:growpc(16) + + -- Functions + |->aesni_gcm_precomp_avx_gen4: + || precompute(Dst) + |.align 16 + |->aes_keyexp_128_enc_avx: + || keyexp(Dst) + |.align 16 + |->aesni_gcm_enc_avx_gen4: + || gcm_enc_dec(Dst, "enc", 0) + |.align 16 + |->aesni_gcm_dec_avx_gen4: + || gcm_enc_dec(Dst, "dec", 8) + + -- Data + |.align 64 + |->poly:; .dword 1, 0, 0, 0xC2000000 + |->poly2:; .dword 0xC2000000, 1, 0, 0xC2000000 + |->two_one:; .dword 1, 0, 0, 1 + |->shuf_mask: + for i = 15, 0, -1 do + |.byte i + end + for i = 0, 15 do + |.byte i + end + |->all_f:; .dword -1, -1, -1, -1 + | .dword 0, 0, 0, 0 + |->one:; .dword 1, 0, 0, 0 + |->onef:; .dword 0, 0, 0, 2^24 +end + +local Dst, globals = dasm.new(actions, nil, nil, 1 + #globalnames) +generator(Dst) +local mcode, size = Dst:build() +local entry = dasm.globals(globals, globalnames) +local fn_t = ffi.typeof("void(*)(gcm_data*, uint8_t*, const uint8_t*, uint64_t, uint8_t*, const uint8_t*, uint64_t, uint8_t*, uint64_t)") +return setmetatable({ + aes_keyexp_128_enc_avx = ffi.cast("void(*)(void*, void*)", entry.aes_keyexp_128_enc_avx), + aesni_gcm_precomp_avx_gen4 = ffi.cast("void(*)(gcm_data*, uint8_t*)", entry.aesni_gcm_precomp_avx_gen4), + aesni_gcm_enc_avx_gen4 = ffi.cast(fn_t, entry.aesni_gcm_enc_avx_gen4), + aesni_gcm_dec_avx_gen4 = ffi.cast(fn_t, entry.aesni_gcm_dec_avx_gen4), +}, {_anchor = mcode}) diff --git a/src/lib/ipsec/esp.lua b/src/lib/ipsec/esp.lua new file mode 100644 index 0000000000..36fb473401 --- /dev/null +++ b/src/lib/ipsec/esp.lua @@ -0,0 +1,149 @@ +module(..., package.seeall) +local datagram = require("lib.protocol.datagram") +local ethernet = require("lib.protocol.ethernet") +local esp = require("lib.protocol.esp") +local esp_tail = require("lib.protocol.esp_tail") +local aes_128_gcm = require("lib.ipsec.aes_128_gcm") +local lib = require("core.lib") 
+local ffi = require("ffi") + + +local esp_nh = 50 -- https://tools.ietf.org/html/rfc4303#section-2 +local esp_length = esp:sizeof() +local esp_tail_length = esp_tail:sizeof() + +function esp_v6_new (conf) + assert(conf.mode == "aes-128-gcm", "Only supports aes-128-gcm.") + return { aes_128_gcm = aes_128_gcm:new(conf.keymat, conf.salt), + seq_no = 0 } +end + + +local esp_v6_encrypt = {} + +function esp_v6_encrypt:new (conf) + local o = esp_v6_new(conf) + o.pad_buf = ffi.new("uint8_t[?]", o.aes_128_gcm.blocksize-1) + o.esp_buf = ffi.new("uint8_t[?]", o.aes_128_gcm.aad_size) + -- Fix me https://tools.ietf.org/html/rfc4303#section-3.3.3 + o.esp = esp:new_from_mem(o.esp_buf, esp_length) + o.esp:spi(0x0) -- Fix me, set esp:spi value. + o.esp_tail = esp_tail:new({}) + return setmetatable(o, {__index=esp_v6_encrypt}) +end + +-- Return next sequence number. +function esp_v6_encrypt:next_seq_no () + self.seq_no = self.seq_no + 1 + return self.seq_no +end + +function esp_v6_encrypt:encrypt (nh, payload, length) + local p = packet.allocate() + self.esp:seq_no(self:next_seq_no()) + packet.append(p, self.esp:header_ptr(), esp_length) + packet.append(p, payload, length) + local pad_length = self.aes_128_gcm.blocksize + - ((length + esp_tail_length) % self.aes_128_gcm.blocksize) + packet.append(p, self.pad_buf, pad_length) + self.esp_tail:next_header(nh) + self.esp_tail:pad_length(pad_length) + packet.append(p, self.esp_tail:header_ptr(), esp_tail_length) + packet.append(p, self.pad_buf, self.aes_128_gcm.auth_size) + self.aes_128_gcm:encrypt(packet.data(p) + esp_length, + packet.data(p) + esp_length, + length + pad_length + esp_tail_length, + self.esp) + return p +end + +function esp_v6_encrypt:encapsulate (p) + local plain = datagram:new(p, ethernet) + local eth = plain:parse_match() + local ip = plain:parse_match() + local nh = ip:next_header() + local encrypted = datagram:new(self:encrypt(nh, plain:payload())) + local _, length = encrypted:payload() + ip:next_header(esp_nh) + 
ip:payload_length(length) + encrypted:push(ip) + encrypted:push(eth) + return encrypted:packet() +end + + +local esp_v6_decrypt = {} + +function esp_v6_decrypt:new (conf) + local o = esp_v6_new(conf) + o.esp_overhead_size = esp_length + o.aes_128_gcm.auth_size + o.min_payload_length = o.aes_128_gcm.blocksize + o.esp_overhead_size + return setmetatable(o, {__index=esp_v6_decrypt}) +end + +-- Verify sequence number. +function esp_v6_decrypt:check_seq_no (seq_no) + self.seq_no = self.seq_no + 1 + return self.seq_no <= seq_no +end + +function esp_v6_decrypt:decrypt (payload, length) + if length < self.min_payload_length + or (length - self.esp_overhead_size) % self.aes_128_gcm.blocksize ~= 0 + then return end + local data_start = payload + esp_length + local data_length = length - esp_length - self.aes_128_gcm.auth_size + local esp = esp:new_from_mem(payload, esp_length) + if self.aes_128_gcm:decrypt(data_start, data_start, data_length, esp) then + local esp_tail_start = data_start + data_length - esp_tail_length + local esp_tail = esp_tail:new_from_mem(esp_tail_start, esp_tail_length) + local cleartext_length = data_length - esp_tail:pad_length() - esp_tail_length + local p = packet.from_pointer(data_start, cleartext_length) + return esp:seq_no(), p, esp_tail:next_header() + end +end + +function esp_v6_decrypt:decapsulate (p) + local encrypted = datagram:new(p, ethernet) + local eth = encrypted:parse_match() + local ip = encrypted:parse_match() + local decrypted = nil + if ip:next_header() == esp_nh then + local seq_no, payload, nh = self:decrypt(encrypted:payload()) + if payload and self:check_seq_no(seq_no) then + local plain = datagram:new(payload) + ip:next_header(nh) + ip:payload_length(packet.length(payload)) + plain:push(ip) + plain:push(eth) + return plain:packet() + end + end +end + + +function selftest () + local C = require("ffi").C + local ipv6 = require("lib.protocol.ipv6") + local conf = { mode = "aes-128-gcm", + keymat = 
"00112233445566778899AABBCCDDEEFF", + salt = "00112233"} + local enc, dec = esp_v6_encrypt:new(conf), esp_v6_decrypt:new(conf) + local payload = packet.from_string( +[[abcdefghijklmnopqrstuvwxyz +ABCDEFGHIJKLMNOPQRSTUVWXYZ +0123456789]] + ) + local d = datagram:new(payload) + d:push(ipv6:new({})) + d:push(ethernet:new({type=0x86dd})) + -- Check integrity + local p = d:packet() + local p2 = dec:decapsulate(enc:encapsulate(p)) + if p2 and p2.length == p.length and C.memcmp(p, p2, p.length) == 0 then + print("selftest passed") + else + print("integrity check failed") + os.exit(1) + end +end diff --git a/src/lib/protocol/esp.lua b/src/lib/protocol/esp.lua new file mode 100644 index 0000000000..2395feb3bf --- /dev/null +++ b/src/lib/protocol/esp.lua @@ -0,0 +1,51 @@ +module(..., package.seeall) +local ffi = require("ffi") +local header = require("lib.protocol.header") +local lib = require("core.lib") +local ntohl, htonl = lib.ntohl, lib.htonl +local ntohll, htonll = lib.ntohll, lib.htonll + +local esp = subClass(header) + +-- Class variables +esp._name = "esp" +esp:init( + { + [1] = ffi.typeof[[ + struct { + uint32_t spi; + uint64_t seq_no; + } __attribute__((packed)) + ]] + }) + +-- Class methods + +function esp:new (config) + local o = esp:superClass().new(self) + o:spi(config.spi) + o:seq_no(config.seq_no) + return o +end + +-- Instance methods + +function esp:spi (spi) + local h = self:header() + if spi ~= nil then + h.spi = htonl(spi) + else + return(ntohl(h.spi)) + end +end + +function esp:seq_no (seq_no) + local h = self:header() + if seq_no ~= nil then + h.seq_no = htonll(seq_no) + else + return(ntohll(h.seq_no)) + end +end + +return esp diff --git a/src/lib/protocol/esp_tail.lua b/src/lib/protocol/esp_tail.lua new file mode 100644 index 0000000000..d8cadfee3a --- /dev/null +++ b/src/lib/protocol/esp_tail.lua @@ -0,0 +1,48 @@ +module(..., package.seeall) +local ffi = require("ffi") +local header = require("lib.protocol.header") + +local esp_tail = 
subClass(header) + +-- Class variables +esp_tail._name = "esp_tail" +esp_tail:init( + { + [1] = ffi.typeof[[ + struct { + uint8_t pad_length; + uint8_t next_header; + } __attribute__((packed)) + ]] + }) + +-- Class methods + +function esp_tail:new (config) + local o = esp_tail:superClass().new(self) + o:pad_length(config.pad_length) + o:next_header(config.next_header) + return o +end + +-- Instance methods + +function esp_tail:pad_length (length) + local h = self:header() + if length ~= nil then + h.pad_length = length + else + return h.pad_length + end +end + +function esp_tail:next_header (next_header) + local h = self:header() + if next_header ~= nil then + h.next_header = next_header + else + return h.next_header + end +end + +return esp_tail