diff --git a/src/Makefile b/src/Makefile
index bd51ff5..cc8102f 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -36,7 +36,8 @@ CC= $(DEFAULT_CC)
 # to slow down the C part by not omitting it. Debugging, tracebacks and
 # unwinding are not affected -- the assembler part has frame unwind
 # information and GCC emits it where needed (x64) or with -g (see CCDEBUG).
-CCOPT= -O2 -fomit-frame-pointer
+#CCOPT= -O2 -fomit-frame-pointer
+CCOPT= -fomit-frame-pointer
 # Use this if you want to generate a smaller binary (but it's slower):
 #CCOPT= -Os -fomit-frame-pointer
 # Note: it's no longer recommended to use -O3 with GCC 4.x.
@@ -54,9 +55,9 @@ CCOPT_arm64= -fno-omit-frame-pointer
 CCOPT_ppc=
 CCOPT_mips=
 #
-CCDEBUG=
+#CCDEBUG=
 # Uncomment the next line to generate debug information:
-#CCDEBUG= -g
+CCDEBUG= -g
 #
 CCWARN= -Wall
 # Uncomment the next line to enable more warnings:
@@ -585,10 +586,10 @@ endif
 endif
 endif
 
-Q= @
-E= @echo
-#Q=
-#E= @:
+#Q= @
+#E= @echo
+Q=
+E= @:
 
 ##############################################################################
 # Make targets.
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
index 74cd35b..7b5bb0d 100644
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -551,7 +551,7 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
   int large_ofs = check_offset(A64I_LDRx, ofs) == OFS_INVALID;
   Reg dest = (ra_used(ir) || large_ofs) ? ra_dest(as, ir, RSET_GPR) : RID_NONE;
   Reg node = ra_alloc1(as, ir->op1, RSET_GPR);
-  Reg key = RID_NONE, type = RID_TMP, idx = node;
+  Reg key = RID_NONE, /*type = RID_TMP,*/ idx = node;
   RegSet allow = rset_exclude(RSET_GPR, node);
   lua_assert(ofs % sizeof(Node) == 0);
   if (large_ofs) {
@@ -856,7 +856,51 @@ dotypecheck:
 #if LJ_HASFFI
 static void asm_cnew(ASMState *as, IRIns *ir)
 {
-  lua_unimpl();
+  CTState *cts = ctype_ctsG(J2G(as->J));
+  CTypeID id = (CTypeID)IR(ir->op1)->i;
+  CTSize sz;
+  CTInfo info = lj_ctype_info(cts, id, &sz);
+  const CCallInfo *ci = &lj_ir_callinfo[IRCALL_lj_mem_newgco];
+  IRRef args[4];
+  RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
+  //RegSet drop = RSET_SCRATCH;
+  lua_assert(sz != CTSIZE_INVALID || (ir->o == IR_CNEW && ir->op2 != REF_NIL));
+
+  as->gcsteps++;
+  asm_setupresult(as, ir, ci);  /* GCcdata * */
+
+  /* Initialize immutable cdata object. */
+  if (ir->o == IR_CNEWI) {
+    RegSet allow = (RSET_GPR & ~RSET_SCRATCH);
+    int32_t ofs = sizeof(GCcdata);
+    lua_assert(sz == 4 || sz == 8);
+    Reg r = ra_alloc1(as, ir->op2, allow);
+    emit_lso(as, A64I_STRx, r, RID_RET, ofs);
+  } else if (ir->op2 != REF_NIL) {  /* Create VLA/VLS/aligned cdata. */
+    ci = &lj_ir_callinfo[IRCALL_lj_cdata_newv];
+    args[0] = ASMREF_L;     /* lua_State *L */
+    args[1] = ir->op1;      /* CTypeID id   */
+    args[2] = ir->op2;      /* CTSize sz    */
+    args[3] = ASMREF_TMP1;  /* CTSize align */
+    asm_gencall(as, ci, args);
+    emit_loadi(as, ra_releasetmp(as, ASMREF_TMP1), (int32_t)ctype_align(info));
+    return;
+  }
+
+  /* Initialize gct and ctypeid. lj_mem_newgco() already sets marked. */
+  {
+    bool k = (id & 0xffff) ? 0 : 1;
+    Reg r = k ? RID_X1 : ra_allock(as, id, allow);
+    emit_lso(as, A64I_STRBw, RID_TMP, RID_RET, offsetof(GCcdata, gct));
+    emit_lso(as, A64I_STRHw, r, RID_RET, offsetof(GCcdata, ctypeid));
+    emit_d(as, A64I_MOVKw|A64F_U16(~LJ_TCDATA), RID_TMP);
+    if (k) emit_d(as, A64I_MOVKw|A64F_U16(id), RID_X1);
+  }
+  args[0] = ASMREF_L;     /* lua_State *L */
+  args[1] = ASMREF_TMP1;  /* MSize size   */
+  asm_gencall(as, ci, args);
+  ra_allockreg(as, (int32_t)(sz+sizeof(GCcdata)),
+               ra_releasetmp(as, ASMREF_TMP1));
 }
 #else
 #define asm_cnew(as, ir) ((void)0)
@@ -866,7 +910,37 @@
 
 static void asm_tbar(ASMState *as, IRIns *ir)
 {
-  lua_todo();
+  Reg tab, link, logr, gr, mark = RID_TMP;
+  uint32_t allow = RSET_GPR;
+  MCLabel l_end;
+  //uint32_t logimm = emit_isk13(0, LJ_GC_BLACK);
+
+  /* Allocate the registers */
+  tab = ra_alloc1(as, ir->op1, allow);
+  allow = rset_exclude(allow, tab);
+  link = ra_scratch(as, allow);
+  allow = rset_exclude(allow, link);
+  gr = ra_allock(as, i64ptr(J2G(as->J)), allow);
+  allow = rset_exclude(allow, gr);
+  logr = ra_scratch(as, allow);
+
+  l_end = emit_label(as);
+
+  emit_lso(as, A64I_STRw, link, tab, (int32_t)offsetof(GCtab, gclist));
+  emit_lso(as, A64I_STRBw, mark, tab, (int32_t)offsetof(GCtab, marked));
+  emit_lso(as, A64I_STRw, tab, gr,
+           (int32_t)offsetof(global_State, gc.grayagain));
+  emit_dnm(as, A64I_BICw, mark, mark, logr);
+  emit_lso(as, A64I_LDRw, link, gr,
+           (int32_t)offsetof(global_State, gc.grayagain));
+  emit_cond_branch(as, CC_EQ, l_end);
+  //if (logimm != (uint32_t)-1)
+  //  emit_n(as, A64I_TSTIw|A64F_IS(logimm), mark);
+  //else {
+  emit_nm(as, A64I_TSTw, mark, logr);
+  emit_d(as, A64I_MOVKw|A64F_U16(LJ_GC_BLACK), logr);
+  //}
+  emit_lso(as, A64I_LDRBw, mark, tab, (int32_t)offsetof(GCtab, marked));
 }
 
 static void asm_obar(ASMState *as, IRIns *ir)
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
index f616d07..d331af7 100644
--- a/src/lj_emit_arm64.h
+++ b/src/lj_emit_arm64.h
@@ -2,6 +2,7 @@
 **
 ** ARM64 instruction emitter.
 ** Copyright !!!TODO
 */
+#include <stdbool.h>
 
 static Reg ra_allock(ASMState *as, int32_t k, RegSet allow);
@@ -49,20 +50,125 @@ static void emit_dnm(ASMState *as, A64Ins ai, Reg rd, Reg rn, Reg rm)
 /* Encode constant in K12 format for data processing instructions. */
 static uint32_t emit_isk12(A64Ins ai, int32_t n)
 {
-  if (n >= 0 && n <= 4095)
+  if (n >= 0 && n <= 0xfff)
   {
-    return (n & 4095) << 10;
+    return (n & 0xfff) << 10;
   }
-  if ((n & 4095) == 0 && n < (4095 << 12))
+  if ((n & 0xfff) == 0 && n < (0xfff << 12))
   {
-    return (((n >> 12) & 4095) << 10) | (1 << 22);
+    return (((n >> 12) & 0xfff) << 10) | (1 << 22);
   }
   return -1;
 }
 
+int count_leading_zeroes(uint64_t value)
+{
+  int count = 0;
+  if (value == 0)
+    return 64;
+  if ((value & 0xffffffff00000000ull) == 0) {
+    count += 32;
+    value = value << 32;
+  }
+  if ((value & 0xffff000000000000ull) == 0) {
+    count += 16;
+    value = value << 16;
+  }
+  if ((value & 0xff00000000000000ull) == 0) {
+    count += 8;
+    value = value << 8;
+  }
+  if ((value & 0xf000000000000000ull) == 0) {
+    count += 4;
+    value = value << 4;
+  }
+  if ((value & 0xc000000000000000ull) == 0) {
+    count += 2;
+    value = value << 2;
+  }
+  if ((value & 0x8000000000000000ull) == 0)
+    count += 1;
+  count += (value == 0);
+  return count;
+}
+
 /* Encode constant in K13 format for data processing instructions. */
-static uint32_t emit_isk13(A64Ins ai, int64_t n) {
-  lua_unimpl();
+
+/* The source code has been copied from arm vixl implementation
+ * https://github.com/armvixl/vixl.git function IsImmLogical()
+ * !!TODO: We can speed up the steps by precalculating all possible
+ * encodings and then doing a binary search on that. It takes about
+ * storage space of 5k entries.
+ */
+static uint32_t emit_isk13(A64Ins ai, int64_t n)
+{
+  int is64 = ((ai & A64I_X) != 0x0);
+  uint32_t res;
+  bool neg = false;
+  uint64_t a, n_plus_a, b, n_plus_a_minus_b, c, mask;
+  int d, clz_a, clz_b, out_n, r, s;
+  uint64_t multiplier, candidate;
+  uint64_t multipliers[] = {
+    0x0000000000000001UL,
+    0x0000000100000001UL,
+    0x0001000100010001UL,
+    0x0101010101010101UL,
+    0x1111111111111111UL,
+    0x5555555555555555UL,
+  };
+
+  if (n & 1) {
+    neg = true;
+    n = ~n;
+  }
+  if (!is64) {
+    n <<= 32;
+    n |= n >> 32;
+  }
+  a = (n & -n); //Set to lowest set bit;
+  n_plus_a = n + a;
+  b = (n_plus_a & -n_plus_a);
+  n_plus_a_minus_b = n_plus_a - b;
+  c = (n_plus_a_minus_b & -n_plus_a_minus_b);
+  if (c) { /* General case */
+    int clz_c = count_leading_zeroes(c);
+    clz_a = count_leading_zeroes(a);
+    d = clz_a - clz_c;
+    mask = (1ull << d) - 1;
+    out_n = 0;
+  }
+  else { /* degenerate case */
+    if (!a)
+      return (uint32_t)-1;
+    else {
+      clz_a = count_leading_zeroes(a);
+      d = 64;
+      mask = ~(0ull);
+      out_n = 1;
+    }
+  }
+  if ((d & (d-1)) != 0) /* Not a power of 2 */
+    return (uint32_t)-1;
+
+  if (((b - a) & ~mask) != 0)
+    return (uint32_t)-1;
+
+  multiplier = multipliers[count_leading_zeroes(d) - 57];
+  candidate = (b - a) * multiplier;
+  if (n != candidate)
+    return (uint32_t)-1;
+
+  clz_b = b ? count_leading_zeroes(b) : -1;
+  s = clz_a - clz_b;
+  if (neg) {
+    s = d - s;
+    r = (clz_b + 1) & (d - 1);
+  }
+  else
+    r = (clz_a + 1) & (d - 1);
+  s = ((-d << 1) | (s - 1)) & 0x3f;
+  res = (out_n<<12) | (r << 6) | s;
+  return res;
 }
 
 /* -- Emit loads/stores --------------------------------------------------- */
diff --git a/src/lj_target_arm64.h b/src/lj_target_arm64.h
index cf7211f..121c9cd 100644
--- a/src/lj_target_arm64.h
+++ b/src/lj_target_arm64.h
@@ -149,7 +149,10 @@ typedef enum A64CC {
 
 typedef enum A64Ins {
   A64I_S = 0x20000000,
+  A64I_X = 0x80000000,
   A64I_EX = 0x00200000,
+  A64I_MOVKw = 0x72800000,
+  A64I_MOVKx = A64I_X|A64I_MOVKw,
   A64I_MOVK_16w = 0x72a00000,
   A64I_MOVK_16x = 0xf2a00000,
   A64I_MOVK_32x = 0xf2c00000,
@@ -177,14 +180,28 @@ typedef enum A64Ins {
   A64I_BR = 0xd61f0000,
   A64I_ANDw = 0x0a000000,
   A64I_ANDx = 0x8a000000,
+  A64I_ANDIw = 0x12000000,
+  A64I_ANDIx = A64I_X|A64I_ANDIw,
+  A64I_ANDSw = 0x6a000000,
+  A64I_ANDSx = A64I_X|A64I_ANDSw,
+  A64I_ANDSIw= 0x72000000,
+  A64I_ANDSIx= A64I_X|A64I_ANDSIw,
   A64I_ORRw = 0x2a000000,  /* orr w0,w0,w0 */
   A64I_ORRx = 0xaa000000,  /* orr x0,x0,x0 */
   A64I_EORw = 0x4a000000,
   A64I_EORx = 0xca000000,
+  A64I_BICw = 0x0a200000,
+  A64I_BICx = 0x8a200000,
+  A64I_BICSw = 0x6a200000,
+  A64I_BICSx = 0xea200000,
+  A64I_TSTIw = A64I_ANDSIw|0x1f,  /* tst wn, #imm*/
+  A64I_TSTIx = A64I_ANDSIx|0x1f,  /* tst xn, #imm*/
+  A64I_TSTw = A64I_ANDSw|0x1f,  /* tst wn, wm {,#shift}*/
+  A64I_TSTx = A64I_ANDSx|0x1f,  /* tst xn, xm {,#shift}*/
   A64I_CCMPw = 0x7a400000,  /* ccmp w0,w0,#0,eq */
   A64I_CCMPx = 0xfa400000,  /* ccmp x0,x0,#0,eq */
   A64I_STRBw = 0x39000000,  /* strb w0,[x0] */
-  A64I_STRHw = 0x79000000,  /* strh w0,[x0] */
+  A64I_STRHw = 0x79000000,
   A64I_STRw = 0xb9000000,  /* str w0,[x0] */
   A64I_STRx = 0xf9000000,  /* str x0,[x0] */
   A64I_SUBx = 0xcb000000,