/* ES40 emulator -- JIT engine implementation. */
#ifdef ES40_JIT

// NOTE(review): in this copy of the file the header names on the angle-bracket #include lines
// were lost (only the bare `#include` tokens survived). The names below are reconstructed from
// what this translation unit visibly uses (FILE/printf/snprintf, memset, uintN_t, std::chrono,
// asmjit::*). Confirm every one of them against the repository before merging.
#include "jitengine.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
// NOTE(review): a fourth standard #include stood here; its name cannot be recovered from this copy.
// note_exec times its own stats-print I/O (excluded from the wall-clock RPCC)
#include <chrono>
#define ASMJIT_STATIC
#include <asmjit/x86.h> // NOTE(review): presumed asmjit x86 umbrella header -- confirm

// asmjit's x64 backend emits x86-64; asmjit's CallConv maps the host C ABI
// (Microsoft x64 or System V) from the build env. Block any other host arch
// rather than silently emit for a 32-bit or non-x86 target. (until ARM is added)
// I don't foresee ever wanting to add or use 32-bit x86.
#if !defined(_M_X64) && !defined(__x86_64__)
#error "ES40_JIT requires an x86-64 host (asmjit x64 backend emits 64-bit code)"
#endif

namespace {

#ifdef JIT_DISASM
// Log/error any asmjit emit failure (badly formed instruction / bad operand) that the Assembler
// would otherwise accept and then ship as a silently truncated block. Records the failure so
// compile_block discards the block instead of adding it.
class JitErrorHandler : public asmjit::ErrorHandler {
public:
  FILE* fp = nullptr;   // disassembly trace file (falls back to stderr if unopened)
  int cpu_id = -1;      // printed in the log prefix
  bool failed = false;  // set on the first reported error and never cleared here

  // asmjit error callback: remember that an error happened and log its message.
  void handle_error(asmjit::Error err, const char* message, asmjit::BaseEmitter*) override {
    (void) err;
    failed = true;
    fprintf(fp ? fp : stderr, "[JIT][CPU%d][EMIT-ERROR] %s\n", cpu_id, message);
  }
};
#endif

// Operation kinds classify() can return. OP_NONE is the "no compiled form" answer.
// The enumerator ORDER is significant: is_branch()/is_fp_branch() test contiguous ranges.
enum SafeOp {
  OP_NONE,
  OP_ADDQ, OP_SUBQ, OP_ADDL, OP_SUBL,
  OP_S4ADDQ, OP_S8ADDQ, OP_S4SUBQ, OP_S8SUBQ,
  OP_S4ADDL, OP_S8ADDL, OP_S4SUBL, OP_S8SUBL, // INTA (0x10) scaled longword: sext32((Ra*scale) +/- Rb)
  OP_CMPBGE,                                  // INTA (0x10) per-byte unsigned compare -> 8-bit mask
  OP_SEXTB, OP_SEXTW,                         // FPTI (0x1c) CIX/BWX: Rc = sign-extend byte/word of op2
  OP_CTPOP, OP_CTLZ, OP_CTTZ,                 // FPTI (0x1c) CIX bit-count of op2: popcount / leading / trailing zeros
  OP_ITOFS, OP_ITOFF, OP_ITOFT,               // ITFP (0x14) int->FP reg moves: f[Fc] = fmt(Ra)
  OP_FTOIS, OP_FTOIT,                         // FPTI (0x1c) FP->int reg moves: Rc = fmt(f[Fa])
  OP_FLTL,                                    // FLTL (0x17) non-arithmetic: FPCR moves, CPYSx, FCMOVx, CVTLQ/QL
  OP_CVTQT, OP_CVTQS,                         // FLTI (0x16) int->IEEE convert: f[Fc] = (T/S)(s64)f[Fb] via SSE
  OP_ADDT, OP_SUBT, OP_MULT, OP_DIVT,         // FLTI (0x16) IEEE T-float arith: f[Fc] = f[Fa] op f[Fb] via SSE
  OP_CMPTUN, OP_CMPTEQ, OP_CMPTLT, OP_CMPTLE, // FLTI (0x16) IEEE T-float compares: f[Fc] = (cmp) ? 2.0 : 0.0
  OP_ADDS, OP_SUBS, OP_MULS, OP_DIVS,         // FLTI (0x16) IEEE S-float arith: f[Fc] = round_single(f[Fa] op f[Fb])
  OP_CVTST, OP_CVTTS, OP_CVTTQ,               // FLTI (0x16) IEEE converts: S->T widen, T->S narrow, T->Q to int
  OP_SQRTS, OP_SQRTT,                         // ITFP (0x14) IEEE sqrt: f[Fc] = sqrt(f[Fb]) via sqrtss/sqrtsd
  OP_FLTV,                                    // FLTV (0x15) VAX arith/convert/compare: f[Fc] via jit_fltv helper
  OP_AND, OP_BIS, OP_XOR, OP_BIC, OP_ORNOT, OP_EQV,
  OP_CMOV,                                    // INTL (0x11) conditional moves CMOVxx: Rc = cond(Ra) ? op2 : Rc
  OP_AMASK, OP_IMPLVER,                       // INTL (0x11) CPU feature probes: Rc = op2 & ~CPU_AMASK / Rc = CPU_IMPLVER
  OP_CMPEQ, OP_CMPLT, OP_CMPLE, OP_CMPULT, OP_CMPULE,
  OP_SLL, OP_SRL, OP_SRA,
  OP_MULQ, OP_MULL, OP_UMULH,                 // INTM (0x13): MULL = sext32(Ra*op2); UMULH = hi64 of unsigned Ra*op2
  OP_EXTL, OP_EXTH, OP_INSL, OP_INSH, OP_MSKL, OP_MSKH, OP_ZAP, // INTS (0x12) byte-manip (Rb&7 keyed)
  OP_NOP, OP_MFENCE,                          // MISC (0x18): prefetch/cache hints (no-op), barriers (mfence)
  OP_RPCC, OP_RC, OP_RS,                      // MISC (0x18) state reads (Ra dest): cycle counter / read-and-clear,set intr flag
  OP_LDQ, OP_LDL,                             // memory-format loads: Ra = MEM[Rb + disp16]
  OP_LDBU, OP_LDWU,                           // BWX byte/word loads (0x0a/0x0c): Ra = zero-extend MEM[Rb+disp16].{b,w}
  OP_LDL_L, OP_LDQ_L,                         // load-locked (0x2a/0x2b): Ra = MEM[Rb+disp16] + establish LL/SC monitor
  OP_STL_C, OP_STQ_C,                         // store-conditional (0x2e/0x2f): cond store Ra -> MEM; Ra = success(1)/fail(0)
  OP_STQ, OP_STL,                             // memory-format stores: MEM[Rb + disp16] = Ra
  OP_STB, OP_STW,                             // BWX byte/word stores (0x0e/0x0d): MEM[Rb + disp16].{b,w} = Ra
  OP_LDQ_U, OP_STQ_U,                         // unaligned quad (0x0b/0x0f): MEM[(Rb+disp16) & ~7] load/store
  OP_LDT, OP_LDS, OP_STT, OP_STS,             // FP memory (0x23/0x22/0x27/0x26): f[Fa] <-> MEM, LDT/STT raw, LDS/STS ieee conv
  OP_LDF, OP_LDG, OP_STF, OP_STG,             // VAX FP memory (0x20/0x21/0x24/0x25): f[Fa] <-> MEM, F/G format conversion (helper)
  OP_LDA, OP_LDAH,                            // load-address: Ra = Rb + disp16 (<<16 for LDAH); pure ALU
  OP_HW_MFPR,                                 // HW_MFPR (0x19), PALmode only: Ra = IPR[(ins>>8)&0xff] via helper
  OP_HW_LDL, OP_HW_LDQ,                       // HW_LD (0x1b) physical func 0/1, PALmode only: Ra = phys[Rb+disp12]
  OP_HW_LDQ_VPTE,                             // HW_LD (0x1b) func 5: virtual PTE fetch, access-checked vs KERNEL mode
  OP_HW_LDL_WCHK,                             // HW_LD (0x1b) func 0xa: longword virtual read + write-check (WrChk)
  OP_HW_MTPR,                                 // HW_MTPR (0x1d) side-effect-free IPRs, PALmode only: IPR[fn] = Rb
  OP_HW_MTPR_TERM,                            // HW_MTPR I_CTL (0x11): writes SDE/SPE/VA mode -> terminate, re-dispatch past it
  OP_HW_STL, OP_HW_STQ,                       // HW_ST (0x1f) physical func 0/1, PALmode only: phys[Rb+disp12] = Ra
  OP_JMP,                                     // JMP/JSR/RET (0x1a): Ra = PC+4; PC = Rb & ~3 (computed target)
  OP_HW_RET,                                  // HW_RET (0x1e), PALmode: PC = Rb & ~2 (computed jump, the PAL return)
  OP_CALL_PAL,                                // CALL_PAL (0x00): save R23/exc_addr; PC = pal_base | entry offset
  // Branch-format terminators (contiguous; see is_branch). Conditional on Ra, plus BR/BSR.
  OP_BEQ, OP_BNE, OP_BLT, OP_BLE, OP_BGT, OP_BGE, OP_BLBC, OP_BLBS, OP_BR, OP_BSR,
  OP_FBEQ, OP_FBNE, OP_FBLT, OP_FBLE, OP_FBGT, OP_FBGE // FP branches (0x31-0x37): branch on f[Fa] vs 0.0
};

// True for any branch-format op. Relies on OP_BEQ..OP_FBGE being one contiguous enumerator run.
static inline bool is_branch(SafeOp op) { return op >= OP_BEQ && op <= OP_FBGE; }

// True for the FP branches only (the OP_FBEQ..OP_FBGE tail of the branch run).
static inline bool is_fp_branch(SafeOp op) { return op >= OP_FBEQ && op <= OP_FBGE; }

// True for the plain quadword/longword memory-format stores only.
// NOTE(review): STB/STW/STQ_U, the FP/VAX stores, STx_C and HW_ST are NOT matched here --
// confirm the callers really want just OP_STQ/OP_STL.
static inline bool is_store(SafeOp op) { return op == OP_STQ || op == OP_STL; }

// A terminator ends the block and writes its own next PC (branches + the computed jump).
static inline bool is_terminator(SafeOp op) {
  return op == OP_JMP || op == OP_HW_RET || op == OP_CALL_PAL || op == OP_HW_MTPR_TERM || is_branch(op);
}

// POPCNT isn't baseline x86-64 (pre-2008 CPUs lack it); query the host once. CTLZ/CTTZ use
// baseline BSR/BSF, so only CTPOP is gated -- it stays interpreted when the host lacks POPCNT.
// The answer is cached in a function-local static, so asmjit's CpuInfo is consulted one time.
static bool host_has_popcnt() {
  static const bool ok = asmjit::CpuInfo::host().features().x86().has_popcnt();
  return ok;
}

// Safe = goto-free, register-only operate-format ops (no trap, memory, or branch).
// pal_block enables PALmode-only ops (HW_MFPR): outside PALmode they'd OPCDEC, so only
// compile them when the block is PALmode (the dispatcher keys blocks by PC bit 0).
// NOTE(review): classify() also returns memory, branch and terminator ops, so the first
// sentence above describes a narrower set than the function actually accepts.
SafeOp classify(uint32_t ins, bool pal_block) { uint32_t opcode = ins >> 26; uint32_t func = (ins >> 5) & 0x7F; switch (opcode) { case 0x10: // INTA switch (func) { case 0x20: return OP_ADDQ; case 0x29: return OP_SUBQ; case 0x00: return OP_ADDL; case 0x09: return OP_SUBL; case 0x22: return OP_S4ADDQ; case 0x32: return OP_S8ADDQ; case 0x2b: return OP_S4SUBQ; case 0x3b: return OP_S8SUBQ; case 0x02: return OP_S4ADDL; case 0x12: return OP_S8ADDL; case 0x0b: return OP_S4SUBL; case 0x1b: return OP_S8SUBL; case 0x0f: return OP_CMPBGE; case 0x2d: return OP_CMPEQ; case 0x4d: return OP_CMPLT; case 0x6d: return OP_CMPLE; case 0x1d: return OP_CMPULT; case 0x3d: return OP_CMPULE; // ADDL/V (0x40), SUBL/V (0x49), ADDQ/V (0x60), SUBQ/V (0x69): overflow-trapping -> interpret } break; case 0x11: // INTL switch (func) { case 0x00: return OP_AND; case 0x20: return OP_BIS; case 0x40: return OP_XOR; case 0x08: return OP_BIC; case 0x28: return OP_ORNOT; case 0x48: return OP_EQV; case 0x14: case 0x16: case 0x24: case 0x26: // CMOVLBS/LBC/EQ/NE case 0x44: case 0x46: case 0x64: case 0x66: // CMOVLT/GE/LE/GT return OP_CMOV; case 0x61: // AMASK: Ra must be R31 (else the interp's // DO_AMASK traps OPCDEC) -- compile only the architecturally valid form. 
if (((ins >> 21) & 0x1f) != 31) return OP_NONE; return OP_AMASK; case 0x6c: return OP_IMPLVER; // IMPLVER: Rc = implementation version constant } break; case 0x12: // INTS: shifts + byte-manipulation (extract / insert / mask / zap), Rb&7 keyed switch (func) { case 0x39: return OP_SLL; case 0x34: return OP_SRL; case 0x3c: return OP_SRA; case 0x06: case 0x16: case 0x26: case 0x36: return OP_EXTL; // EXTBL/WL/LL/QL case 0x5a: case 0x6a: case 0x7a: return OP_EXTH; // EXTWH/LH/QH case 0x0b: case 0x1b: case 0x2b: case 0x3b: return OP_INSL; // INSBL/WL/LL/QL case 0x57: case 0x67: case 0x77: return OP_INSH; // INSWH/LH/QH case 0x02: case 0x12: case 0x22: case 0x32: return OP_MSKL; // MSKBL/WL/LL/QL case 0x52: case 0x62: case 0x72: return OP_MSKH; // MSKWH/LH/QH case 0x30: case 0x31: return OP_ZAP; // ZAP / ZAPNOT } break; case 0x13: // INTM (integer multiply). MULL/V (0x40) + MULQ/V (0x60) overflow-trap -> interpret. if (func == 0x20) return OP_MULQ; if (func == 0x00) return OP_MULL; // 32-bit multiply, low 32 sign-extended if (func == 0x30) return OP_UMULH; // high 64 bits of the unsigned 128-bit product break; case 0x14: { // ITFP: compile the int->FP register moves + IEEE SQRT*. Fc==31 is interpreted // (the interp zeroes f[31] per instruction -- a compiled write would desync). const uint32_t f14 = (ins >> 5) & 0x7ff; if ((ins & 0x1f) == 31) return OP_NONE; // Fc==31 -> interp const uint32_t sb = f14 & 0x3f; // IEEE sqrt (source Fb): SQRTS 0x0b / SQRTT 0x2b if (sb == 0x0b || sb == 0x2b) { const uint32_t r = (f14 >> 6) & 3; if (r == 0 || r == 1) return OP_NONE; // /C, /M: SSE rounds nearest -> interpret if (((f14 >> 8) & 7) == 7) return OP_NONE; // /SUI -> interpret return (sb == 0x2b) ? 
OP_SQRTT : OP_SQRTS; } if (((ins >> 16) & 0x1f) != 31) return OP_NONE; // ITOFx: Rb must be 31 (Ra is the int source) if (f14 == 0x004) return OP_ITOFS; if (f14 == 0x014) return OP_ITOFF; if (f14 == 0x024) return OP_ITOFT; return OP_NONE; } case 0x17: { // FLTL: all non-arithmetic (FPCR moves, sign-copies, FCMOVs, CVTLQ/QL) via // jit_fltl; only the /V trap variants of CVTQL stay interpreted. const uint32_t f17 = (ins >> 5) & 0x7ff; const bool ok = f17 == 0x010 || (f17 >= 0x020 && f17 <= 0x022) || f17 == 0x024 || f17 == 0x025 || (f17 >= 0x02a && f17 <= 0x02f) || f17 == 0x030; if (!ok) return OP_NONE; // f31-dest gate (interp zeroes f[31] per instr): MF_FPCR writes f[Fa]; MT_FPCR has no FP dest if (f17 == 0x025) { if (((ins >> 21) & 0x1f) == 31) return OP_NONE; } else if (f17 != 0x024) { if ((ins & 0x1f) == 31) return OP_NONE; } return OP_FLTL; } case 0x15: { // FLTV (VAX): helper-dispatched arith/convert/compare; bail to the interp on trap. if ((ins & 0x1f) == 31) return OP_NONE; // Fc==31: interp zeroes f[31] per instr const uint32_t f = (ins >> 5) & 0x7ff; if (f == 0x0a5 || f == 0x4a5 || f == 0x0a6 || f == 0x4a6 || f == 0x0a7 || f == 0x4a7 // CMPGEQ/LT/LE || f == 0x03c || f == 0x0bc || f == 0x03e || f == 0x0be) return OP_FLTV; // CVTQF/CVTQG if (f & 0x200) return OP_NONE; // invalid qualifier -> interp OPCDECs (UNKNOWN2) switch (f & 0x7f) { // base-op arith / G<->F<->D / G->Q convert case 0x000: case 0x001: case 0x002: case 0x003: // ADDF/SUBF/MULF/DIVF case 0x01e: // CVTDG case 0x020: case 0x021: case 0x022: case 0x023: // ADDG/SUBG/MULG/DIVG case 0x02c: case 0x02d: case 0x02f: // CVTGF/CVTGD/CVTGQ return OP_FLTV; } return OP_NONE; // other 0x15 funcs: interp OPCDECs } case 0x16: { // FLTI (IEEE): SSE-inline the int->float converts; bail to the interp on traps/edges. 
const uint32_t f = (ins >> 5) & 0x7ff; if ((ins & 0x1f) == 31) return OP_NONE; // Fc==31: interp zeroes f[31] per instr (all 0x16) if (f == 0x2ac || f == 0x6ac) return OP_CVTST; // CVTST (S->T): before the invalid gate (SRC bit set); // S-denormal renormalizes / NaN quiets -> bail, else valid-T copy if ((f & 0x3f) == 0x2f) { // CVTTQ (T->Q int): /C chop is valid -> handle before round gate if (((f >> 6) & 3) == 1) return OP_NONE; // /M -> interp if (((f >> 8) & 7) == 7) return OP_NONE; // /SUI -> interp if (((f & 0x600) == 0x200) || ((f & 0x500) == 0x400)) return OP_NONE; // invalid qualifier return OP_CVTTQ; } if (((f & 0x600) == 0x200) || ((f & 0x500) == 0x400)) return OP_NONE; // invalid qualifier -> OPCDEC const uint32_t rnd = (f >> 6) & 3; // 0=/C 1=/M 2=/N 3=/D (dynamic, runtime-checked) if (rnd == 0 || rnd == 1) return OP_NONE; // chopped / minus-inf: SSE rounds to nearest -> interpret if (((f >> 8) & 7) == 7) return OP_NONE; // /SUI: traps on every inexact -> interpret if (f == 0x0a4 || f == 0x5a4) return OP_CMPTUN; // IEEE compares (None + /SU): f[Fc] = (cmp) ? 
2.0 : 0.0 if (f == 0x0a5 || f == 0x5a5) return OP_CMPTEQ; // full-function; a NaN operand bails to the interp if (f == 0x0a6 || f == 0x5a6) return OP_CMPTLT; if (f == 0x0a7 || f == 0x5a7) return OP_CMPTLE; const uint32_t baseop = f & 0x3f; if (baseop == 0x00) return OP_ADDS; // ADDS \ IEEE S-float (single) arith via SSE if (baseop == 0x01) return OP_SUBS; // SUBS > if (baseop == 0x02) return OP_MULS; // MULS | if (baseop == 0x03) return OP_DIVS; // DIVS / if (baseop == 0x20) return OP_ADDT; // ADDT \ IEEE T-float (double) arith via SSE: if (baseop == 0x21) return OP_SUBT; // SUBT > f[Fc] = f[Fa] op f[Fb] if (baseop == 0x22) return OP_MULT; // MULT | if (baseop == 0x23) return OP_DIVT; // DIVT / if (baseop == 0x2c) return OP_CVTTS; // CVTTS (T->S narrow): f[Fc] = round_single(f[Fb]) if (baseop == 0x3e && (f & 0x300) != 0x100) return OP_CVTQT; // CVTQT: f[Fc] = (double)(s64) f[Fb] if (baseop == 0x3c && (f & 0x300) != 0x100) return OP_CVTQS; // CVTQS: f[Fc] = (single)(s64) f[Fb] return OP_NONE; // all other 0x16 funcs interpreted } case 0x1c: // FPTI (CIX/BWX/MVI/FP-moves): the sign-extends, bit-counts, and FP->int register // moves compile here (CTPOP needs host POPCNT). MVI packed media (PERR/MIN/MAX/PK/UNPK) // stays interpreted for now. if (func == 0x00) return OP_SEXTB; // sign-extend byte of op2 if (func == 0x01) return OP_SEXTW; // sign-extend word of op2 if (func == 0x32) return OP_CTLZ; // count leading zeros of op2 (BSR; op2==0 -> 64) if (func == 0x33) return OP_CTTZ; // count trailing zeros of op2 (BSF; op2==0 -> 64) if (func == 0x30 && host_has_popcnt()) return OP_CTPOP; // popcount; interpret when host lacks POPCNT if (func == 0x70 || func == 0x78) { // FTOIT / FTOIS: Rb must be 31; Rc==31 -> interpret if (((ins >> 16) & 0x1f) != 31 || (ins & 0x1f) == 31) return OP_NONE; return (func == 0x70) ? 
OP_FTOIT : OP_FTOIS; } break; case 0x18: // MISC: memory barriers -> mfence (keep MP ordering); prefetch/cache hints -> no-op switch (ins & 0xFFFF) { case 0x0000: case 0x0400: // TRAPB, EXCB case 0x4000: case 0x4400: return OP_MFENCE; // MB, WMB case 0x8000: case 0xA000: case 0xE800: // FETCH, FETCH_M, ECB case 0xF800: case 0xFC00: return OP_NOP; // WH64, WH64EN // RPCC/RC/RS read time-varying / consumed state into Ra; the verify log+replays the read so the // two passes agree. RC/RS also side-effect the soft-interrupt flag, so the Ra==31 (no GPR dest, // flag-only) forms are compiled too -- the helper still runs, the store is skipped. RPCC Ra==31 // is a pure no-op read (nothing to replay), so it stays interpreted. case 0xC000: if (((ins >> 21) & 0x1f) != 31) return OP_RPCC; break; // RPCC (cycle counter) case 0xE000: return OP_RC; // RC (read & clear soft-intr flag); Ra==31 -> clear-only case 0xF000: return OP_RS; // RS (read & set soft-intr flag); Ra==31 -> set-only } break; case 0x00: { // CALL_PAL: compile valid standard funcs (priv 0x00-0x3f, unpriv 0x80-0xbf) const uint32_t fn = ins & 0x1FFFFFFF; if (fn <= 0x3F || (fn >= 0x80 && fn <= 0xBF)) return OP_CALL_PAL; break; // SRM specials (0x1234xx) / invalid ranges -> interpret } case 0x08: return OP_LDA; // load address (Ra = Rb + disp16) -- pure ALU, no memory case 0x09: return OP_LDAH; // load address high (Ra = Rb + (disp16 << 16)) case 0x19: { // HW_MFPR: read IPR -> Ra. PALmode-only (else OPCDEC). // ISUM (fn 0x0d) reads async interrupt lines -- compiled via log/replay. Only the IPRs // jit_hw_mfpr implements compile: an unknown function must reach the interpreter's // UNKNOWN2 (OPCDEC trap), which the helper's keep-Ra default would silently skip. 
if (!pal_block) return OP_NONE; const uint32_t fn = (ins >> 8) & 0xff; const bool known = ((fn & 0xc0) == 0x40) // PCTX group || (fn >= 0x05 && fn <= 0x0d) || fn == 0x0f || fn == 0x10 || fn == 0x11 || fn == 0x14 || fn == 0x16 || fn == 0x27 || fn == 0x2a || fn == 0x2b || fn == 0xc0 || fn == 0xc2 || fn == 0xc3; return known ? OP_HW_MFPR : OP_NONE; } case 0x1b: { // HW_LD: read phys[Rb+disp12] -> Ra. PALmode-only. Compile the // physical forms (func 0/1), the quad VPTE fetch (func 5, kernel-checked -- jit_read_vpte), // and the longword virtual WrChk (func 0xa, jit_read_wchk). Locked + alt forms interpret. if (!pal_block) return OP_NONE; const uint32_t f = (ins >> 12) & 0xf; if (f == 0) return OP_HW_LDL; if (f == 1) return OP_HW_LDQ; if (f == 5) { // Ra==31: nothing to log/replay -> interpret if (((ins >> 21) & 0x1f) == 31) return OP_NONE; return OP_HW_LDQ_VPTE; } if (f == 10) { // func 0xa (HRM TYPE 1012): longword virtual read + WrChk -- jit_read_wchk if (((ins >> 21) & 0x1f) == 31) return OP_NONE; // Ra==31: probe-only, interpret for the fault return OP_HW_LDL_WCHK; } return OP_NONE; } case 0x1d: { // HW_MTPR (PALmode): compile the pure-store IPRs, the TB fills // (idempotent add_tb_i/_d), IER (field stores + check_int kick), the ITB invalidates (idempotent // tbia/tbiap/tbis -> note_itb_invalidate), IC_FLUSH (lazy flush; reclaim deferred off the // compiled frame), I_CTL (terminator: writes SDE/SPE/VA mode), CM/SIRR (mode + soft-int fields, // check_int kick), and the 0x40-7f AST/FPEN/PPCEN stores. DTB invalidates (dpc coherence), ASN // writes, HW_INT_CLR, PAL_BASE, VA_CTL (translation/flush) stay interpreted. if (!pal_block) return OP_NONE; const uint32_t mfn = (ins >> 8) & 0xff; // 0x40-0x7f bitmask group: bit 0 writes ASN (dpc flush + asn-epoch bump) -> must terminate; the // rest (ASTER/ASTRR/PPCEN/FPEN) are pure field stores (+check_int), safe to compile. if ((mfn & 0xc0) == 0x40) return (mfn & 1) ? 
OP_NONE : OP_HW_MTPR; switch (mfn) { case 0x00: case 0x14: case 0x20: case 0x26: // ITB_TAG, PCTR_CTL, DTB_TAG0, DTB_ALTMODE case 0x29: case 0xa0: case 0xc0: // DC_CTL, DTB_TAG1, CC case 0x01: case 0x21: case 0xa1: // ITB_PTE, DTB_PTE0, DTB_PTE1 (TB fills) case 0x02: case 0x03: case 0x04: // ITB_IAP, ITB_IA, ITB_IS (idempotent ITB invalidates) case 0x13: // IC_FLUSH (lazy gen-bump flush; reclaim deferred off-frame) case 0x0a: // IER (interrupt enables + check_int kick) case 0x09: case 0x0b: case 0x0c: // CM, IER_CM, SIRR (mode/soft-int fields + check_int) return OP_HW_MTPR; case 0x11: // I_CTL: changes SDE (shadow remap)/SPE/VA mode -> terminate return OP_HW_MTPR_TERM; case 0x15: case 0x17: case 0x27: // CLR_MAP, SLEEP, MM_STAT (no-ops) case 0x2b: case 0x2c: case 0x2d: // C_DATA, C_SHIFT, M_FIX (no-ops) return OP_NOP; } return OP_NONE; } case 0x1f: { // HW_ST (PALmode): compile physical longword/quadword (func 0/1). // Store-conditional (2/3) does LL/SC, virtual (4/5) and virtual-alt (12/13) translate with // side effects -> interpret. Mirrors HW_LD; the value is Ra, base Rb, displacement 12-bit. if (!pal_block) return OP_NONE; const uint32_t f = (ins >> 12) & 0xf; if (f == 0) return OP_HW_STL; if (f == 1) return OP_HW_STQ; return OP_NONE; } case 0x1a: return OP_JMP; // JMP/JSR/RET: computed jump (target = Rb & ~3). Now compiled -- // chained in-frame via a block-cache lookup (jit_indirect), which // handles varying targets without the old single-slot thrash. case 0x1e: // HW_RET (HWREI): a PAL return, also a simple computed jump (Rb & ~2). return pal_block ? OP_HW_RET : OP_NONE; case 0x28: return OP_LDL; // memory-format loads (Ra = MEM[Rb+disp16]) case 0x29: return OP_LDQ; case 0x0a: return OP_LDBU; // BWX byte/word loads (Ra = zero-extend MEM[Rb+disp16].{b,w}) case 0x0c: return OP_LDWU; case 0x0b: return OP_LDQ_U; // unaligned quad load: Ra = MEM[(Rb+disp16) & ~7] case 0x2a: // LDL_L / LDQ_L: the load-locked half of LL/SC. 
Compile the value- case 0x2b: // returning forms only -- Ra==31 (lock without consuming the value) // leaves the loaded value out of the GPRs, so the verify can't replay it; interpret those. if (((ins >> 21) & 0x1f) == 31) return OP_NONE; return (opcode == 0x2a) ? OP_LDL_L : OP_LDQ_L; case 0x2c: return OP_STL; // memory-format stores (MEM[Rb+disp16] = Ra) case 0x2d: return OP_STQ; case 0x0e: return OP_STB; // BWX byte/word stores (MEM[Rb+disp16].{b,w} = Ra low bits) case 0x0d: return OP_STW; case 0x0f: return OP_STQ_U; // unaligned quad store: MEM[(Rb+disp16) & ~7] = Ra case 0x23: return OP_LDT; // FP loads (f[Fa] = MEM[Rb+disp16]): LDT raw 8B case 0x22: return OP_LDS; // LDS ieee_lds(4B) case 0x27: return OP_STT; // FP stores (MEM[Rb+disp16] = f[Fa]): STT raw 8B case 0x26: return OP_STS; // STS ieee_sts(4B) case 0x20: return OP_LDF; // VAX FP loads: LDF vax_ldf(4B) case 0x21: return OP_LDG; // LDG vax_ldg(8B) case 0x24: return OP_STF; // VAX FP stores: STF vax_stf(4B) case 0x25: return OP_STG; // STG vax_stg(8B) case 0x2e: // STL_C / STQ_C: the store-conditional half of LL/SC. Ra is both the case 0x2f: // value AND the success/fail dest. Compile Ra!=31 (Ra==31 discards the // result into R31, so the verify can't capture it from a GPR); interpret those. if (((ins >> 21) & 0x1f) == 31) return OP_NONE; return (opcode == 0x2e) ? OP_STL_C : OP_STQ_C; // Branch format: opcode | Ra | disp21. Conditional on Ra, plus BR/BSR. 
case 0x30: return OP_BR; case 0x34: return OP_BSR; case 0x38: return OP_BLBC; case 0x39: return OP_BEQ; case 0x3a: return OP_BLT; case 0x3b: return OP_BLE; case 0x3c: return OP_BLBS; case 0x3d: return OP_BNE; case 0x3e: return OP_BGE; case 0x3f: return OP_BGT; case 0x31: return OP_FBEQ; case 0x32: return OP_FBLT; // FP branches: FPSTART, then f[Fa] vs 0.0 case 0x33: return OP_FBLE; case 0x35: return OP_FBNE; case 0x36: return OP_FBGE; case 0x37: return OP_FBGT; } return OP_NONE; } } // namespace // Defined further down; forward-declared so compile_block's punch-list print can use it. static const char* opcode_name(unsigned op); CJitEngine::CJitEngine(int cpu_id) : m_cpu_id(cpu_id), m_recorded(0), m_code_bytes(0), m_rt(nullptr) { memset(m_blocks, 0, sizeof(m_blocks)); // flush() is lazy (gen bump) -- zero the slots here memset(m_traces, 0, sizeof(m_traces)); // trace tier: empty until formation fills slots // Trace tier kill-switch (config_debug.h JIT_TRACES). OFF by default, 1-block traces preempt block // chaining = a net loss; re-enable when fusion closes loops in-trace. 
#ifdef JIT_TRACES m_traces_enabled = true; #else m_traces_enabled = false; #endif m_rt = new asmjit::JitRuntime(); #ifdef JIT_VERIFY m_v_exec = m_v_fail = 0; #endif #ifdef JIT_STATS m_stat_native = m_stat_interp = m_stat_hot = m_stat_miss = 0; m_stat_compiled = m_stat_plen_sum = m_stat_code_bytes = 0; m_stat_wall_last_ns = (uint64_t) std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()).count(); m_tsc_compiled = m_tsc_interp = 0; m_tsc_window_start = jit_rdtsc(); m_bail_link = m_jmp_attempt = m_jmp_hit = 0; m_fresh_cold = m_fresh_tag = m_fresh_asn = m_fresh_phys = m_fresh_hash = 0; m_trace_formed = m_trace_entered = m_trace_exits = m_trace_stale = 0; memset(m_term_op, 0, sizeof(m_term_op)); memset(m_pal_func, 0, sizeof(m_pal_func)); memset(m_mtpr_func, 0, sizeof(m_mtpr_func)); memset(m_hwld_func, 0, sizeof(m_hwld_func)); memset(m_misc_func, 0, sizeof(m_misc_func)); m_first_breaker_logged = false; #endif #ifdef JIT_DISASM char name[64]; snprintf(name, sizeof name, "jit_disasm_cpu%d.txt", m_cpu_id); m_disasm_fp = fopen(name, "w"); if (!m_disasm_fp) fprintf(stderr, "[JIT][CPU%d] could not open %s for the disassembly trace\n", m_cpu_id, name); #endif #ifdef JIT_VERIFY if (cpu_id == 0) trace_selftest(); // M0: validate trace_ok's source-coherence once (SMC/IMB/remap) #endif } CJitEngine::~CJitEngine() { delete (asmjit::JitRuntime*) m_rt; #ifdef JIT_DISASM if (m_disasm_fp) fclose(m_disasm_fp); #endif } // FNV-1a/64 over a block's source words -- record() uses it to revalidate kept code after a flush // instead of recompiling, and to catch self-modifying code (changed bytes -> different hash). 
// Hash n_instr consecutive 32-bit words starting at p, folding each WHOLE word into the state
// with the FNV-1a xor-then-multiply step (64-bit offset basis and prime).
//   p       -- first source byte; no alignment is required.
//   n_instr -- number of 32-bit words to fold (0 returns the offset basis).
// Words are fetched with memcpy rather than through a cast uint32_t pointer, so the read is
// well-defined for any byte buffer; the value obtained is the same host-order word.
static inline uint64_t src_hash(const uint8_t* p, uint32_t n_instr) {
  uint64_t h = 1469598103934665603ULL;
  for (uint32_t i = 0; i < n_instr; ++i) {
    uint32_t word;
    memcpy(&word, p + (size_t) i * sizeof word, sizeof word);
    h ^= word;
    h *= 1099511628211ULL;
  }
  return h;
}

// Look up (or claim) the cache slot for the block starting at virt_pc and return it.
//   virt_pc / phys_pc -- the block's virtual PC and its live physical address.
//   asn / asm_global  -- address-space number, and whether the mapping ignores the ASN.
//   n_instr           -- instruction count the caller observed on this pass.
//   dram              -- base of the byte array that phys_pc indexes.
// Three outcomes, tried in order: (1) the slot is still valid and flush-fresh -> reuse as is;
// (2) the slot kept compiled code whose source still hashes the same -> revalidate and reuse;
// (3) otherwise overwrite the slot as a fresh, not-yet-compiled block.
CJitEngine::JitBlock* CJitEngine::record(uint64_t virt_pc, uint64_t phys_pc, uint32_t asn, bool asm_global,
                                         uint32_t n_instr, const uint8_t* dram) {
  JitBlock& b = m_blocks[index_of(virt_pc)];
  // record() is only reached after the dispatcher validated the live physical, and every returning
  // branch below verifies the code bytes (flush-fresh or hash), so stamp the chain epoch here.
  b.vgen = m_itb_gen + m_flush_gen;
  // Still valid + matching + flush-fresh: nothing flushed us since last seen, so the code can't
  // have changed. (flush_gen-stale must NOT take this no-hash path -- post-IMB needs the hash.)
  if (b.valid && b.flush_gen == m_flush_gen && b.tag == virt_pc && (b.asm_global || b.asn == asn)
      && b.phys == phys_pc) {
    b.n_instr = n_instr;
    return &b;
  }
  // Revalidate: a flush dropped the block but kept the compiled code. If the bytes the prefix was
  // compiled from still hash the same, reuse it instead of recompiling. Hash over hash_len, NOT
  // the caller's n_instr -- interrupt-truncated cold passes make n vary for the same block.
  if (b.code && b.tag == virt_pc && (b.asm_global || b.asn == asn) && b.phys == phys_pc
      && b.src_sum == src_hash(dram + phys_pc, b.hash_len)) {
    b.valid = true;
    b.flush_gen = m_flush_gen;
    b.n_instr = n_instr;
    b.jit_body = (void*) ((uint8_t*) (void*) b.code + b.body_off); // restore chained re-entry
    return &b;
  }
  // New block, page remap, or modified bytes: record fresh and force a recompile.
#ifdef JIT_STATS
  // Why is this a FRESH compile (steps 2+3 both failed)? Categorize the slot's prior occupant in the
  // same order step 3 checks, so we know whether the churn is cache aliasing (tag) -- which more slots
  // fix -- vs same-PC cross-process (asn) -- which needs asn in the index -- vs remap/self-mod/cold.
  if (!b.code) m_fresh_cold++;                              // empty / reclaimed slot (genuine cold or warmup)
  else if (b.tag != virt_pc) m_fresh_tag++;                 // a DIFFERENT block aliases this slot (cache-size lever)
  else if (!(b.asm_global || b.asn == asn)) m_fresh_asn++;  // same PC, different process (needs asn-in-index)
  else if (b.phys != phys_pc) m_fresh_phys++;               // page remap
  else m_fresh_hash++;                                      // source bytes changed (self-modifying code)
#endif
  b.tag = virt_pc;
  b.phys = phys_pc;
  b.asn = asn;
  b.asm_global = asm_global;
  b.n_instr = n_instr;
  b.valid = true;
  b.flush_gen = m_flush_gen;
  b.code = nullptr;
  b.jit_body = nullptr; // not compiled yet -> cached links to us must miss until compile
  for (int i = 0; i < kLinkSlots; ++i) b.link[i] = nullptr; // no cached successors yet
#ifdef JIT_STATS
  b.link_misses = 0;
  b.link_fanout = 0; // instrumentation: reset successor-fanout tracking on (re)use
#endif
  b.prefix_len = 0;
  b.compiled = false;
  b.hot = 0; // fresh block: restart the trace-promotion counter
#ifdef JIT_REGPROF
  b.rp_hits = 0; // fresh block: restart the exec counter (resurrect/revalidate keep theirs)
#endif
  if (++m_recorded == 50000)
    printf("[JIT][CPU%d] block dispatcher active: 50000 blocks discovered.\n", m_cpu_id);
  return &b;
}

// Lazy-flush survivor: the dispatcher calls this on a lookup miss, with the LIVE physical it just
// translated. If the slot matches and its source bytes still hash the same, restamp and return it
// straight to the hot path.
// Returns nullptr when the slot has no compiled code, belongs to another tag/ASN, sits at a
// different physical address, or its source hash no longer matches.
CJitEngine::JitBlock* CJitEngine::revalidate_flushed(uint64_t virt_pc, uint32_t asn, uint64_t phys_pc,
                                                     const uint8_t* dram) {
  JitBlock& b = m_blocks[index_of(virt_pc)];
  // Resurrect BOTH lazy-flush survivors (flush(): valid, flush_gen-stale) AND flush_non_global() drops
  // (valid cleared). The source-hash below is the guard, matching record()'s revalidate path,
  // so don't require the valid flag; requiring it forced every flush_non_global'd block through an
  // interpret pass. tag/asn/phys/hash still guard collisions, cross-process aliasing, page remaps,
  // and self-modifying code.
  if (!(b.code && b.tag == virt_pc && (b.asm_global || b.asn == asn))) return nullptr;
  if (b.phys != phys_pc || b.src_sum != src_hash(dram + phys_pc, b.hash_len)) return nullptr;
  b.valid = true; // flush_non_global() may have cleared it; the hash just re-validated the bytes
  b.flush_gen = m_flush_gen;
  b.vgen = m_itb_gen + m_flush_gen; // phys + code bytes just validated
  b.jit_body = (void*) ((uint8_t*) (void*) b.code + b.body_off);
  return &b;
}

// Trace tier - is a looked-up trace safe to enter? (review: per-segment source validation.)
// head_live_phys is the head's freshly resolved physical. The steps mirror the block dispatcher's phys
// check (jit_run ~line 832) + revalidate_flushed's hash:
// 1. head remap / ASN-recycle: the head's live physical no longer matches what we built from -> stale.
// 2. epoch fresh (nothing flushed/ITB-invalidated since build) -> enter directly.
// 3. epoch changed -> re-hash every fused segment. Unchanged bytes => the epoch bumped for an unrelated
//    reason (e.g. an IMB on another page): keep the trace + re-stamp. Changed bytes (SMC/remap): stale.
// Interior coherence: the source hash here checks BYTES at the cached phys -- it can't see an interior page
// remapped to a DIFFERENT physical with identical bytes. The CALLER closes that gap by re-resolving each
// segment's LIVE physical: trace_segs_live (trace entry) + the recorder's per-successor check (formation).
bool CJitEngine::trace_ok(TraceFragment* t, uint64_t head_live_phys, const uint8_t* dram) { if (t->n_segs == 0 || t->segs[0].phys_pc != head_live_phys) { note_trace_stale(); return false; } // head remap / ASN recycle // A 1-block trace mirrors its head block's compiled prefix, whose length can change with NO source-byte // or epoch change: a fault-truncated cold record shrinks prefix_len (n_instr oscillates), a later clean // record regrows it invisibly to the hash below. If the live head block compiled a different length, // the trace is stale; drop it so the dispatcher re-forms a consistent one. for (uint32_t s = 0; s < t->n_segs; ++s) { // ANY fused block's prefix_len can oscillate with no source/epoch change (not just the head) const JitBlock& sb = m_blocks[index_of(t->segs[s].guest_pc)]; if (sb.valid && sb.tag == t->segs[s].guest_pc && sb.prefix_len != t->segs[s].n_instr) { note_trace_stale(); return false; } } if (t->vgen == m_itb_gen + m_flush_gen) return true; // epoch fresh: nothing changed since build for (uint32_t i = 0; i < t->n_segs; ++i) { const SourceSeg& s = t->segs[i]; if (s.src_sum != src_hash(dram + s.phys_pc, s.n_instr)) { note_trace_stale(); return false; } // a segment's source bytes changed -> stale } t->vgen = m_itb_gen + m_flush_gen; // all segments re-validated: re-stamp the epoch t->flush_gen = m_flush_gen; return true; } #ifdef JIT_VERIFY // unit-test trace_ok's source-coherence decision (SMC/IMB, ITB-remap, head-remap, multi-segment) // WITHOUT real traces or the emitter. Mutates m_itb_gen/m_flush_gen but saves/restores; runs // once at ctor when the engine is fresh (counters 0, no live blocks). 
A FAIL here means the // trace tier's coherence is broken void CJitEngine::trace_selftest() { const uint64_t save_itb = m_itb_gen, save_flush = m_flush_gen; uint32_t mem[8] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444, 0x55555555, 0x66666666, 0x77777777, 0x88888888 }; const uint8_t* d = (const uint8_t*) mem; TraceFragment t = {}; t.valid = true; t.head_tag = 0x2000; t.asn = 1; t.n_segs = 2; t.segs[0] = { 0x2000, 0, 4, false, 1, src_hash(d + 0, 4) }; // words[0..3] at phys 0 t.segs[1] = { 0x2010, 16, 4, false, 1, src_hash(d + 16, 4) }; // words[4..7] at phys 16 t.vgen = m_itb_gen + m_flush_gen; bool ok = true; ok &= ( trace_ok(&t, 0, d) == true ); // 1. fresh: epoch + head-phys match -> enter ok &= ( trace_ok(&t, 999, d) == false ); // 2. head remap / ASN-recycle: live head phys differs -> drop ++m_itb_gen; // 3. ITB-invalidate, bytes unchanged: ok &= ( trace_ok(&t, 0, d) == true ); // epoch bumps, re-hash matches -> keep ... ok &= ( t.vgen == m_itb_gen + m_flush_gen ); // ... and re-stamped to the new epoch mem[5] = 0xDEADBEEF; ++m_flush_gen; // 4. SMC on interior seg1 + IMB (flush bump): ok &= ( trace_ok(&t, 0, d) == false ); // re-hash mismatch -> drop mem[5] = 0x66666666; // 5. restore the byte, epoch still bumped: ok &= ( trace_ok(&t, 0, d) == true ); // re-hash matches again -> keep printf("[JIT][CPU%d] trace_ok self-test (SMC/IMB/ITB-remap/head-remap): %s\n", m_cpu_id, ok ? "PASS" : "*** FAIL ***"); m_itb_gen = save_itb; m_flush_gen = save_flush; } #endif // Free ALL compiled code (delete+new of the runtime; reset()-and-reuse corrupted the JitAllocator // block tree) and drop every slot's now-dangling pointers. Safe only from this CPU's cold path // (never while its compiled code could be executing); runtimes are per-CPU. 
void CJitEngine::reclaim_code() { //printf("[JIT][CPU%d] code reclaim: %llu MB freed\n", m_cpu_id, // (unsigned long long) (m_code_bytes >> 20)); delete (asmjit::JitRuntime*) m_rt; m_rt = new asmjit::JitRuntime(); m_code_bytes = 0; m_reclaim_pending = false; // a reclaim (cold-path or deferred) satisfies any pending request for (int i = 0; i < kCacheEntries; ++i) { m_blocks[i].valid = false; m_blocks[i].code = nullptr; m_blocks[i].jit_body = nullptr; m_blocks[i].compiled = false; } // Traces hold JitFns into the runtime we just deleted -- drop them too, or a post-reclaim trace // dispatch jumps through a freed pointer. trace_lookup keys on valid, so clearing it is enough. for (int i = 0; i < kTraceEntries; ++i) m_traces[i].valid = false; } void CJitEngine::flush() { // LAZY: don't walk 16K slots each time. Bump the generation instead: stale blocks miss in // lookup() and revalidate_flushed() re-hashes their source bytes before they run again. ++m_flush_gen; if (m_rt && m_code_bytes >= kReclaimBytes) m_reclaim_pending = true; // DEFER: reclaim frees all code -- unsafe from a compiled IC_FLUSH; // reclaim_if_pending() does it at the next dispatch boundary. } // ASM-bit-clear icache flush (process/ASN switch): drop only !asm_global blocks. Global (ASM) PAL // blocks are ASN-independent and must survive it. 
The drop is SOFT, revalidate_flushed() re-hashes // and resurrects the compiled code on next use (no recompile, no interpret pass, unless the bytes // actually changed) void CJitEngine::flush_non_global() { if (m_rt && m_code_bytes >= kReclaimBytes) { flush(); return; } for (int i = 0; i < kCacheEntries; ++i) { if (!m_blocks[i].asm_global) { m_blocks[i].valid = false; m_blocks[i].jit_body = nullptr; } } for (int i = 0; i < kTraceEntries; ++i) { // a trace spanning any !asm_global segment depends on a soft-dropped block -> drop it too if (!m_traces[i].valid) continue; for (uint32_t s = 0; s < m_traces[i].n_segs; ++s) if (!m_traces[i].segs[s].asm_global) { m_traces[i].valid = false; note_trace_stale(); break; } } } // ZAP/ZAPNOT byte-expand: g_zapnot_mask[b] has byte i = 0xFF where bit i of b is set (ZAPNOT keeps // those bytes; ZAP keeps the complement). Compiled ZAP indexes this instead of an 8-way bit test. static uint64_t g_zapnot_mask[256]; static const bool g_zapnot_init = [] { for (int b = 0; b < 256; ++b) { uint64_t m = 0; for (int i = 0; i < 8; ++i) if (b & (1 << i)) m |= (uint64_t) 0xff << (i * 8); g_zapnot_mask[b] = m; } return true; }(); #ifdef JIT_REGPROF // Bitmask of the Alpha integer GPRs a block's prefix touches (read or written), for pin selection. // Format-aware over the ops that drive the store-forward chains -- integer operate, memory, branch, // JMP, and the MISC state reads; FP / HW / CALL_PAL are skipped (not the GPR forwarding path). 
// regprof_mask(w, n): decode the n instruction words at w and return a mask with bit r set for each
// integer GPR r (0..30) named by a recognized format; R31 never sets a bit. n == 0 returns 0.
// Opcode 0x1c carve-out: FTOIT/FTOIS (function 0x70/0x78) compute Rc = fmt(f[Fa]), so their Ra
// field is an FP register number, not a GPR -- only the integer destination Rc is counted for them.
static uint32_t regprof_mask(const uint32_t* w, uint32_t n) {
    uint32_t mask = 0;
    auto touch = [&](int r) { if (r != 31) mask |= (1u << r); };  // R31 is hardwired 0, never a pin

    for (uint32_t i = 0; i < n; ++i) {
        const uint32_t ins = w[i];
        const uint32_t op  = ins >> 26;
        const int ra = (ins >> 21) & 0x1f;
        const int rb = (ins >> 16) & 0x1f;
        const int rc = ins & 0x1f;
        const bool islit = ((ins >> 12) & 1) != 0;

        switch (op) {
        case 0x10: case 0x11: case 0x12: case 0x13: case 0x1c: {
            // integer operate: Ra, Rc, Rb(if reg)
            const uint32_t fn = (ins >> 5) & 0x7f;
            const bool ftoi = (op == 0x1c) && (fn == 0x70 || fn == 0x78);
            if (ftoi) {
                touch(rc);  // FTOIx: the source is f[Fa]; the only GPR involved is the dest Rc
                break;
            }
            touch(ra);
            touch(rc);
            if (!islit) touch(rb);
            break;
        }

        case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c:  // LDA/LDAH + BWX ld/st: Ra, Rb(base)
        case 0x0d: case 0x0e: case 0x0f:
        case 0x28: case 0x29: case 0x2a: case 0x2b:             // int LDL/LDQ/LDx_L
        case 0x2c: case 0x2d: case 0x2e: case 0x2f:             // int STL/STQ/STx_C
        case 0x1a:                                              // JMP/JSR/RET: Ra(link), Rb(target)
            touch(ra);
            touch(rb);
            break;

        case 0x30: case 0x34: case 0x38: case 0x39: case 0x3a:  // integer branches (incl BR/BSR link): Ra
        case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f:
            touch(ra);
            break;

        case 0x18: {                                            // MISC: RPCC/RC/RS write Ra
            const uint32_t f = ins & 0xffff;
            if (f == 0xc000 || f == 0xe000 || f == 0xf000) touch(ra);
            break;
        }

        default:
            break;  // FP / HW / CALL_PAL: not the GPR forwarding path
        }
    }
    return mask;
}
#endif

// The 3 guest GPRs kept live in the callee-saved pins r12/r13/r15. compile_block uses the global hot
// set (RA/a0/PV); compile_trace can override with the trace's own hot regs (the M2 regalloc spike).
static const int kGlobalPins[3] = { 26, 16, 27 }; void CJitEngine::emit_op(void* a_ptr, const uint8_t* gpa, void* done_ptr, const HelperSet& hs, bool pal_block, JitBlock* b, uint32_t ins, uint32_t i, RegAlloc& regalloc) { using namespace asmjit; x86::Assembler& a = *(x86::Assembler*)a_ptr; Label& done = *(Label*)done_ptr; // aliases so the moved if-chain references the helper names verbatim: void* read_helper = hs.read_helper; void* write_helper = hs.write_helper; void* opcdec_helper = hs.opcdec_helper; void* hw_mfpr_helper = hs.hw_mfpr_helper; void* hw_ld_helper = hs.hw_ld_helper; void* hw_mtpr_helper = hs.hw_mtpr_helper; void* hw_st_helper = hs.hw_st_helper; void* indirect_helper = hs.indirect_helper; void* read_locked_helper = hs.read_locked_helper; void* stc_helper = hs.stc_helper; void* misc_helper = hs.misc_helper; void* read_vpte_helper = hs.read_vpte_helper; void* read_wchk_helper = hs.read_wchk_helper; void* itof_helper = hs.itof_helper; void* ftoi_helper = hs.ftoi_helper; void* fltl_helper = hs.fltl_helper; void* fp_read_helper = hs.fp_read_helper; void* fp_write_helper = hs.fp_write_helper; void* fltv_helper = hs.fltv_helper; auto aq = [&](int k) { return x86::gpq(gpa[k]); }; auto ad = [&](int k) { return x86::gpd(gpa[k]); }; auto set_pc = [&](uint64_t pc_val) { a.mov(x86::r10, imm(pc_val)); a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); }; auto pin_id = [&](int r) -> int { // r -> its bound host reg id, or -1 = the state.r[] memory slot return regalloc.host_of(r); }; int ra = (ins >> 21) & 0x1F; int rb = (ins >> 16) & 0x1F; int rc = ins & 0x1F; bool islit = ((ins >> 12) & 1) != 0; uint32_t lit = (ins >> 13) & 0xFF; SafeOp op = classify(ins, pal_block); do { // MISC (0x18) barriers/hints: emit an mfence for TRAPB/EXCB/MB/WMB (x86's seq_cst fence, to // preserve the guest's MP memory ordering, matching DO_*'s atomic_thread_fence), nothing for // the prefetch/cache hints -- then keep going so the block extends straight past them. 
if (op == OP_NOP) continue; if (op == OP_MFENCE) { a.mfence(); continue; } // Value-forwarding: rax may still hold the guest reg the previous op computed. Capture that for // op1_rax's reuse, then default-invalidate; only mov_to_reg(_, rax) below re-marks what rax holds. const int prev_rax = regalloc.rax_holds; regalloc.rax_holds = -1; auto reg = [&](int r) { // PALshadow (RREG, AlphaCPU.h): in a PALmode block with SDE set, R4-7 and R20-23 map to // the shadow bank r[r+32]. int idx = (pal_block && ((r & 0xc) == 0x4)) ? r + 32 : r; return x86::qword_ptr(x86::rbx, idx * 8); }; // 32-bit (low-dword) view with the SAME shadow remap, for the 32-bit ALU ops (ADDL/SUBL). auto reg32 = [&](int r) { int idx = (pal_block && ((r & 0xc) == 0x4)) ? r + 32 : r; return x86::dword_ptr(x86::rbx, idx * 8); }; // Pin-aware reg access: route through the pinned x86 reg when r is pinned, else the regs[] // memory slot via reg()/reg32(). Callers handle r==31 (it varies per site: 0, a displacement, // or skip). mov_to_reg writes the slot for any non-pinned r (R31 included, matching the old // unconditional stores -- nothing reads r[31] back). 
auto mov_from_reg = [&](const x86::Gp& dst, int r) { // dst is 64-bit int p = pin_id(r); if (p >= 0) a.mov(dst, x86::gpq((uint32_t)p)); else a.mov(dst, reg(r)); }; auto mov_from_reg32 = [&](const x86::Gp& dst, int r) { // dst is 32-bit int p = pin_id(r); if (p >= 0) a.mov(dst, x86::gpd((uint32_t)p)); else a.mov(dst, reg32(r)); }; auto mov_to_reg = [&](int r, const x86::Gp& src) { // src is 64-bit int p = pin_id(r); if (p >= 0) a.mov(x86::gpq((uint32_t)p), src); else a.mov(reg(r), src); if (src.id() == x86::rax.id() && r != 31) regalloc.rax_holds = r; // rax now mirrors r[r]; forward it }; auto op1_rax = [&]() { if (ra != 31 && prev_rax == ra) return; // value-forward: rax already holds Ra (prev op's result) if (ra == 31) a.xor_(x86::eax, x86::eax); else mov_from_reg(x86::rax, ra); }; auto op2_rcx = [&]() { // operand2 (literal, or r[Rb] with r31=0) if (islit) a.mov(x86::rcx, imm(lit)); else if (rb == 31) a.xor_(x86::ecx, x86::ecx); else mov_from_reg(x86::rcx, rb); }; // op2 as a DIRECT ALU source, folding away the mov-to-rcx: literal imm, R31 -> 0, a pinned host // reg, or the regs[] memory slot -- all valid `OP rax, ` sources. emit_alu2 uses it for the // simple accumulate-into-rax ops (rax stays the dest, so no aliasing concern). auto op2op = [&]() -> Operand { if (islit) return imm(lit); if (rb == 31) return imm(0); int p = pin_id(rb); if (p >= 0) return x86::gpq((uint32_t) p); return reg(rb); }; auto emit_alu2 = [&](uint32_t instId) { op1_rax(); a.emit(instId, x86::rax, op2op()); }; // ABI-native helper call. // Each JitArg names an argument source; the k # source is placed in arg register k (aq/ad). // Non-immediate sources are emitted before immediates so a size/selector immediate can't // overwrite RDX while JA_VA (the only register-sourced argument) still needs it. 
enum JitArgKind { JA_CPU, JA_GP, JA_GPZ, JA_VA, JA_OUT, JA_R10, JA_I32, JA_I64 }; struct JitArg { JitArgKind k; uint64_t v; }; auto emit_call = [&](void* fn, std::initializer_list as) { auto place = [&](int k, const JitArg& s) { switch (s.k) { case JA_CPU: a.mov(aq(k), x86::rbp); break; // cpu case JA_GP: mov_from_reg(aq(k), (int)s.v); break; // r[v] case JA_GPZ: if (s.v == 31) a.xor_(ad(k), ad(k)); // r[v], or 0 if R31 else mov_from_reg(aq(k), (int)s.v); break; case JA_VA: if (aq(k).id() != x86::rdx.id()) a.mov(aq(k), x86::rdx); break; // va: precomputed in RDX case JA_OUT: a.lea(aq(k), x86::qword_ptr(x86::rsp, 32)); break; // &out slot case JA_R10: a.mov(aq(k), x86::r10); break; // value already in R10 case JA_I32: a.mov(ad(k), imm((uint32_t)s.v)); break; case JA_I64: a.mov(aq(k), imm((uint64_t)s.v)); break; } }; int k = 0; for (const JitArg& s : as) { if (s.k != JA_I32 && s.k != JA_I64) place(k, s); ++k; } k = 0; for (const JitArg& s : as) { if (s.k == JA_I32 || s.k == JA_I64) place(k, s); ++k; } a.mov(x86::rax, imm((uint64_t)fn)); a.call(x86::rax); }; // Memory-format loads: va = regs[Rb] + disp16. if (op == OP_LDQ || op == OP_LDL || op == OP_LDBU || op == OP_LDWU || op == OP_LDQ_U) { if (ra == 31) continue; // LDx R31 is a NOP (interpreter skips the read) const int disp = (int)(int16_t)(ins & 0xFFFF); const int size_bits = (op == OP_LDQ || op == OP_LDQ_U) ? 64 : (op == OP_LDL) ? 32 : (op == OP_LDWU) ? 16 : 8; const int amask = (size_bits / 8) - 1; if (rb == 31) a.mov(x86::rdx, imm(disp)); // va -> RDX else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } if (op == OP_LDQ_U) a.and_(x86::rdx, imm(~(uint64_t)7)); // LDQ_U: force 8-byte alignment // Slow path: jit_read(cpu, va, size, &out); on fault bail to `done` returning i // (0..i-1 committed). In JIT_VERIFY builds this is the ONLY path, so the helper's // replay keeps the differential check race-free. 
auto emit_helper = [&]() { emit_call(read_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)size_bits}, {JA_OUT, 0} }); Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting load a.mov(x86::eax, imm(i)); // this iteration: i instrs done a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); // + earlier chained iterations a.jmp(done); a.bind(ok); if (op == OP_LDQ || op == OP_LDQ_U) a.mov(x86::rax, x86::qword_ptr(x86::rsp, 32)); // LDQ/LDQ_U: full quad else if (op == OP_LDL) a.movsxd(x86::rax, x86::dword_ptr(x86::rsp, 32)); else if (op == OP_LDWU) a.movzx(x86::eax, x86::word_ptr(x86::rsp, 32)); // BWX: zero-extend else a.movzx(x86::eax, x86::byte_ptr(x86::rsp, 32)); // LDBU mov_to_reg(ra, x86::rax); }; #ifdef JIT_VERIFY emit_helper(); #else // Inline fast path: aligned + data_page_cache[0][dpc_index(va)] hit + DRAM. Falls to the // helper on misalign / cache miss / MMIO. Mirrors jit_read's data-cache path. RDX = va // (preserved); R11 = slot byte offset = dpc_index(va)*stride; RAX/R10 scratch. 
Label slow = a.new_label(); Label ldone = a.new_label(); a.test(x86::dl, imm(amask)); a.jnz(slow); a.mov(x86::r11, x86::rdx); a.shr(x86::r11, imm(13)); a.and_(x86::r11, imm(m_off.dpc_mask)); // r11 = dpc_index(va) a.imul(x86::r11, x86::r11, imm(m_off.dpc_stride)); // r11 = slot byte offset a.mov(x86::r10, x86::rdx); a.and_(x86::r10, imm(-0x2000)); // r10 = va & ~0x1FFF a.cmp(x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_virt_page), x86::r10); a.jne(slow); a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.state_cm)); // rax = {cm, asn0} (adjacent in state) a.cmp(x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_cm), x86::rax); a.jne(slow); // vs slot {cm, asn} a.mov(x86::r10, x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_host_base)); // r10 = host page base (0 = MMIO/none) a.test(x86::r10, x86::r10); a.jz(slow); // MMIO/straddle: device read via helper a.mov(x86::rax, x86::rdx); a.and_(x86::rax, imm(0x1FFF)); // rax = page offset if (op == OP_LDQ || op == OP_LDQ_U) a.mov(x86::rax, x86::qword_ptr(x86::r10, x86::rax)); // LDQ/LDQ_U: full quad else if (op == OP_LDL) a.movsxd(x86::rax, x86::dword_ptr(x86::r10, x86::rax)); else if (op == OP_LDWU) a.movzx(x86::eax, x86::word_ptr(x86::r10, x86::rax)); // BWX: zero-extend else a.movzx(x86::eax, x86::byte_ptr(x86::r10, x86::rax)); // LDBU mov_to_reg(ra, x86::rax); a.jmp(ldone); a.bind(slow); emit_helper(); a.bind(ldone); #endif continue; } // Memory-format stores: MEM[regs[Rb] + disp16] = regs[Ra]. Inline fast path mirrors the load // path against the WRITE cache [1] (a hit already passed the write-permission + FOW check, so // the page is writable); falls to jit_write on misalign / cache miss / MMIO. In verify the // helper is the only path -- it compares the store against the interpreter's recorded one. if (op == OP_STL || op == OP_STQ || op == OP_STB || op == OP_STW || op == OP_STQ_U) { const int disp = (int)(int16_t)(ins & 0xFFFF); const int size_bits = (op == OP_STQ || op == OP_STQ_U) ? 64 : (op == OP_STL) ? 
32 : (op == OP_STW) ? 16 : 8; const int amask = (size_bits / 8) - 1; if (rb == 31) a.mov(x86::rdx, imm(disp)); // va -> RDX (preserved for helper) else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } if (op == OP_STQ_U) a.and_(x86::rdx, imm(~(uint64_t)7)); // STQ_U: force 8-byte alignment auto emit_helper = [&]() { emit_call(write_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)size_bits}, {JA_GPZ, (uint64_t)ra} }); // jit_write(cpu, va, size, value) Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting store a.mov(x86::eax, imm(i)); // this iteration: i instrs done a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); // + earlier chained iterations a.jmp(done); a.bind(ok); }; #ifdef JIT_VERIFY emit_helper(); #else // Inline fast path: aligned + data_page_cache[1][dpc_index(va)] hit + DRAM. RDX = va // (preserved for the helper); R11 = write-cache slot byte offset; RAX/R10/R9 scratch. Label slow = a.new_label(); Label sdone = a.new_label(); a.test(x86::dl, imm(amask)); a.jnz(slow); a.mov(x86::r11, x86::rdx); a.shr(x86::r11, imm(13)); a.and_(x86::r11, imm(m_off.dpc_mask)); // r11 = dpc_index(va) a.imul(x86::r11, x86::r11, imm(m_off.dpc_stride)); // r11 = slot byte offset a.add(x86::r11, imm(m_off.dpc_write_row)); // -> write cache [1] a.mov(x86::r10, x86::rdx); a.and_(x86::r10, imm(-0x2000)); // r10 = va & ~0x1FFF a.cmp(x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_virt_page), x86::r10); a.jne(slow); a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.state_cm)); // rax = {cm, asn0} (adjacent in state) a.cmp(x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_cm), x86::rax); a.jne(slow); // vs slot {cm, asn} a.mov(x86::r10, x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_host_base)); // r10 = host page base (0 = MMIO/none) a.test(x86::r10, x86::r10); a.jz(slow); // MMIO/straddle: device write via helper a.mov(x86::rax, x86::rdx); a.and_(x86::rax, imm(0x1FFF)); // rax = page offset 
if (ra == 31) a.xor_(x86::r9d, x86::r9d); // value -> R9 (R31 == 0) else mov_from_reg(x86::r9, ra); if (op == OP_STQ || op == OP_STQ_U) a.mov(x86::qword_ptr(x86::r10, x86::rax), x86::r9); // full quad else if (op == OP_STL) a.mov(x86::dword_ptr(x86::r10, x86::rax), x86::r9d); else if (op == OP_STW) a.mov(x86::word_ptr(x86::r10, x86::rax), x86::r9w); else a.mov(x86::byte_ptr(x86::r10, x86::rax), x86::r9b); // STB a.jmp(sdone); a.bind(slow); emit_helper(); a.bind(sdone); #endif continue; } // FP memory (LDS/LDT/STS/STT + VAX LDF/LDG/STF/STG): f[Fa] <-> MEM[Rb+disp16] with FPSTART. // LDT/STT are raw 8B -> inline fast path; the converting forms (S/F/G) go through the helper. Verify uses the // helper only (FP loads replay the logged f-value, FP stores compare via jit_write's store-log). if (op == OP_LDT || op == OP_LDS || op == OP_STT || op == OP_STS || op == OP_LDF || op == OP_LDG || op == OP_STF || op == OP_STG) { const bool isload = (op == OP_LDT || op == OP_LDS || op == OP_LDF || op == OP_LDG); const bool israw = (op == OP_LDT || op == OP_STT); // T-format: no conversion -> inline-able const int fa = ra; // Fa = (ins>>21)&0x1f (FP regs: no shadow remap) // fmt: 0=T raw, 1=S ieee, 2=F vax, 3=G vax. S/F are 32-bit in memory; T/G are 64-bit. const uint32_t fmt = (op == OP_LDS || op == OP_STS) ? 1u : (op == OP_LDF || op == OP_STF) ? 2u : (op == OP_LDG || op == OP_STG) ? 3u : 0u; const int size_bits = (fmt == 1 || fmt == 2) ? 32 : 64; const int disp = (int)(int16_t)(ins & 0xFFFF); const uint32_t descr = (fmt << 16) | (uint32_t)size_bits; // fmt<<16 | size if (isload && fa == 31) continue; // LDT/LDS f31: interp skips the read (NOP) if (rb == 31) a.mov(x86::rdx, imm(disp)); // va -> RDX (preserved for helper) else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } auto emit_helper = [&]() { emit_call(isload ? 
fp_read_helper : fp_write_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)fa}, {JA_I32, (uint64_t)descr} }); // jit_fp_read/write -> 0/1 Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting FP mem op a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); }; #ifdef JIT_VERIFY emit_helper(); #else if (!israw) { emit_helper(); continue; } // LDS/STS: ieee conversion only via the helper // Inline fast path for LDT/STT: FPSTART (fpen gate + exc_sum=0), then the data-cache hit. // misalign / cache miss / MMIO / fpen==0 fall to the helper. R11 = slot byte offset; RAX/R10/R9 scratch. Label slow = a.new_label(); Label fdone = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(slow); // fpen==0 -> FEN trap a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); // exc_sum = 0 a.test(x86::dl, imm(7)); a.jnz(slow); // 8-byte aligned a.mov(x86::r11, x86::rdx); a.shr(x86::r11, imm(13)); a.and_(x86::r11, imm(m_off.dpc_mask)); a.imul(x86::r11, x86::r11, imm(m_off.dpc_stride)); if (!isload) a.add(x86::r11, imm(m_off.dpc_write_row)); // store -> write cache [1] a.mov(x86::r10, x86::rdx); a.and_(x86::r10, imm(-0x2000)); a.cmp(x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_virt_page), x86::r10); a.jne(slow); a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.state_cm)); // rax = {cm, asn0} (adjacent in state) a.cmp(x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_cm), x86::rax); a.jne(slow); // vs slot {cm, asn} a.mov(x86::r10, x86::qword_ptr(x86::rbp, x86::r11, 0, m_off.dpc_host_base)); // r10 = host page base (0 = MMIO/none) a.test(x86::r10, x86::r10); a.jz(slow); // MMIO/straddle -> helper a.mov(x86::rax, x86::rdx); a.and_(x86::rax, imm(0x1FFF)); // rax = page offset if (isload) { // LDT: f[fa] = MEM[phys] a.mov(x86::rax, x86::qword_ptr(x86::r10, x86::rax)); a.mov(x86::qword_ptr(x86::rbp, m_off.f_base + fa * 8), x86::rax); } else { // STT: 
MEM[phys] = f[fa] a.mov(x86::r9, x86::qword_ptr(x86::rbp, m_off.f_base + fa * 8)); a.mov(x86::qword_ptr(x86::r10, x86::rax), x86::r9); } a.jmp(fdone); a.bind(slow); emit_helper(); a.bind(fdone); #endif continue; } // Store-conditional STL_C/STQ_C (0x2e/0x2f): conditionally store Ra; Ra = 1 (success) / 0 (fail). // jit_stc consumes the LL monitor + does the host CAS (production), or compares against the // interpreter's logged outcome (verify). if (op == OP_STL_C || op == OP_STQ_C) { const int disp = (int)(int16_t)(ins & 0xFFFF); const int size_bits = (op == OP_STQ_C) ? 64 : 32; if (rb == 31) a.mov(x86::rdx, imm(disp)); // va -> RDX else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } emit_call(stc_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)size_bits}, {JA_GP, (uint64_t)ra} }); // jit_stc(cpu, va, size, value) Label nobail = a.new_label(); a.test(x86::eax, imm(0x100)); // 0x100 = translation-fault bail a.jz(nobail); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting STx_C a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(nobail); mov_to_reg(ra, x86::rax); // Ra = success(1) / fail(0) continue; } // HW_LD physical (PALmode func 0/1): Ra = phys[Rb + disp12], no translation. jit_read_phys // does the aligned DRAM read (or replays in verify, bails on MMIO so the interpreter does the // ordered device read). disp is 12-bit here, not the 16-bit memory-format displacement. if (op == OP_HW_LDL || op == OP_HW_LDQ || op == OP_HW_LDQ_VPTE || op == OP_HW_LDL_WCHK) { if (ra == 31) continue; // R31 dest discards the read const int disp = (int)((int32_t)(ins << 20) >> 20); // sign-extend 12-bit displacement const int size_bits = (op == OP_HW_LDL || op == OP_HW_LDL_WCHK) ? 
32 : 64; if (rb == 31) a.mov(x86::rdx, imm(disp)); // address (phys, or virtual for VPTE) -> RDX else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } // func 5 -> jit_read_vpte (kernel-checked virtual read); else jit_read_phys emit_call(op == OP_HW_LDQ_VPTE ? read_vpte_helper : op == OP_HW_LDL_WCHK ? read_wchk_helper : hw_ld_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)size_bits}, {JA_OUT, 0} }); Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting HW_LD a.mov(x86::eax, imm(i)); // this iteration: i instrs done a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); // + earlier chained iterations a.jmp(done); a.bind(ok); // HW_LDL SIGN-extends the longword to canonical form (QEMU gen_hw_ld uses MO_LESL; the EV68CB // HRM is silent but the Alpha longword-canonical rule applies, same as LDL). NOTE: the interp's // DO_HW_LDL zero-extends -- that is the bug, fixed in cpu_pal.h to match this. if (op == OP_HW_LDL || op == OP_HW_LDL_WCHK) a.movsxd(x86::rax, x86::dword_ptr(x86::rsp, 32)); else a.mov(x86::rax, x86::qword_ptr(x86::rsp, 32)); // HW_LDQ / VPTE: full quad mov_to_reg(ra, x86::rax); continue; } // Load-locked LDL_L/LDQ_L (0x2a/0x2b): Ra = MEM[Rb + disp16] AND establish the LL/SC exclusive // monitor. if (op == OP_LDL_L || op == OP_LDQ_L) { const int disp = (int)(int16_t)(ins & 0xFFFF); const int size_bits = (op == OP_LDQ_L) ? 
64 : 32; if (rb == 31) a.mov(x86::rdx, imm(disp)); // va -> RDX else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } emit_call(read_locked_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)size_bits}, {JA_OUT, 0} }); // jit_read_locked(cpu, va, size, &out) Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting LDx_L a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); a.mov(x86::rax, x86::qword_ptr(x86::rsp, 32)); // *out already sign-extended by the helper mov_to_reg(ra, x86::rax); continue; } // HW_MTPR (PALmode, side-effect-free IPRs): jit_hw_mtpr(cpu, function, value=Rb) stores an IPR // field directly. The verify pass snapshots+compares those live-state writes. No fault/bail. if (op == OP_HW_MTPR) { const uint32_t function = (ins >> 8) & 0xff; emit_call(hw_mtpr_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)function}, {JA_GPZ, (uint64_t)rb} }); // jit_hw_mtpr(cpu, function, value) continue; } // HW_MTPR I_CTL (0x11), terminator: I_CTL writes SDE (the PALshadow R4-7/R20-23 remap), SPE and // VA mode -- assumptions the reg() remap and the MMU bake in -- so compiled code PAST it would be // wrong. Run the IPR write via the same helper, then end the block: set the next PC and re-dispatch, // so post-I_CTL code recompiles under the new SDE. if (op == OP_HW_MTPR_TERM) { const uint32_t function = (ins >> 8) & 0xff; // 0x11 (I_CTL) emit_call(hw_mtpr_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)function}, {JA_GPZ, (uint64_t)rb} }); set_pc(b->tag + 4 * (uint64_t)(i + 1)); // next PC -> R10 + state.pc (PALmode bit preserved) continue; } // HW_ST physical (PALmode func 0/1): phys[Rb + disp12] = Ra, no translation. jit_write_phys // does the aligned DRAM write (or compares the logged store in verify, bails on MMIO). disp is // 12-bit here, not the 16-bit memory-format displacement. 
if (op == OP_HW_STL || op == OP_HW_STQ) { const int disp = (int)((int32_t)(ins << 20) >> 20); // sign-extend 12-bit displacement const int size_bits = (op == OP_HW_STQ) ? 64 : 32; if (rb == 31) a.mov(x86::rdx, imm(disp)); // phys addr -> RDX else { mov_from_reg(x86::rdx, rb); if (disp) a.add(x86::rdx, imm(disp)); } emit_call(hw_st_helper, { {JA_CPU, 0}, {JA_VA, 0}, {JA_I32, (uint64_t)size_bits}, {JA_GPZ, (uint64_t)ra} }); // jit_write_phys(cpu, phys, size, value) Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // resume at the faulting HW_ST a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); continue; } // Load-address: Ra = Rb + disp16 (LDA) or Rb + (disp16 << 16) (LDAH). Pure register // arithmetic. R31 dest discards the result (a NOP). if (op == OP_LDA || op == OP_LDAH) { if (ra == 31) continue; int64_t d = (int64_t)(int16_t)(ins & 0xFFFF); if (op == OP_LDAH) d <<= 16; if (rb == 31) a.mov(x86::rax, imm(d)); else { mov_from_reg(x86::rax, rb); if (d) a.add(x86::rax, imm(d)); } mov_to_reg(ra, x86::rax); continue; } // HW_MFPR (0x19, PALmode): read the IPR named by (ins>>8)&0xff into Ra. The helper is an // independent reimplementation of DO_HW_MFPR that RETURNS the value (it reads state only, never // writes it). pass the current Ra as `cur` so an unknown IPR returns it unchanged (matching interp), // and write reg(ra) here so the value lands in whichever regs[] array we hold. Every MFPR IPR is // a pure read if (op == OP_HW_MFPR) { if (ra != 31) { // MFPR R31 discards the value (R31 is hardwired 0) emit_call(hw_mfpr_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)ins}, {JA_GP, (uint64_t)ra} }); // -> RAX = IPR value mov_to_reg(ra, x86::rax); // Ra = value (reg() applies the PALshadow remap) } continue; } // MISC state reads RPCC/RC/RS (0x18): Ra = jit_misc(cpu, sel). 
The helper reads the cycle // counter / interrupt flag (clearing or setting it) in production, and replays the interp // pass's value in verify. Dest is Ra (not Rc); classify gated Ra!=31, so a store always happens. if (op == OP_RPCC || op == OP_RC || op == OP_RS) { const int sel = (op == OP_RPCC) ? 0 : (op == OP_RC) ? 1 : 2; emit_call(misc_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)sel} }); // -> RAX = value (replayed in verify); RC/RS also side-effect the flag if (ra != 31) mov_to_reg(ra, x86::rax); // Ra = value (reg() remap); Ra==31 RC/RS = flag side-effect only, discard continue; } // ITOFx (0x14): f[Fc] = fmt(Ra). jit_itof mirrors FPSTART (fpen -> FEN trap = bail, exc_sum=0); // the verify compares the FP file via its snapshot. if (op == OP_ITOFS || op == OP_ITOFF || op == OP_ITOFT) { emit_call(itof_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)rc}, {JA_GPZ, (uint64_t)ra}, {JA_I32, (uint64_t)(op == OP_ITOFS ? 1 : (op == OP_ITOFF) ? 2 : 0)} }); // jit_itof(cpu, fc, value, fmt) Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // FEN trap: resume here in the interpreter a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); continue; } // FTOIx (0x1c): Rc = fmt(f[Fa]). Same FPSTART bail shape; dest is a GPR (verify-compared). if (op == OP_FTOIS || op == OP_FTOIT) { emit_call(ftoi_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)ra}, {JA_I32, (uint64_t)(op == OP_FTOIS ? 1 : 0)}, {JA_OUT, 0} }); // jit_ftoi(cpu, fa, fmt, &out) Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // FEN trap: resume here in the interpreter a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); a.mov(x86::rax, x86::qword_ptr(x86::rsp, 32)); mov_to_reg(rc, x86::rax); // Rc (reg() applies the PALshadow remap) continue; } // CVTQT/CVTQS (0x16): f[Fc] = (double|single)(s64) f[Fb] via SSE. 
Inline the steady-state // common case; bail to the interpreter (it owns rounding/traps/FPCR) on any edge that would // trap or need non-nearest rounding. The verify carries fpcr/exc_sum across its two passes, so // the inline path leaves them untouched -- correctness rests on the bails, not on FPCR upkeep. if (op == OP_CVTQT || op == OP_CVTQS) { const bool dyn = (((ins >> 11) & 3) == 3); // /D: rounding is FPCR<59:58>, checked at runtime Label bail = a.new_label(), fdone = a.new_label(), cont = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); // FPSTART: FP disabled -> FEN trap (interp) a.je(bail); a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); if (dyn) { // dynamic rounding: only nearest is SSE's default a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.fpcr)); a.shr(x86::rax, imm(58)); a.and_(x86::eax, imm(3)); a.cmp(x86::eax, imm(2)); a.jne(bail); // FPCR<59:58> != round-to-nearest -> bail } if (rb == 31) a.xor_(x86::eax, x86::eax); // val = (s64) f[Fb] (Fb==31 -> 0) else a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * rb)); if (op == OP_CVTQT) { a.cvtsi2sd(x86::xmm0, x86::rax); a.cvttsd2si(x86::rcx, x86::xmm0); } else { a.cvtsi2ss(x86::xmm0, x86::rax); a.cvttss2si(x86::rcx, x86::xmm0); } a.cmp(x86::rcx, x86::rax); // round-trip equal -> exact (no inexact, no trap) a.je(fdone); a.bt(x86::qword_ptr(x86::rbp, m_off.fpcr), imm(56)); // inexact: FPCR.INE clear -> first one traps a.jnc(bail); a.bind(fdone); if (op == OP_CVTQS) a.cvtss2sd(x86::xmm0, x86::xmm0); // widen single -> register (T-format) bits a.movq(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * rc), x86::xmm0); a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); // resume this instruction in the interpreter a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // IEEE T-float arithmetic (0x16 ADDT/SUBT/MULT/DIVT): inline SSE on the double f-registers. 
// FPCR.INE already sticky-set (so an inexact result won't trap). Every edge bails to the interpreter: // FP-off, /D-not-nearest, denormal operand, INE-clear (a first inexact would trap), or an Inf/NaN/ // denormal result (overflow/invalid/div-zero/underflow). if (op == OP_ADDT || op == OP_SUBT || op == OP_MULT || op == OP_DIVT) { const bool dyn = (((ins >> 11) & 3) == 3); // /D: rounding is FPCR<59:58>, checked at runtime Label bail = a.new_label(), cont = a.new_label(); auto class_bail = [&](const x86::Vec& x, bool chk_inf_nan) { // bail if denormal (always) / Inf|NaN (if asked) Label ok = a.new_label(); a.movq(x86::rax, x); a.mov(x86::rcx, x86::rax); a.shr(x86::rcx, imm(52)); a.and_(x86::ecx, imm(0x7ff)); // biased exponent if (chk_inf_nan) { a.cmp(x86::ecx, imm(0x7ff)); a.je(bail); } a.test(x86::ecx, x86::ecx); a.jnz(ok); // exp != 0 -> normal (or Inf/NaN) a.shl(x86::rax, imm(12)); a.jnz(bail); // exp == 0, mantissa != 0 -> denormal a.bind(ok); }; a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART: FP disabled -> FEN trap a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); a.bt(x86::qword_ptr(x86::rbp, m_off.fpcr), imm(56)); a.jnc(bail); // INE clear -> first inexact traps if (dyn) { // dynamic rounding: SSE gives nearest a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.fpcr)); a.shr(x86::rax, imm(58)); a.and_(x86::eax, imm(3)); a.cmp(x86::eax, imm(2)); a.jne(bail); } a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)ra)); // f[Fa] a.movq(x86::xmm1, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] class_bail(x86::xmm0, false); // denormal operand -> interp (DNZ/trap) class_bail(x86::xmm1, false); switch (op) { case OP_ADDT: a.addsd(x86::xmm0, x86::xmm1); break; case OP_SUBT: a.subsd(x86::xmm0, x86::xmm1); break; case OP_MULT: a.mulsd(x86::xmm0, x86::xmm1); break; default: a.divsd(x86::xmm0, x86::xmm1); break; // OP_DIVT } class_bail(x86::xmm0, true); // Inf/NaN/denormal result -> interp 
a.movq(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::xmm0); // f[Fc] a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // IEEE T-float compares (0x16 CMPTUN/EQ/LT/LE): f[Fc] = (Fa cmp Fb) ? 2.0 : 0.0. No rounding or // inexact. A NaN operand (INV / quiet-NaN / SWC rules) or a denormal operand (unmaskable denormal // trap -- the interp's ieee_fcmp traps via ieee_unpack) goes to the interp; +/-Inf, zero and // normals compare inline via ucomisd. Two ordered operands make CMPTUN always false. if (op == OP_CMPTUN || op == OP_CMPTEQ || op == OP_CMPTLT || op == OP_CMPTLE) { Label bail = a.new_label(), cont = a.new_label(); auto cmp_bail = [&](const x86::Vec& x) { // bail on NaN or denormal; Inf/zero/normal -> ok Label ok = a.new_label(), special = a.new_label(); a.movq(x86::rax, x); a.mov(x86::rcx, x86::rax); a.shr(x86::rcx, imm(52)); a.and_(x86::ecx, imm(0x7ff)); // biased exponent a.cmp(x86::ecx, imm(0x7ff)); a.je(special); // exp all-ones -> Inf or NaN a.test(x86::ecx, x86::ecx); a.jnz(ok); // exp != 0 -> normal a.bind(special); // exp == 0 (zero/denormal) or all-ones (Inf/NaN) a.shl(x86::rax, imm(12)); a.jnz(bail); // mantissa != 0 -> denormal or NaN -> bail a.bind(ok); // mantissa == 0 -> zero or Inf -> compare inline }; a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)ra)); // f[Fa] a.movq(x86::xmm1, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] cmp_bail(x86::xmm0); cmp_bail(x86::xmm1); if (op == OP_CMPTUN) { a.xor_(x86::eax, x86::eax); // both ordered -> unordered is false } else { a.ucomisd(x86::xmm0, x86::xmm1); if (op == OP_CMPTEQ) a.sete(x86::al); // ZF -> Fa == Fb else if (op == OP_CMPTLT) a.setb(x86::al); // CF -> Fa < Fb else a.setbe(x86::al); 
// CF|ZF -> Fa <= Fb (CMPTLE) a.movzx(x86::eax, x86::al); a.shl(x86::rax, imm(62)); // true -> 0x4000000000000000 (2.0) } a.mov(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::rax); // f[Fc] a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // IEEE convert / sqrt / S-float class-check bails (used by the blocks below). bl is each // block's bail label; bail if denormal (always) or Inf|NaN (when chk). dbl = double, sgl = single. // x is the XMM value to classify; its bits are copied into RAX (EAX for sgl) and the biased // exponent is isolated in RCX/ECX, so both registers are scratch here. fp_dyn_bail uses RAX only. auto dbl_bail = [&](const x86::Vec& x, bool chk, const Label& bl) { Label ok = a.new_label(); a.movq(x86::rax, x); a.mov(x86::rcx, x86::rax); a.shr(x86::rcx, imm(52)); a.and_(x86::ecx, imm(0x7ff)); if (chk) { a.cmp(x86::ecx, imm(0x7ff)); a.je(bl); } a.test(x86::ecx, x86::ecx); a.jnz(ok); a.shl(x86::rax, imm(12)); a.jnz(bl); a.bind(ok); }; auto sgl_bail = [&](const x86::Vec& x, bool chk, const Label& bl) { Label ok = a.new_label(); a.movd(x86::eax, x); a.mov(x86::ecx, x86::eax); a.shr(x86::ecx, imm(23)); a.and_(x86::ecx, imm(0xff)); if (chk) { a.cmp(x86::ecx, imm(0xff)); a.je(bl); } a.test(x86::ecx, x86::ecx); a.jnz(ok); a.shl(x86::eax, imm(9)); a.jnz(bl); a.bind(ok); }; auto fp_dyn_bail = [&](const Label& bl) { // /D: SSE rounds to nearest, so bail unless FPCR does too a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.fpcr)); a.shr(x86::rax, imm(58)); a.and_(x86::eax, imm(3)); a.cmp(x86::eax, imm(2)); a.jne(bl); }; // CVTST (S->T widen): zero/normal/Inf are already valid T bit patterns -> copy (note: the chk=true // class check emitted below also routes Inf to the interpreter). S-denormal // (renormalizes) and NaN (quiets / may INV) go to the interpreter. No rounding or inexact. 
if (op == OP_CVTST) { Label bail = a.new_label(), cont = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] dbl_bail(x86::xmm0, true, bail); // denormal/Inf/NaN -> interp a.movq(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::xmm0); // S bits are valid T a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // CVTTS (T->S narrow): round the double to single, re-widen for the S register format. Denormal // operand, an Inf/NaN/denormal single result (overflow/invalid/underflow), or a first inexact // (INE clear) bail. if (op == OP_CVTTS) { const bool dyn = (((ins >> 11) & 3) == 3); Label bail = a.new_label(), cont = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); a.bt(x86::qword_ptr(x86::rbp, m_off.fpcr), imm(56)); a.jnc(bail); // INE clear -> first inexact traps if (dyn) fp_dyn_bail(bail); a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] (double) dbl_bail(x86::xmm0, false, bail); // denormal operand -> interp a.cvtsd2ss(x86::xmm0, x86::xmm0); // round to single sgl_bail(x86::xmm0, true, bail); // Inf/NaN/denormal single -> interp a.cvtss2sd(x86::xmm0, x86::xmm0); // -> double (register format) a.movq(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::xmm0); a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // CVTTQ (T->Q): double -> signed 64-bit integer (raw bits into f[Fc]). /C chops (cvttsd2si), else // round-to-nearest (cvtsd2si). 
Overflow / Inf / NaN -> the integer indefinite -> interp (covers // IOV). A non-integer source (inexact) with INE clear traps -> interp; exact converts run inline. if (op == OP_CVTTQ) { const bool chop = (((ins >> 11) & 3) == 0); const bool dyn = (((ins >> 11) & 3) == 3); Label bail = a.new_label(), cont = a.new_label(), exact = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); if (dyn) fp_dyn_bail(bail); a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] (double) dbl_bail(x86::xmm0, false, bail); // denormal operand -> interp if (chop) a.cvttsd2si(x86::rax, x86::xmm0); else a.cvtsd2si(x86::rax, x86::xmm0); a.mov(x86::ecx, imm(1)); a.shl(x86::rcx, imm(63)); // rcx = INT64_MIN (indefinite) a.cmp(x86::rax, x86::rcx); a.je(bail); // overflow / Inf / NaN -> interp a.cvtsi2sd(x86::xmm1, x86::rax); // round-trip to test exactness a.ucomisd(x86::xmm1, x86::xmm0); a.je(exact); // round-trip == source -> exact a.bt(x86::qword_ptr(x86::rbp, m_off.fpcr), imm(56)); a.jnc(bail); // inexact + INE clear -> trap a.bind(exact); a.mov(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::rax); // store s64 bits a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // IEEE SQRT (T/S): sqrtsd / sqrtss. Denormal operand, an Inf/NaN/denormal result (a negative // operand yields NaN), or a first inexact (INE clear) bail to the interpreter. 
if (op == OP_SQRTT || op == OP_SQRTS) { const bool dyn = (((ins >> 11) & 3) == 3); Label bail = a.new_label(), cont = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); a.bt(x86::qword_ptr(x86::rbp, m_off.fpcr), imm(56)); a.jnc(bail); // INE clear -> first inexact traps if (dyn) fp_dyn_bail(bail); a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] dbl_bail(x86::xmm0, false, bail); // denormal operand -> interp if (op == OP_SQRTT) { a.sqrtsd(x86::xmm0, x86::xmm0); dbl_bail(x86::xmm0, true, bail); // Inf/NaN(neg)/denormal result -> interp } else { a.cvtsd2ss(x86::xmm0, x86::xmm0); a.sqrtss(x86::xmm0, x86::xmm0); sgl_bail(x86::xmm0, true, bail); // Inf/NaN(neg)/denormal result -> interp a.cvtss2sd(x86::xmm0, x86::xmm0); } a.movq(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::xmm0); a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // IEEE S-float arith (ADDS/SUBS/MULS/DIVS): compute in single precision (narrow the operands, // op, re-widen). Same bail policy as the T-float arith -- FP-off, /D-not-nearest, denormal // operand, INE clear, or an Inf/NaN/denormal single result. 
if (op == OP_ADDS || op == OP_SUBS || op == OP_MULS || op == OP_DIVS) { const bool dyn = (((ins >> 11) & 3) == 3); Label bail = a.new_label(), cont = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); a.je(bail); // FPSTART a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); a.bt(x86::qword_ptr(x86::rbp, m_off.fpcr), imm(56)); a.jnc(bail); // INE clear -> first inexact traps if (dyn) fp_dyn_bail(bail); a.movq(x86::xmm0, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)ra)); // f[Fa] a.movq(x86::xmm1, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rb)); // f[Fb] dbl_bail(x86::xmm0, false, bail); // denormal operands -> interp dbl_bail(x86::xmm1, false, bail); a.cvtsd2ss(x86::xmm0, x86::xmm0); // operands -> single (exact) a.cvtsd2ss(x86::xmm1, x86::xmm1); switch (op) { case OP_ADDS: a.addss(x86::xmm0, x86::xmm1); break; case OP_SUBS: a.subss(x86::xmm0, x86::xmm1); break; case OP_MULS: a.mulss(x86::xmm0, x86::xmm1); break; default: a.divss(x86::xmm0, x86::xmm1); break; // OP_DIVS } sgl_bail(x86::xmm0, true, bail); // Inf/NaN/denormal single result -> interp a.cvtss2sd(x86::xmm0, x86::xmm0); // -> double (register format) a.movq(x86::qword_ptr(x86::rbp, m_off.f_base + 8u * (uint32_t)rc), x86::xmm0); a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } // FLTL non-arithmetic (0x17): all effects in state.f / fpcr via jit_fltl(cpu, ins). if (op == OP_FLTL) { emit_call(fltl_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)ins} }); // jit_fltl(cpu, ins) Label ok = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); set_pc(b->tag + 4 * (uint64_t)i); // FEN trap: resume here in the interpreter a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); continue; } // FLTV VAX arith (0x15): jit_fltv runs the op into f[Fc]. 
Return 0 ok / 1 FPSTART bail (op not run // -> set_pc, interp re-runs) / 2 arith trap (op ran + GO_PAL already set state.pc -> return as-is). if (op == OP_FLTV) { emit_call(fltv_helper, { {JA_CPU, 0}, {JA_I32, (uint64_t)ins} }); // jit_fltv(cpu, ins) Label ok = a.new_label(), trapped = a.new_label(); a.test(x86::eax, x86::eax); a.jz(ok); // 0: no trap -> continue the block a.cmp(x86::eax, imm(2)); a.je(trapped); // 2: arith trap -- GO_PAL already set state.pc set_pc(b->tag + 4 * (uint64_t)i); // 1: FEN trap (op not run) -> resume this instr a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(trapped); // op ran then diverted: count it, keep state.pc a.mov(x86::eax, imm(i + 1)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(ok); continue; } // Computed jump JMP/JSR/RET (0x1a): Ra = PC+4 (return address); PC = Rb & ~3. A // terminator like a branch, but the target is a register -- left in R10 so the epilogue's // cached link validates it (succ->tag == target), chaining calls/returns/dispatch. if (op == OP_JMP) { const uint64_t ret = b->tag + 4 * (uint64_t)(i + 1); if (rb == 31) a.xor_(x86::r10d, x86::r10d); else mov_from_reg(x86::r10, rb); a.and_(x86::r10, imm(~(uint64_t)3)); // target = Rb & ~3 (clear low 2) if (b->tag & 3) a.or_(x86::r10, imm(b->tag & 3)); // DO_JMP: mode bits come from the current pc if (ra != 31) { a.mov(x86::rax, imm(ret & ~(uint64_t)3)); mov_to_reg(ra, x86::rax); } // return addr = PC & ~3 (DO_JMP) a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); // state.pc = target continue; } // HW_RET (HWREI, 0x1e): a PAL return -- a simple computed jump, target = Rb & ~2, no return- // address write. Like OP_JMP, leaves R10 = target for the epilogue's in-frame chain. 
if (op == OP_HW_RET) { if (rb == 31) a.xor_(x86::r10d, x86::r10d); else mov_from_reg(x86::r10, rb); a.and_(x86::r10, imm(~(uint64_t)2)); // target = Rb & ~2 (clear bit 1) a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); // state.pc = target continue; } // CALL_PAL (0x00): vector to the PALcode entry, saving the return address in R23 and the // faulting PC in EXC_ADDR (per ENTER_NATIVE_CALL_PAL). if (op == OP_CALL_PAL) { const uint32_t func = ins & 0x1FFFFFFF; const uint64_t cpc = b->tag + 4 * (uint64_t)i; // CALL_PAL address const uint64_t ret = (b->tag + 4 * (uint64_t)(i + 1)) & ~(uint64_t)2; // return addr (PC & ~2) const uint64_t voff = (uint64_t)0x2000 | ((uint64_t)(func & 0x80) << 5) | ((uint64_t)(func & 0x3f) << 6) | (uint64_t)1; // PAL entry offset Label do_vector = a.new_label(); if (func < 0x40) { // privileged: OPCDEC trap if in user mode (cm != 0) a.cmp(x86::dword_ptr(x86::rbp, m_off.state_cm), imm(0)); a.je(do_vector); emit_call(opcdec_helper, { {JA_CPU, 0}, {JA_I64, cpc} }); // jit_opcdec: sets state.pc/exc_addr, clears lock a.add(x86::qword_ptr(x86::rsp, 40), imm(i + 1)); // count the block; helper already wrote state.pc a.mov(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); // trap path exits (does not chain) } a.bind(do_vector); a.mov(x86::rax, imm(cpc)); // EXC_ADDR = CALL_PAL address a.mov(x86::qword_ptr(x86::rbp, m_off.exc_addr), x86::rax); a.movzx(x86::eax, x86::byte_ptr(x86::rbp, m_off.sde)); // SDE (0/1) a.shl(x86::eax, imm(5)); // * 32 a.add(x86::eax, imm(23)); // R23 index: 23, or 55 if SDE a.mov(x86::rcx, imm(ret)); a.mov(x86::qword_ptr(x86::rbx, x86::rax, 3), x86::rcx); // r[idx] = return address a.mov(x86::r10, x86::qword_ptr(x86::rbp, m_off.pal_base)); a.or_(x86::r10, imm(voff)); // r10 = pal_base | entry offset a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); // state.pc = PAL entry continue; // -> terminator epilogue chains via r10 } // Branch terminators: compute the target into state.pc, then end the block. 
The // branch is at index i, so the PC of the next instruction is b->tag + 4*(i+1). // FP conditional branches (0x31-0x37): FPSTART, then branch on f[Fa] vs 0.0 (sign-magnitude). // Map the bits to a monotonic signed s = sign ? -magnitude : magnitude so the integer signed // cmov conditions apply directly; -0 -> s=0 (matches the interp's zero handling), NaN by bits. if (is_fp_branch(op)) { const int64_t bdisp = (int64_t)((uint64_t)(ins & 0x1FFFFF) << 43) >> 43; // sext disp21 const uint64_t fall = b->tag + 4 * (uint64_t)(i + 1); const uint64_t tgt = fall + (uint64_t)(bdisp * 4); Label bail = a.new_label(), cont = a.new_label(); a.cmp(x86::byte_ptr(x86::rbp, m_off.fpen), imm(0)); // FPSTART: FP disabled -> FEN trap (interp) a.je(bail); a.mov(x86::qword_ptr(x86::rbp, m_off.exc_sum), imm(0)); if (ra == 31) a.xor_(x86::eax, x86::eax); // f[31] = 0 -> s = 0 else { a.mov(x86::rax, x86::qword_ptr(x86::rbp, m_off.f_base + 8u * ra)); a.mov(x86::rcx, x86::rax); a.sar(x86::rcx, imm(63)); // rcx = sign mask (0 or -1) a.btr(x86::rax, imm(63)); // rax = magnitude (sign bit cleared) a.xor_(x86::rax, x86::rcx); a.sub(x86::rax, x86::rcx); // rax = s = sign ? 
-magnitude : magnitude } a.mov(x86::r10, imm(fall)); a.mov(x86::r11, imm(tgt)); a.test(x86::rax, x86::rax); switch (op) { case OP_FBEQ: a.cmovz(x86::r10, x86::r11); break; case OP_FBNE: a.cmovnz(x86::r10, x86::r11); break; case OP_FBLT: a.cmovs(x86::r10, x86::r11); break; case OP_FBGE: a.cmovns(x86::r10, x86::r11); break; case OP_FBLE: a.cmovle(x86::r10, x86::r11); break; case OP_FBGT: a.cmovg(x86::r10, x86::r11); break; default: break; } a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); a.jmp(cont); a.bind(bail); set_pc(b->tag + 4 * (uint64_t)i); // resume this instruction in the interpreter a.mov(x86::eax, imm(i)); a.add(x86::eax, x86::dword_ptr(x86::rsp, 40)); a.jmp(done); a.bind(cont); continue; } if (is_branch(op)) { const int64_t bdisp = (int64_t)((uint64_t)(ins & 0x1FFFFF) << 43) >> 43; // sext disp21 const uint64_t fall = b->tag + 4 * (uint64_t)(i + 1); const uint64_t tgt = fall + (uint64_t)(bdisp * 4); if (op == OP_BR || op == OP_BSR) { // Ra = return address; PC = target if (ra != 31) { a.mov(x86::r10, imm(fall & ~(uint64_t)3)); mov_to_reg(ra, x86::r10); } // link = PC & ~3 (DO_BR) a.mov(x86::r10, imm(tgt)); } else { // conditional: PC = cond ? target : fall if (ra == 31) a.xor_(x86::eax, x86::eax); else mov_from_reg(x86::rax, ra); a.mov(x86::r10, imm(fall)); a.mov(x86::r11, imm(tgt)); if (op == OP_BLBC || op == OP_BLBS) a.test(x86::rax, imm(1)); else a.test(x86::rax, x86::rax); switch (op) { case OP_BEQ: a.cmovz(x86::r10, x86::r11); break; case OP_BNE: a.cmovnz(x86::r10, x86::r11); break; case OP_BLT: a.cmovs(x86::r10, x86::r11); break; case OP_BGE: a.cmovns(x86::r10, x86::r11); break; case OP_BLE: a.cmovle(x86::r10, x86::r11); break; case OP_BGT: a.cmovg(x86::r10, x86::r11); break; case OP_BLBC: a.cmovz(x86::r10, x86::r11); break; case OP_BLBS: a.cmovnz(x86::r10, x86::r11); break; default: break; } } a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); continue; } // INTL conditional moves (CMOVxx): Rc = cond(Ra) ? op2 : Rc. 
Same condition tests as the // matching branches; op2 (Rb or literal) moves into Rc only when the condition holds, so the // current Rc is read and kept otherwise. R31 dest discards the result. if (op == OP_CMOV) { if (rc == 31) continue; const uint32_t f = (ins >> 5) & 0x7f; op1_rax(); // rax = Ra (condition operand) op2_rcx(); // rcx = op2 (Rb or literal -- the moved value) mov_from_reg(x86::r10, rc); // r10 = current Rc (kept when the condition fails) if (f == 0x14 || f == 0x16) a.test(x86::rax, imm(1)); // CMOVLBS/LBC: test the low bit else a.test(x86::rax, x86::rax); // others: test the full value switch (f) { case 0x24: a.cmovz(x86::r10, x86::rcx); break; // CMOVEQ (Ra == 0) case 0x26: a.cmovnz(x86::r10, x86::rcx); break; // CMOVNE (Ra != 0) case 0x44: a.cmovs(x86::r10, x86::rcx); break; // CMOVLT (Ra < 0) case 0x46: a.cmovns(x86::r10, x86::rcx); break; // CMOVGE (Ra >= 0) case 0x64: a.cmovle(x86::r10, x86::rcx); break; // CMOVLE (Ra <= 0) case 0x66: a.cmovg(x86::r10, x86::rcx); break; // CMOVGT (Ra > 0) case 0x14: a.cmovnz(x86::r10, x86::rcx); break; // CMOVLBS (Ra & 1) case 0x16: a.cmovz(x86::r10, x86::rcx); break; // CMOVLBC (!(Ra & 1)) } mov_to_reg(rc, x86::r10); continue; } // INTS (0x12) byte-manipulation: extract/insert/mask/zap. Rc = a shift+mask of Ra keyed on the // byte position pos = Rb&7 (the selector V_2). Pure ALU -> verify-checked by the GPR compare. // RAX = Ra/result, CL = the variable shift, R10/R11 scratch, EDX preserves pos for the H-form // pos==0 edge cases. Mirrors cpu_bwx.h; size (0=B 1=W 2=L 3=Q) and mask come from the function. if (op == OP_EXTL || op == OP_EXTH || op == OP_INSL || op == OP_INSH || op == OP_MSKL || op == OP_MSKH || op == OP_ZAP) { if (rc == 31) continue; const uint32_t f = (ins >> 5) & 0x7f; const int size = (f >> 4) & 3; const uint64_t mask = (size == 0) ? (uint64_t)0xff : (size == 1) ? (uint64_t)0xffff : (size == 2) ? 
(uint64_t)0xffffffff : ~(uint64_t)0; op1_rax(); // rax = Ra (data) op2_rcx(); // rcx = Rb / literal (selector) switch (op) { case OP_EXTL: // (Ra >> pos*8) & mask a.and_(x86::ecx, imm(7)); a.shl(x86::ecx, imm(3)); a.shr(x86::rax, x86::cl); if (size != 3) { a.mov(x86::r10, imm(mask)); a.and_(x86::rax, x86::r10); } break; case OP_EXTH: // (Ra << ((64-pos*8)&63)) & mask a.and_(x86::ecx, imm(7)); a.shl(x86::ecx, imm(3)); a.neg(x86::ecx); a.and_(x86::ecx, imm(63)); a.shl(x86::rax, x86::cl); if (size != 3) { a.mov(x86::r10, imm(mask)); a.and_(x86::rax, x86::r10); } break; case OP_INSL: // (Ra & mask) << pos*8 if (size != 3) { a.mov(x86::r10, imm(mask)); a.and_(x86::rax, x86::r10); } a.and_(x86::ecx, imm(7)); a.shl(x86::ecx, imm(3)); a.shl(x86::rax, x86::cl); break; case OP_INSH: // pos ? ((Ra&mask) >> ((64-pos*8)&63)) : 0 if (size != 3) { a.mov(x86::r10, imm(mask)); a.and_(x86::rax, x86::r10); } a.and_(x86::ecx, imm(7)); a.mov(x86::edx, x86::ecx); a.shl(x86::ecx, imm(3)); a.neg(x86::ecx); a.and_(x86::ecx, imm(63)); a.shr(x86::rax, x86::cl); a.xor_(x86::r11d, x86::r11d); a.test(x86::edx, x86::edx); a.cmovz(x86::rax, x86::r11); break; case OP_MSKL: // Ra & ~(mask << pos*8) a.and_(x86::ecx, imm(7)); a.shl(x86::ecx, imm(3)); a.mov(x86::r10, imm(mask)); a.shl(x86::r10, x86::cl); a.not_(x86::r10); a.and_(x86::rax, x86::r10); break; case OP_MSKH: // pos ? 
(Ra & ~(mask >> ((64-pos*8)&63))) : Ra a.and_(x86::ecx, imm(7)); a.mov(x86::edx, x86::ecx); a.shl(x86::ecx, imm(3)); a.neg(x86::ecx); a.and_(x86::ecx, imm(63)); a.mov(x86::r10, imm(mask)); a.shr(x86::r10, x86::cl); a.not_(x86::r10); a.mov(x86::r11, x86::rax); a.and_(x86::r11, x86::r10); a.test(x86::edx, x86::edx); a.cmovnz(x86::rax, x86::r11); break; case OP_ZAP: // Ra & byte_expand(selector); ZAP inverts a.movzx(x86::ecx, x86::cl); a.mov(x86::r11, imm((uint64_t)&g_zapnot_mask[0])); a.mov(x86::r10, x86::qword_ptr(x86::r11, x86::rcx, 3)); // g_zapnot_mask[selector & 0xff] if (f == 0x30) a.not_(x86::r10); // ZAP keeps bytes whose bit is CLEAR a.and_(x86::rax, x86::r10); break; default: break; } mov_to_reg(rc, x86::rax); continue; } switch (op) { case OP_ADDQ: emit_alu2(x86::Inst::kIdAdd); break; case OP_SUBQ: emit_alu2(x86::Inst::kIdSub); break; case OP_AND: emit_alu2(x86::Inst::kIdAnd); break; case OP_BIS: emit_alu2(x86::Inst::kIdOr); break; case OP_XOR: emit_alu2(x86::Inst::kIdXor); break; case OP_BIC: op1_rax(); op2_rcx(); a.not_(x86::rcx); a.and_(x86::rax, x86::rcx); break; case OP_ORNOT: op1_rax(); op2_rcx(); a.not_(x86::rcx); a.or_(x86::rax, x86::rcx); break; case OP_EQV: op1_rax(); op2_rcx(); a.not_(x86::rcx); a.xor_(x86::rax, x86::rcx); break; case OP_MULQ: op1_rax(); op2_rcx(); a.imul(x86::rax, x86::rcx); break; case OP_UMULH: op1_rax(); op2_rcx(); a.mul(x86::rcx); a.mov(x86::rax, x86::rdx); break; // RDX:RAX = Ra*op2; hi64 = RDX case OP_MULL: // 32-bit multiply, low 32 sign-extended { if (ra == 31) a.xor_(x86::eax, x86::eax); else mov_from_reg32(x86::eax, ra); if (islit) a.mov(x86::ecx, imm(lit)); else if (rb == 31) a.xor_(x86::ecx, x86::ecx); else mov_from_reg32(x86::ecx, rb); a.imul(x86::eax, x86::ecx); // eax = (Ra*op2)[31:0] a.movsxd(x86::rax, x86::eax); break; } case OP_S4ADDQ: op1_rax(); a.shl(x86::rax, imm(2)); op2_rcx(); a.add(x86::rax, x86::rcx); break; case OP_S8ADDQ: op1_rax(); a.shl(x86::rax, imm(3)); op2_rcx(); a.add(x86::rax, x86::rcx); break; 
case OP_S4SUBQ: op1_rax(); a.shl(x86::rax, imm(2)); op2_rcx(); a.sub(x86::rax, x86::rcx); break; case OP_S8SUBQ: op1_rax(); a.shl(x86::rax, imm(3)); op2_rcx(); a.sub(x86::rax, x86::rcx); break; case OP_SLL: op1_rax(); op2_rcx(); a.shl(x86::rax, x86::cl); break; case OP_SRL: op1_rax(); op2_rcx(); a.shr(x86::rax, x86::cl); break; case OP_SRA: op1_rax(); op2_rcx(); a.sar(x86::rax, x86::cl); break; case OP_SEXTB: op2_rcx(); a.movsx(x86::rax, x86::cl); break; // Rc = sign-extend low byte of op2 (V_2) case OP_SEXTW: op2_rcx(); a.movsx(x86::rax, x86::cx); break; // Rc = sign-extend low word of op2 (V_2) case OP_CTPOP: op2_rcx(); a.popcnt(x86::rax, x86::rcx); break; // Rc = popcount(op2); POPCNT(0)==0 == CTPOP(0) case OP_CTLZ: { // Rc = (op2==0) ? 64 : 63 - BSR(op2) op2_rcx(); Label zero = a.new_label(), done = a.new_label(); a.bsr(x86::rax, x86::rcx); // ZF=1 if op2==0; else rax = index of MSB (0..63) a.jz(zero); a.xor_(x86::rax, imm(63)); // 63 - bsr (bsr in [0,63], so 63-n == n^63) a.jmp(done); a.bind(zero); a.mov(x86::eax, imm(64)); a.bind(done); break; } case OP_CTTZ: // Rc = (op2==0) ? 
64 : BSF(op2) op2_rcx(); a.bsf(x86::rax, x86::rcx); // ZF=1 if op2==0; else rax = index of LSB a.mov(x86::r10d, imm(64)); // MOV preserves ZF a.cmovz(x86::rax, x86::r10); // op2==0 -> 64 (BSF leaves rax undefined) break; case OP_AMASK: // Rc = op2 & ~CPU_AMASK -- EV68 feature mask 0x1307 (keep in sync w/ cpu_defs.h); op2_rcx(); // classify enforced Ra==31 (the Ra!=31 form traps OPCDEC in the interpreter) a.mov(x86::rax, imm(~(uint64_t)0x1307)); a.and_(x86::rax, x86::rcx); break; case OP_IMPLVER: // Rc = CPU_IMPLVER (2 = EV6 family; keep in sync w/ cpu_defs.h) a.mov(x86::eax, imm(2)); break; case OP_CMPEQ: op1_rax(); op2_rcx(); a.cmp(x86::rax, x86::rcx); a.sete(x86::al); a.movzx(x86::eax, x86::al); break; case OP_CMPLT: op1_rax(); op2_rcx(); a.cmp(x86::rax, x86::rcx); a.setl(x86::al); a.movzx(x86::eax, x86::al); break; case OP_CMPLE: op1_rax(); op2_rcx(); a.cmp(x86::rax, x86::rcx); a.setle(x86::al); a.movzx(x86::eax, x86::al); break; case OP_CMPULT: op1_rax(); op2_rcx(); a.cmp(x86::rax, x86::rcx); a.setb(x86::al); a.movzx(x86::eax, x86::al); break; case OP_CMPULE: op1_rax(); op2_rcx(); a.cmp(x86::rax, x86::rcx); a.setbe(x86::al); a.movzx(x86::eax, x86::al); break; case OP_CMPBGE: // 8 parallel unsigned byte compares (Ra.byte[i] >= op2.byte[i]) -> bit i; bits 63:8 = 0 op1_rax(); op2_rcx(); a.movq(x86::xmm0, x86::rax); // Ra (8 bytes) a.movq(x86::xmm1, x86::rcx); // op2 (8 bytes; literal in byte 0, 0 elsewhere) a.movdqa(x86::xmm2, x86::xmm0); // copy of Ra a.pmaxub(x86::xmm2, x86::xmm1); // per-byte unsigned max(Ra, op2) a.pcmpeqb(x86::xmm2, x86::xmm0); // 0xff where max == Ra, i.e. 
Ra >= op2 a.pmovmskb(x86::eax, x86::xmm2); // gather byte-sign bits -> low 8 bits = the mask a.movzx(x86::eax, x86::al); // discard the high (zero-padded) lane bits break; case OP_ADDL: case OP_SUBL: // 32-bit, result sign-extended to 64 case OP_S4ADDL: case OP_S8ADDL: case OP_S4SUBL: case OP_S8SUBL: // scaled longword: sext32((Ra*scale) +/- Rb) { const bool issub = (op == OP_SUBL || op == OP_S4SUBL || op == OP_S8SUBL); const int sh = (op == OP_S4ADDL || op == OP_S4SUBL) ? 2 // Ra*4 : (op == OP_S8ADDL || op == OP_S8SUBL) ? 3 // Ra*8 : 0; if (ra == 31) a.xor_(x86::eax, x86::eax); else mov_from_reg32(x86::eax, ra); // shadow-remapped (was a raw rbx read) if (sh) a.shl(x86::eax, imm(sh)); // scale in 32-bit: (Ra<= kReclaimBytes) { reclaim_code(); b->valid = true; // b was just (re)validated by record(); restore it after the wipe } b->compiled = true; uint64_t phys = b->phys; if (b->n_instr == 0 || phys + (uint64_t) b->n_instr * 4 > dram_size) return; const uint32_t* words = (const uint32_t*) (dram + phys); // x86 LE == Alpha LE // PALmode blocks (PC bit 0) remap R4-7/R20-23 to the shadow bank (see RREG); reg() applies it. const bool pal_block = (b->tag & 1) != 0; // Stop at the 8 KB page boundary: past it the next instruction's physical // address need not be phys+4 (the next virtual page maps elsewhere), so words[] // there would be the wrong instructions. (The page-crossing case verify caught.) 
const uint64_t page_end = (phys & ~(uint64_t) 0x1FFF) + 0x2000; uint32_t plen = 0; bool terminator_branch = false; // last instruction is a compiled terminator (sets its own PC) bool terminator_jmp = false; // ...and it's a computed jump (don't chain: targets vary) while (plen < b->n_instr && plen < 64 && (phys + (uint64_t) plen * 4) < page_end) { SafeOp sop = classify(words[plen], pal_block); if (sop == OP_NONE) { // uncompilable op ends the straight-line prefix #ifdef JIT_STATS const uint32_t bop = words[plen] >> 26; m_term_op[bop]++; // tally what cut this block (the coverage gap to chase) if (bop == 0x00) // CALL_PAL: also tally the function code (low 8 bits) m_pal_func[words[plen] & 0xFF]++; else if (bop == 0x1d) // HW_MTPR: tally the IPR index -- which writes break blocks m_mtpr_func[(words[plen] >> 8) & 0xFF]++; else if (bop == 0x1b) // HW_LD: tally the form (phys/virt/lock/vpte/chk, ins[15:12]) m_hwld_func[(words[plen] >> 12) & 0xF]++; else if (bop == 0x18) // MISC: tally the Ra==31 form (ins[15:12]: 0xc RPCC / 0xe RC / 0xf RS) m_misc_func[(words[plen] >> 12) & 0xF]++; // Punch list: one-shot print of the first ACTIONABLE breaker -- skip the opcodes whose // compilable subset is already settled, so it points at the next NEW target rather than a // decided one: 0x00 CALL_PAL (terminator), 0x1b HW_LD / 0x1f HW_ST (physical done; // conditional/virtual forms side-effecting), 0x1d HW_MTPR (pure-store IPRs done; rest // side-effecting), 0x10 INTA + 0x13 INTM (non-trapping ops done; only /V overflow-trap variants left), // 0x18 MISC (barriers/hints + RPCC/RC/RS via log/replay done; only the rare Ra==31 RC/RS forms interpret), // 0x14 ITFP (ITOF* moves done; SQRT* = the deferred FP-math class: FPCR/rounding/traps), // 0x17 FLTL (non-arithmetic done; only CVTQL/V trap variants left). // JMP (0x1a) + HW_RET (0x1e) are now compiled+chained. Stats count all. 
if (!m_first_breaker_logged && bop != 0x00 && bop != 0x1b && bop != 0x1d && bop != 0x1f && bop != 0x10 && bop != 0x18 && bop != 0x13 && bop != 0x14 && bop != 0x17) { m_first_breaker_logged = true; printf("[JIT][PUNCH][CPU%d] first unhandled breaker: %s(0x%02x) ins=%08x at pc=%016llx%s\n", m_cpu_id, opcode_name(bop), bop, words[plen], (unsigned long long) ((b->tag & ~(uint64_t) 1) + (uint64_t) plen * 4), pal_block ? " [PALmode]" : ""); } #endif break; } plen++; if (is_terminator(sop)) { // branch or computed jump ends the block terminator_branch = true; if (sop == OP_JMP || sop == OP_HW_RET) terminator_jmp = true; break; } } if (plen == 0) return; #ifdef JIT_REGPROF b->rp_mask = regprof_mask(words, plen); // GPR-access fingerprint; exec-weighted at report time #endif // Emit uint32_t fn(CAlphaCPU* cpu, uint64_t* regs) (Win64: cpu=RCX, regs=RDX). // Keep cpu in RBP and regs in RBX (callee-saved, so they survive helper calls); // reserve a 40-byte frame (32 shadow + 8 load-out slot) that keeps RSP 16-aligned // for calls. RAX = op1/result, RCX = operand2 (CL for variable shifts). CodeHolder code; if (code.init(((JitRuntime*) m_rt)->environment()) != Error::kOk) return; #ifdef JIT_DISASM // Dev: capture this block's disassembly, validate each emitted instruction, and trap any // emit failure (dumped + bailed near rt->add() below). Logging formats every instruction. StringLogger logger; code.set_logger(&logger); JitErrorHandler eh; eh.cpu_id = m_cpu_id; eh.fp = m_disasm_fp; code.set_error_handler(&eh); #endif x86::Assembler a(&code); #ifdef JIT_DISASM a.add_diagnostic_options(DiagnosticOptions::kValidateAssembler); #endif // Host integer-argument registers: // Win64 {rcx,rdx,r8,r9}; System V {rdi,rsi,rdx,rcx}. aq(i)/ad(i) = the i argument as a // 64-/32-bit register; emit_call (below) marshals into them, replacing out usage of // #ifdef _WIN32. Helpers limited to <= 4 integer arguments. 
CallConv cc; (void) cc.init(CallConvId::kCDecl, ((JitRuntime*) m_rt)->environment()); const uint8_t* gpa = cc.passed_order(RegGroup::kGp); auto aq = [&](int i) { return x86::gpq(gpa[i]); }; auto ad = [&](int i) { return x86::gpd(gpa[i]); }; // cpu (RBP), regs (RBX), the chain counter (R14) and the basic Alpha-GPR pins (R12/R13/R15) // all hold live values across helper calls, so they must be callee-saved under the host ABI. // Verified at dev time (compiled out in NDEBUG builds). [[maybe_unused]] const uint32_t kPinnedGp = (1u << x86::rbp.id()) | (1u << x86::rbx.id()) | (1u << x86::r14.id()) | (1u << x86::r12.id()) | (1u << x86::r13.id()) | (1u << x86::r15.id()); assert((((uint32_t) cc.preserved_regs(RegGroup::kGp)) & kPinnedGp) == kPinnedGp); a.push(x86::rbx); a.push(x86::rbp); a.push(x86::r14); // callee-saved: now a pin for Alpha R30 (SP); chain count moved to [rsp+40] a.push(x86::r12); // callee-saved: pin for Alpha R26 (RA) a.push(x86::r13); // callee-saved: pin for Alpha R16 (a0) a.push(x86::r15); // callee-saved: pin for Alpha R27 (PV) #ifdef _WIN32 a.push(x86::rsi); // callee-saved on Win64: pin for Alpha R29 (GP) a.push(x86::rdi); // callee-saved on Win64: pin for Alpha R0 (v0) #endif a.sub(x86::rsp, imm(56)); // 32 shadow + out slot + chain-count slot [rsp+40]; 6/8 pushes -> 56 keeps RSP 16-aligned a.mov(x86::rbp, aq(0)); // cpu (arg 0) a.mov(x86::rbx, aq(1)); // regs (arg 1) a.mov(x86::qword_ptr(x86::rsp, 40), imm(0)); // chain instruction count := 0 (reclaimed r14 -> stack slot) // Load the global pins from regs[] on cold entry. Chained re-entry jumps to `body` below, // skipping this -- the pins stay live in x86 across the whole chain, synced back at `done`. 
a.mov(x86::r12, x86::qword_ptr(x86::rbx, 26 * 8)); // R26 (RA) a.mov(x86::r13, x86::qword_ptr(x86::rbx, 16 * 8)); // R16 (a0) a.mov(x86::r15, x86::qword_ptr(x86::rbx, 27 * 8)); // R27 (PV) a.mov(x86::r14, x86::qword_ptr(x86::rbx, 30 * 8)); // R30 (SP) -- reclaimed r14 #ifdef _WIN32 a.mov(x86::rsi, x86::qword_ptr(x86::rbx, 29 * 8)); // R29 (GP) a.mov(x86::rdi, x86::qword_ptr(x86::rbx, 0 * 8)); // R0 (v0) #endif Label done = a.new_label(); // shared exit: restore frame + ret (EAX preset by caller) Label body = a.new_label(); // chained re-entry (after the prologue; preserves R14) a.bind(body); const size_t body_off = code.code_size(); // byte offset of the chained entry from fn #ifdef JIT_REGPROF a.mov(x86::rax, imm((uint64_t) &b->rp_hits)); // REGPROF: count every execution (cold entry + chained re-entry) a.inc(x86::qword_ptr(x86::rax)); // RAX is dead at body entry -- the first op reloads it #endif // The compiled block computes its own next PC into state.pc at every exit (the // foundation for branch compilation and block linking). R10 is scratch here. auto set_pc = [&](uint64_t pc_val) { a.mov(x86::r10, imm(pc_val)); a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); }; // Block register allocator: the 3 global pins (R26/R16/R27 -> r12/r13/r15, callee-saved, live across the // chain) are the static binding today. Dynamic next. 
RegAlloc ra; for (int r = 0; r < 32; ++r) ra.host[r] = -1; ra.rax_holds = -1; ra.host[kGlobalPins[0]] = (int) x86::r12.id(); ra.host[kGlobalPins[1]] = (int) x86::r13.id(); ra.host[kGlobalPins[2]] = (int) x86::r15.id(); ra.host[30] = (int) x86::r14.id(); // SP (reclaimed r14), all platforms #ifdef _WIN32 ra.host[29] = (int) x86::rsi.id(); // GP (Win64) ra.host[0] = (int) x86::rdi.id(); // v0 (Win64) #endif const HelperSet hs = { read_helper, write_helper, opcdec_helper, hw_mfpr_helper, hw_ld_helper, hw_mtpr_helper, hw_st_helper, indirect_helper, read_locked_helper, stc_helper, misc_helper, read_vpte_helper, read_wchk_helper, itof_helper, ftoi_helper, fltl_helper, fp_read_helper, fp_write_helper, fltv_helper }; for (uint32_t i = 0; i < plen; ++i) emit_op(&a, gpa, &done, hs, pal_block, b, words[i], i, ra); // Epilogue. Count this block's instructions, then chain into the next block (staying // in native code) or return to the dispatcher. #ifndef JIT_VERIFY // Gate the chain: stop if we've hit the budget ceiling or an interrupt/timer is pending auto emit_gate = [&](Label& lbl) { a.mov(x86::rax, x86::qword_ptr(x86::rsp, 40)); a.cmp(x86::rax, x86::qword_ptr(x86::rbp, m_off.jit_budget)); a.jge(lbl); a.cmp(x86::byte_ptr(x86::rbp, m_off.check_int), imm(0)); a.jne(lbl); a.cmp(x86::byte_ptr(x86::rbp, m_off.check_timers), imm(0)); a.jne(lbl); }; // Field offsets within a JitBlock, so the epilogue can validate a cached successor via // [succ + off]. 
const uint32_t off_body = (uint32_t) ((char*) &b->jit_body - (char*) b); const uint32_t off_tag = (uint32_t) ((char*) &b->tag - (char*) b); const uint32_t off_vgen = (uint32_t) ((char*) &b->vgen - (char*) b); // Cached direct link: tail straight into our cached successor's body if it's still live // -- compiled (jit_body != 0, cleared on flush/recompile) AND still maps this exit's PC // (tag == R10) AND, for native targets, phys-validated under the current ITB generation // (gen-stale -> dispatcher, which rechecks phys and re-stamps). Otherwise record a patch // request (m_link_from = this block) and fall back. R10 = next PC; clobbers RAX/RCX/RDX. auto emit_chain = [&](Label& lbl) { Label miss = a.new_label(); // PAL/SDE guard once -- depends on the target (R10) + SDE, not the slot: a PALmode target's shadow // remap assumes SDE, so if R10 is PAL and !SDE no cached slot may chain in. { Label ok = a.new_label(); a.test(x86::r10, imm(1)); a.jz(ok); a.cmp(x86::byte_ptr(x86::rbp, m_off.sde), imm(0)); a.je(miss); // PALmode + !SDE: don't a.bind(ok); } // Epoch sum once, reused by every slot: vgen = itb_gen + flush_gen at last validation; both counters // are monotonic, so one sum compare catches a remap OR a flush since then. const uint32_t fg_off = (uint32_t) ((char*) &m_flush_gen - (char*) &m_itb_gen); a.mov(x86::rdx, imm((uint64_t) &m_itb_gen)); a.mov(x86::r11, x86::qword_ptr(x86::rdx)); a.add(x86::r11, x86::qword_ptr(x86::rdx, fg_off)); // r11 = current epoch sum // Poly-link: walk the cached direct successors; the first LIVE one mapping this exit (tag == R10) tails // in. A 2-successor block keeps both cached, so an alternating successor stops thrashing the dispatcher. for (int sl = 0; sl < kLinkSlots; ++sl) { Label nxt = (sl + 1 < kLinkSlots) ? 
a.new_label() : miss; a.mov(x86::rax, imm((uint64_t) &b->link[sl])); a.mov(x86::rax, x86::qword_ptr(x86::rax)); // succ = b->link[sl] a.test(x86::rax, x86::rax); a.jz(nxt); a.mov(x86::rcx, x86::qword_ptr(x86::rax, off_body)); // succ->jit_body (cleared on flush) a.test(x86::rcx, x86::rcx); a.jz(nxt); a.mov(x86::rdx, x86::qword_ptr(x86::rax, off_tag)); // succ->tag a.cmp(x86::rdx, x86::r10); a.jne(nxt); // not this exit's target a.cmp(x86::qword_ptr(x86::rax, off_vgen), x86::r11); a.jne(nxt); // stale: revalidate via dispatcher a.jmp(x86::rcx); // HIT: tail in (shared frame) if (sl + 1 < kLinkSlots) a.bind(nxt); } a.bind(miss); a.mov(x86::rax, imm((uint64_t) b)); a.mov(x86::qword_ptr(x86::rbp, m_off.link_from), x86::rax); // request a successor-cache patch // fall through to lbl (return to dispatcher) }; #endif if (terminator_jmp) { // Computed jump (JMP / HW_RET): R10 holds the register target (already written to state.pc). // Chain in-frame via jit_indirect -- the dispatcher's own block-cache lookup -- tailing into // the target's compiled body when it's live + runnable here. Unlike the old single-slot link // this keys on the ACTUAL target, so it handles all targets with no thrash on varying jumps. 
a.add(x86::qword_ptr(x86::rsp, 40), imm(plen)); #ifndef JIT_VERIFY Label exit_chain = a.new_label(); emit_gate(exit_chain); // budget/interrupt: bail to dispatcher a.mov(aq(0), x86::rbp); // cpu (arg 0) a.mov(aq(1), x86::r10); // target (arg 1) == state.pc a.mov(x86::rax, imm((uint64_t) indirect_helper)); a.call(x86::rax); // jit_indirect(cpu, target) -> body | 0 a.test(x86::rax, x86::rax); a.jz(exit_chain); a.jmp(x86::rax); // HIT: tail into the target's body a.bind(exit_chain); #endif } else if (terminator_branch) { a.add(x86::qword_ptr(x86::rsp, 40), imm(plen)); // R10 still holds the next PC (branch wrote state.pc + R10) #ifndef JIT_VERIFY Label exit_chain = a.new_label(); Label not_self = a.new_label(); emit_gate(exit_chain); // Self-loop fast path: a taken branch back to our own start (r10 == b->tag) jumps // straight into the body, skipping the resolver call entirely. a.mov(x86::rax, imm(b->tag)); // tag may exceed imm32 a.cmp(x86::r10, x86::rax); a.jne(not_self); a.jmp(body); a.bind(not_self); emit_chain(exit_chain); a.bind(exit_chain); #endif } else { set_pc(b->tag + 4 * (uint64_t) plen); // straight-line fall-through to the next block a.add(x86::qword_ptr(x86::rsp, 40), imm(plen)); #ifndef JIT_VERIFY Label exit_chain = a.new_label(); emit_gate(exit_chain); emit_chain(exit_chain); a.bind(exit_chain); #endif } a.mov(x86::eax, x86::dword_ptr(x86::rsp, 40)); // total instructions completed across the chain a.bind(done); // bail jumps here with EAX already set // Sync the pins back to regs[] -- rbx still = regs (restored last), and every dispatcher exit // (fall-through or mid-block bail) reaches here, so regs[] is live when we return. 
a.mov(x86::qword_ptr(x86::rbx, 26 * 8), x86::r12); // R26 (RA) a.mov(x86::qword_ptr(x86::rbx, 16 * 8), x86::r13); // R16 (a0) a.mov(x86::qword_ptr(x86::rbx, 27 * 8), x86::r15); // R27 (PV) a.mov(x86::qword_ptr(x86::rbx, 30 * 8), x86::r14); // R30 (SP) #ifdef _WIN32 a.mov(x86::qword_ptr(x86::rbx, 29 * 8), x86::rsi); // R29 (GP) a.mov(x86::qword_ptr(x86::rbx, 0 * 8), x86::rdi); // R0 (v0) #endif a.add(x86::rsp, imm(56)); #ifdef _WIN32 a.pop(x86::rdi); // Win64 pins pop first (reverse push order) a.pop(x86::rsi); #endif a.pop(x86::r15); // pins pop in reverse push order a.pop(x86::r13); a.pop(x86::r12); a.pop(x86::r14); a.pop(x86::rbp); a.pop(x86::rbx); a.ret(); const size_t csz = code.code_size(); JitFn fn = nullptr; #ifdef JIT_DISASM { FILE* out = m_disasm_fp ? m_disasm_fp : stderr; fprintf(out, "[JIT][CPU%d] block @ %016llx%s (%u instr, %llu bytes)\n%s\n", m_cpu_id, (unsigned long long) (b->tag & ~(uint64_t) 1), (b->tag & 1) ? " PAL" : "", plen, (unsigned long long) csz, logger.data()); fflush(out); // per-block flush: preserve the trace if JIT'd code later crashes } if (eh.failed) return; // emit error already reported -- don't ship a broken block #endif if (((JitRuntime*) m_rt)->add(&fn, &code) != Error::kOk) return; b->code = fn; b->jit_body = (void*) ((uint8_t*) (void*) fn + body_off); // chained re-entry (past prologue) b->body_off = (uint32_t) body_off; // to restore jit_body on revalidate b->src_sum = src_hash(dram + phys, b->n_instr); // source fingerprint (revalidate vs self-mod) b->hash_len = b->n_instr; // freeze the hash extent (n_instr drifts) b->prefix_len = plen; m_code_bytes += csz; // track for the reclaim threshold (see flush()) #ifdef JIT_STATS m_stat_compiled++; m_stat_plen_sum += plen; m_stat_code_bytes += csz; #endif #ifdef JIT_REGPROF b->rp_csz = (uint32_t) csz; // exec-weighted expansion: sum(rp_hits*rp_csz) / sum(rp_hits*prefix_len) #endif } // Compile an N-block trace. 
Reuses the shared emit_op for each block's per-op codegen (so the body // is the exact one the block path already verifies). Blocks are fused with a GUARD between them: after a // block's terminator (which left R10 = its next PC), check R10 == the next fused block's tag; on a hit fall // through in-trace, on a miss SIDE-EXIT to the dispatcher at the real next PC. n_blocks==1 = the single- // block trace. Fills the caller-provided slot t (code + per-segment source descriptors + coherence epoch). void CJitEngine::compile_trace(TraceFragment* t, JitBlock** blocks, uint32_t n_blocks, const uint8_t* dram, uint64_t dram_size, const HelperSet& hs) { using namespace asmjit; if (n_blocks == 0 || n_blocks > kMaxTraceSegs) return; // Validate every segment up front: a compiled prefix that fits in DRAM. (prefix_len, NOT n_instr -- // ops past the prefix never passed the safe-to-compile scan; emitting them runs code PAST the terminator.) for (uint32_t bi = 0; bi < n_blocks; ++bi) { const JitBlock* b = blocks[bi]; if (b->prefix_len == 0 || b->phys + (uint64_t) b->prefix_len * 4 > dram_size) return; } CodeHolder code; if (code.init(((JitRuntime*) m_rt)->environment()) != Error::kOk) return; x86::Assembler a(&code); CallConv cc; (void) cc.init(CallConvId::kCDecl, ((JitRuntime*) m_rt)->environment()); const uint8_t* gpa = cc.passed_order(RegGroup::kGp); a.push(x86::rbx); a.push(x86::rbp); a.push(x86::r14); a.push(x86::r12); a.push(x86::r13); a.push(x86::r15); #ifdef _WIN32 a.push(x86::rsi); a.push(x86::rdi); #endif a.sub(x86::rsp, imm(56)); a.mov(x86::rbp, x86::gpq(gpa[0])); // cpu a.mov(x86::rbx, x86::gpq(gpa[1])); // regs a.mov(x86::qword_ptr(x86::rsp, 40), imm(0)); // chain count := 0 (reclaimed r14 -> stack slot) a.mov(x86::r12, x86::qword_ptr(x86::rbx, 26 * 8)); // R26 (RA) pin a.mov(x86::r13, x86::qword_ptr(x86::rbx, 16 * 8)); // R16 (a0) pin a.mov(x86::r15, x86::qword_ptr(x86::rbx, 27 * 8)); // R27 (PV) pin a.mov(x86::r14, x86::qword_ptr(x86::rbx, 30 * 8)); // R30 (SP) pin 
#ifdef _WIN32 a.mov(x86::rsi, x86::qword_ptr(x86::rbx, 29 * 8)); // R29 (GP) pin a.mov(x86::rdi, x86::qword_ptr(x86::rbx, 0 * 8)); // R0 (v0) pin #endif Label done = a.new_label(); // shared side-exit/return: EAX preset to the instr count, state.pc live Label body = a.new_label(); // loop re-entry (after the prologue; pins + count stay live across iterations) a.bind(body); // Block register allocator: the 3 global pins (static, live across the trace). dynamic pool in future. RegAlloc ra; for (int r = 0; r < 32; ++r) ra.host[r] = -1; ra.rax_holds = -1; ra.host[kGlobalPins[0]] = (int) x86::r12.id(); ra.host[kGlobalPins[1]] = (int) x86::r13.id(); ra.host[kGlobalPins[2]] = (int) x86::r15.id(); ra.host[30] = (int) x86::r14.id(); // SP (reclaimed r14), all platforms #ifdef _WIN32 ra.host[29] = (int) x86::rsi.id(); // GP (Win64) ra.host[0] = (int) x86::rdi.id(); // v0 (Win64) #endif for (uint32_t bi = 0; bi < n_blocks; ++bi) { JitBlock* b = blocks[bi]; const uint32_t plen = b->prefix_len; const uint32_t* words = (const uint32_t*) (dram + b->phys); const bool pal_block = (b->tag & 1) != 0; // Default R10 + state.pc = this block's sequential next (the fall-through exit). emit_op's branch/jump // terminator overwrites both with its target; a fault bail writes the fault PC. For an intermediate // block this also makes the guard below see R10 == the sequential successor when it falls through. // (note: no compiled op currently READS state.pc mid-block, so default-before-emit is equivalent.) a.mov(x86::r10, imm(b->tag + 4 * (uint64_t) plen)); a.mov(x86::qword_ptr(x86::rbp, m_off.state_pc), x86::r10); for (uint32_t i = 0; i < plen; ++i) emit_op(&a, gpa, &done, hs, pal_block, b, words[i], i, ra); a.add(x86::qword_ptr(x86::rsp, 40), imm(plen)); // count this block a.mov(x86::eax, x86::dword_ptr(x86::rsp, 40)); // EAX = instrs completed so far (preset for `done` -- a side-exit or return) if (bi + 1 < n_blocks) { // Guard: did this block actually flow to the next fused block? 
R10 = its next PC; a mismatch means // the path diverged from what we fused -> side-exit to the dispatcher at the real next PC (state.pc). a.mov(x86::rcx, imm(blocks[bi + 1]->tag)); // 64-bit tag may exceed imm32; rcx scratch (not EAX) a.cmp(x86::r10, x86::rcx); a.jne(done); } } #ifndef JIT_VERIFY // Loop closure: if the last block branches back to the trace head, close the loop in compiled code with // the budget/interrupt gate ON the back-edge (risk #1). Verify builds omit this -> the trace runs one // iteration and exits at state.pc, so the side-exit verify validates the body unchanged. { JitBlock* lb = blocks[n_blocks - 1]; const uint32_t* lw = (const uint32_t*) (dram + lb->phys); const uint32_t lop = lw[lb->prefix_len - 1], lopc = lop >> 26; if (lopc == 0x30 || lopc == 0x34 || (lopc >= 0x38 && lopc <= 0x3f)) { // PC-relative branch terminator const int64_t disp = (int64_t) ((uint64_t) (lop & 0x1FFFFF) << 43) >> 43; const uint64_t tgt = (((lb->tag & ~(uint64_t) 1) + 4 * (uint64_t) (lb->prefix_len - 1)) + 4 + (uint64_t) (disp * 4)) | (lb->tag & 1); if (tgt == blocks[0]->tag) { // taken target == head -> a closable loop a.mov(x86::rcx, imm(blocks[0]->tag)); a.cmp(x86::r10, x86::rcx); a.jne(done); // not looping now -> exit a.mov(x86::rax, x86::qword_ptr(x86::rsp, 40)); a.cmp(x86::rax, x86::qword_ptr(x86::rbp, m_off.jit_budget)); a.jge(done); // budget ceiling a.cmp(x86::byte_ptr(x86::rbp, m_off.check_int), imm(0)); a.jne(done); // interrupt pending a.cmp(x86::byte_ptr(x86::rbp, m_off.check_timers), imm(0)); a.jne(done); // timer pending a.jmp(body); // loop in compiled code } } } #endif a.bind(done); a.mov(x86::qword_ptr(x86::rbx, 26 * 8), x86::r12); // sync pins back to regs[] before returning a.mov(x86::qword_ptr(x86::rbx, 16 * 8), x86::r13); a.mov(x86::qword_ptr(x86::rbx, 27 * 8), x86::r15); a.mov(x86::qword_ptr(x86::rbx, 30 * 8), x86::r14); // R30 (SP) #ifdef _WIN32 a.mov(x86::qword_ptr(x86::rbx, 29 * 8), x86::rsi); // R29 (GP) a.mov(x86::qword_ptr(x86::rbx, 
0 * 8), x86::rdi); // R0 (v0) #endif a.add(x86::rsp, imm(56)); #ifdef _WIN32 a.pop(x86::rdi); a.pop(x86::rsi); #endif a.pop(x86::r15); a.pop(x86::r13); a.pop(x86::r12); a.pop(x86::r14); a.pop(x86::rbp); a.pop(x86::rbx); a.ret(); const size_t csz = code.code_size(); JitFn fn = nullptr; if (((JitRuntime*) m_rt)->add(&fn, &code) != Error::kOk) return; uint32_t total = 0; for (uint32_t bi = 0; bi < n_blocks; ++bi) { JitBlock* b = blocks[bi]; t->segs[bi] = { b->tag, b->phys, b->prefix_len, b->asm_global, b->asn, src_hash(dram + b->phys, b->prefix_len) }; total += b->prefix_len; } t->code = fn; t->head_tag = blocks[0]->tag; t->asn = blocks[0]->asn; t->asm_global = blocks[0]->asm_global; t->valid = true; t->vgen = m_itb_gen + m_flush_gen; t->flush_gen = m_flush_gen; t->n_blocks = n_blocks; t->n_instr = total; t->n_segs = n_blocks; t->n_exits = 0; m_code_bytes += csz; #ifdef JIT_STATS m_trace_formed++; #endif } #ifdef JIT_VERIFY uint64_t CJitEngine::verify_compare(uint64_t blk_virt, const uint64_t* interp, const uint64_t* jit, const uint32_t* words, uint32_t nwords) { m_v_exec++; for (int r = 0; r < 63; ++r) { // r0..r30 main bank + r32..r62 PALshadow bank if (r == 31) continue; // zero register if (interp[r] != jit[r]) { m_v_fail++; printf("[JIT][VERIFY] MISMATCH at block pc=%016llx: r%d interp=%016llx jit=%016llx\n", (unsigned long long) blk_virt, r, (unsigned long long) interp[r], (unsigned long long) jit[r]); printf(" words:"); // the compiled prefix, to decode which instruction diverged for (uint32_t w = 0; w < nwords && w < 16; ++w) printf(" %08x", words[w]); printf("\n"); break; } } if ((m_v_exec % 500000) == 0) { const auto t0 = std::chrono::steady_clock::now(); // exclude this print's I/O stall from the printf("[JIT][VERIFY] %llu compiled-block execs, %llu mismatches\n", // wall-clock-pinned RPCC (unsigned long long) m_v_exec, (unsigned long long) m_v_fail); return (uint64_t) std::chrono::duration_cast( std::chrono::steady_clock::now() - t0).count(); } return 0; } 
#endif #ifdef JIT_STATS // Short mnemonic for the opcode that ended a block's compiled prefix (the coverage gap). static const char* opcode_name(unsigned op) { switch (op) { case 0x00: return "CALL_PAL"; case 0x08: return "LDA"; case 0x09: return "LDAH"; case 0x0a: return "LDBU"; case 0x0b: return "LDQ_U"; case 0x0c: return "LDWU"; case 0x0d: return "STW"; case 0x0e: return "STB"; case 0x0f: return "STQ_U"; case 0x10: return "INTA"; case 0x11: return "INTL"; case 0x12: return "INTS"; case 0x13: return "INTM"; case 0x14: return "ITFP"; case 0x15: return "FLTV"; case 0x16: return "FLTI"; case 0x17: return "FLTL"; case 0x18: return "MISC"; case 0x19: return "HWMFPR";case 0x1a: return "JMP"; case 0x1b: return "HWLD"; case 0x1c: return "FPTI"; case 0x1d: return "HWMTPR";case 0x1e: return "HWREI"; case 0x1f: return "HWST"; case 0x20: return "LDF"; case 0x21: return "LDG"; case 0x22: return "LDS"; case 0x23: return "LDT"; case 0x24: return "STF"; case 0x25: return "STG"; case 0x26: return "STS"; case 0x27: return "STT"; case 0x2a: return "LDL_L"; case 0x2b: return "LDQ_L";case 0x2e: return "STL_C"; case 0x2f: return "STQ_C"; default: return "op"; } } uint64_t CJitEngine::note_exec(uint32_t native_instr, uint32_t interp_instr, uint64_t comp_tsc, uint64_t interp_tsc) { m_stat_native += native_instr; m_stat_interp += interp_instr; m_tsc_compiled += comp_tsc; // host cycles spent in b->code() (compiled chains) m_tsc_interp += interp_tsc; // host cycles spent in the interp fallback loop if (native_instr) m_stat_hot++; // one compiled-chain dispatch if (interp_instr) m_stat_miss++; // one interpreted (cold/uncompilable) block const uint64_t total = m_stat_native + m_stat_interp; if (total < 100000000) return 0; // report every 100M instructions // Time this report's own blocking I/O so the caller can exclude it from the wall-clock-pinned // RPCC -- else the printf stall is billed to the guest cycle counter as a forward jump. 
const auto stat_t0 = std::chrono::steady_clock::now(); const uint64_t win_tsc = jit_rdtsc() - m_tsc_window_start; // window's host cycles (time-split denominator) #ifdef JIT_REGPROF regprof_report(); // exec-weighted GPR histogram for pin selection (I/O timed within this window) #endif // Build each line in a buffer and print it with ONE call: 4 CPU threads print concurrently, // and per-item printf loops interleave mid-line. The [CPU%d] tag attributes each line. const double chain = m_stat_hot ? (double) m_stat_native / (double) m_stat_hot : 0.0; const double avgpl = m_stat_compiled ? (double) m_stat_plen_sum / (double) m_stat_compiled : 0.0; char buf[512]; int len; printf("[JIT][STATS][CPU%d] native %.1f%% (%llu/%llu) | chain avg %.1f instr over %llu dispatches | interp %llu blks\n", m_cpu_id, 100.0 * (double) m_stat_native / (double) total, (unsigned long long) m_stat_native, (unsigned long long) total, chain, (unsigned long long) m_stat_hot, (unsigned long long) m_stat_miss); // Throughput + code expansion: the in-JIT proxy for the codegen-quality diagnosis. MIPS is // wall-clock (clock-independent: cycles/instr = host_GHz / MIPS); x86-bytes/instr is the static // average emitted expansion. Read these off the WORKER CPU; the first window includes warmup. const uint64_t now_ns = (uint64_t) std::chrono::duration_cast( stat_t0.time_since_epoch()).count(); const uint64_t dt_ns = (now_ns > m_stat_wall_last_ns) ? now_ns - m_stat_wall_last_ns : 0; const double mips = dt_ns ? (double) total * 1000.0 / (double) dt_ns : 0.0; const double expand = m_stat_plen_sum ? (double) m_stat_code_bytes / (double) m_stat_plen_sum : 0.0; printf("[JIT][STATS][CPU%d] throughput %.0f MIPS (%llu instr / %.1f ms) | %.1f x86-bytes/instr (static avg)\n", m_cpu_id, mips, (unsigned long long) total, (double) dt_ns / 1e6, expand); // Wall-time split (host TSC): where the window actually spent time -- compiled execution vs interp // fallback vs the dispatcher/chaining remainder. 
Separates the two bottleneck candidates the // instruction counts can't (chain length and interp rate both just track how hot/covered the code is). if (win_tsc) { const double cf = 100.0 * (double) m_tsc_compiled / (double) win_tsc; const double itf = 100.0 * (double) m_tsc_interp / (double) win_tsc; printf("[JIT][STATS][CPU%d] time-split: compiled %.1f%% | interp %.1f%% | dispatch %.1f%%\n", m_cpu_id, cf, itf, (cf + itf < 100.0) ? (100.0 - cf - itf) : 0.0); } // Bail-cause: of the compiled-chain dispatches (m_stat_hot), how many bailed via a cached-link miss // (branches/fall-through) vs a jit_indirect miss (computed jumps) vs gate/other (budget/interrupt/ // mid-block fault = remainder). Pinpoints whether the branch link is the dispatch lever. if (m_stat_hot) { const uint64_t jmp_bail = (m_jmp_attempt > m_jmp_hit) ? (m_jmp_attempt - m_jmp_hit) : 0; const uint64_t other = (m_stat_hot > m_bail_link + jmp_bail) ? (m_stat_hot - m_bail_link - jmp_bail) : 0; printf("[JIT][STATS][CPU%d] bail-cause: link %.0f%% | jump %.0f%% | gate/other %.0f%% (of %llu) | jump chain-rate %.0f%%\n", m_cpu_id, 100.0 * (double) m_bail_link / (double) m_stat_hot, 100.0 * (double) jmp_bail / (double) m_stat_hot, 100.0 * (double) other / (double) m_stat_hot, (unsigned long long) m_stat_hot, m_jmp_attempt ? 100.0 * (double) m_jmp_hit / (double) m_jmp_attempt : 0.0); } // Fresh-compile reason (per window): tag=cache aliasing, asn=cross-process same-PC, phys=remap, // hash=self-mod, cold=genuine new/warmup. Sums to the window's `recorded` growth (the churn cost). 
const uint64_t fresh = m_fresh_tag + m_fresh_asn + m_fresh_phys + m_fresh_hash + m_fresh_cold; if (fresh) printf("[JIT][STATS][CPU%d] fresh-cause: tag %llu | asn %llu | phys %llu | hash %llu | cold %llu (of %llu recompiled)\n", m_cpu_id, (unsigned long long) m_fresh_tag, (unsigned long long) m_fresh_asn, (unsigned long long) m_fresh_phys, (unsigned long long) m_fresh_hash, (unsigned long long) m_fresh_cold, (unsigned long long) fresh); if (m_trace_formed || m_trace_entered) printf("[JIT][STATS][CPU%d] traces: formed %llu | entered %llu | exits %llu | stale %llu (windowed)\n", m_cpu_id, (unsigned long long) m_trace_formed, (unsigned long long) m_trace_entered, (unsigned long long) m_trace_exits, (unsigned long long) m_trace_stale); { uint64_t fh[6] = {0}, mh[6] = {0}, tm = 0; // link-fanout: thrashing source blocks + cumulative misses, bucketed by #distinct successors for (int i = 0; i < kCacheEntries; ++i) { if (!m_blocks[i].valid || m_blocks[i].link_misses == 0) continue; int f = m_blocks[i].link_fanout > 5 ? 
5 : m_blocks[i].link_fanout; fh[f]++; mh[f] += m_blocks[i].link_misses; tm += m_blocks[i].link_misses; } if (tm) printf("[JIT][STATS][CPU%d] link-fanout (srcs/misses by #distinct successors): f1=%llu/%llu f2=%llu/%llu f3=%llu/%llu f4=%llu/%llu f5+=%llu/%llu | total miss %llu\n", m_cpu_id, (unsigned long long) fh[1], (unsigned long long) mh[1], (unsigned long long) fh[2], (unsigned long long) mh[2], (unsigned long long) fh[3], (unsigned long long) mh[3], (unsigned long long) fh[4], (unsigned long long) mh[4], (unsigned long long) fh[5], (unsigned long long) mh[5], (unsigned long long) tm); } len = snprintf(buf, sizeof(buf), "[JIT][STATS][CPU%d] %llu recorded, %llu compiled (avg %.1f instr) | block-breakers:", m_cpu_id, (unsigned long long) m_recorded, (unsigned long long) m_stat_compiled, avgpl); uint64_t hist[64]; memcpy(hist, m_term_op, sizeof(hist)); for (int rank = 0; rank < 8 && len < (int) sizeof(buf) - 32; ++rank) { int best = -1; uint64_t bestv = 0; for (int op = 0; op < 64; ++op) if (hist[op] > bestv) { bestv = hist[op]; best = op; } if (best < 0) break; len += snprintf(buf + len, sizeof(buf) - len, " %s(0x%02x)=%llu", opcode_name(best), best, (unsigned long long) bestv); hist[best] = 0; } printf("%s\n", buf); if (m_term_op[0]) { // CALL_PAL cut blocks -- show which function codes dominate (chain targets) len = snprintf(buf, sizeof(buf), "[JIT][STATS][CPU%d] CALL_PAL by func:", m_cpu_id); uint64_t ph[256]; memcpy(ph, m_pal_func, sizeof(ph)); for (int rank = 0; rank < 8 && len < (int) sizeof(buf) - 32; ++rank) { int best = -1; uint64_t bestv = 0; for (int f = 0; f < 256; ++f) if (ph[f] > bestv) { bestv = ph[f]; best = f; } if (best < 0) break; len += snprintf(buf + len, sizeof(buf) - len, " 0x%02x=%llu", best, (unsigned long long) bestv); ph[best] = 0; } printf("%s\n", buf); } if (m_term_op[0x1d]) { // HW_MTPR cut blocks -- which IPR writes dominate (the side-effecting set) len = snprintf(buf, sizeof(buf), "[JIT][STATS][CPU%d] HW_MTPR by IPR:", m_cpu_id); 
uint64_t mh[256]; memcpy(mh, m_mtpr_func, sizeof(mh)); for (int rank = 0; rank < 8 && len < (int) sizeof(buf) - 32; ++rank) { int best = -1; uint64_t bestv = 0; for (int f = 0; f < 256; ++f) if (mh[f] > bestv) { bestv = mh[f]; best = f; } if (best < 0) break; len += snprintf(buf + len, sizeof(buf) - len, " 0x%02x=%llu", best, (unsigned long long) bestv); mh[best] = 0; } printf("%s\n", buf); } if (m_term_op[0x1b]) { // HW_LD cut blocks -- which forms dominate (virt/lock/vpte/chk) len = snprintf(buf, sizeof(buf), "[JIT][STATS][CPU%d] HW_LD by form:", m_cpu_id); uint64_t lh[16]; memcpy(lh, m_hwld_func, sizeof(lh)); for (int rank = 0; rank < 8 && len < (int) sizeof(buf) - 32; ++rank) { int best = -1; uint64_t bestv = 0; for (int f = 0; f < 16; ++f) if (lh[f] > bestv) { bestv = lh[f]; best = f; } if (best < 0) break; len += snprintf(buf + len, sizeof(buf) - len, " 0x%x=%llu", best, (unsigned long long) bestv); lh[best] = 0; } printf("%s\n", buf); } if (m_term_op[0x18]) { // MISC cut blocks -- which Ra==31 form dominates (RPCC degenerate-noop / RC,RS soft-intr-flag) printf("[JIT][STATS][CPU%d] MISC by form: RPCC=%llu RC=%llu RS=%llu\n", m_cpu_id, (unsigned long long) m_misc_func[0xc], (unsigned long long) m_misc_func[0xe], (unsigned long long) m_misc_func[0xf]); } m_stat_native = m_stat_interp = m_stat_hot = m_stat_miss = 0; // reset the window m_tsc_compiled = m_tsc_interp = 0; m_tsc_window_start = jit_rdtsc(); // next window's split denominator starts after this report's I/O m_bail_link = m_jmp_attempt = m_jmp_hit = 0; m_fresh_cold = m_fresh_tag = m_fresh_asn = m_fresh_phys = m_fresh_hash = 0; m_trace_formed = m_trace_entered = m_trace_exits = m_trace_stale = 0; const auto stat_end = std::chrono::steady_clock::now(); m_stat_wall_last_ns = (uint64_t) std::chrono::duration_cast( stat_end.time_since_epoch()).count(); // next window's throughput delta starts after this I/O return (uint64_t) std::chrono::duration_cast(stat_end - stat_t0).count(); } #endif #ifdef JIT_REGPROF 
// Pin-selection report: attribute each live block's executions (rp_hits) to every GPR it touches // (rp_mask), then print the heaviest. A trailing '*' marks regs that CANNOT be pinned -- R4-7/R20-23, // the PALshadow remap targets (which also covers R23's CALL_PAL save). R31 is excluded by the mask. void CJitEngine::regprof_report() { uint64_t hist[32] = { 0 }; uint64_t exec_instr = 0, exec_bytes = 0; // exec-weighted: hot-path Alpha instrs and emitted x86 bytes for (int s = 0; s < kCacheEntries; ++s) { const JitBlock& b = m_blocks[s]; if (!b.valid || b.rp_hits == 0) continue; for (int r = 0; r < 31; ++r) if (b.rp_mask & (1u << r)) hist[r] += b.rp_hits; exec_instr += b.rp_hits * (uint64_t) b.prefix_len; exec_bytes += b.rp_hits * (uint64_t) b.rp_csz; } // Execution-weighted code expansion -- the HOT path, not the cold-block-skewed static average. // x86-instrs/instr ~= this / ~3.5; with cycles/instr from the throughput line -> hot-path IPC. printf("[JIT][REGPROF][CPU%d] exec-weighted expansion: %.1f x86-bytes/instr (hot path)\n", m_cpu_id, exec_instr ? (double) exec_bytes / (double) exec_instr : 0.0); char buf[256]; int len = snprintf(buf, sizeof(buf), "[JIT][REGPROF][CPU%d] hot GPRs (exec x accesses):", m_cpu_id); for (int rank = 0; rank < 8 && len < (int) sizeof(buf) - 24; ++rank) { int best = -1; uint64_t bestv = 0; for (int r = 0; r < 31; ++r) if (hist[r] > bestv) { bestv = hist[r]; best = r; } if (best < 0) break; const bool pinnable = (best & 0xc) != 0x4; // exclude R4-7 / R20-23 (shadow remap; covers R23) len += snprintf(buf + len, sizeof(buf) - len, " R%d=%llu%s", best, (unsigned long long) bestv, pinnable ? "" : "*"); hist[best] = 0; } printf("%s (* = not pin-eligible)\n", buf); } #endif #endif // ES40_JIT