From 96d426f8e6834f783326093f314e656ce0e51974 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 01:29:55 +0300 Subject: [PATCH 01/17] Naive JIT through IR --- Makefile | 8 +- jited_ir.c | 440 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 447 insertions(+), 1 deletion(-) create mode 100644 jited_ir.c diff --git a/Makefile b/Makefile index 320377c..b296d08 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ COMMON_SRC = common.c COMMON_OBJ := $(COMMON_SRC:.c=.o) COMMON_HEADERS = common.h -ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native +ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native jited_ir # Must be the first target for the magic below to work all: $(ALL) @@ -110,3 +110,9 @@ threaded-notune: threaded.o # This will crash with stack overflow tailrecursive-noopt: CFLAGS += -O0 -fno-optimize-sibling-calls tailrecursive-noopt: tailrecursive.o + +jited_ir.o: jited_ir.c + $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -c $< + +jited_ir: jited_ir.o + $(CC) $^ -lir -lcapstone -lm -o $@ diff --git a/jited_ir.c b/jited_ir.c new file mode 100644 index 0000000..08fb94e --- /dev/null +++ b/jited_ir.c @@ -0,0 +1,440 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#include +#include "ir.h" +#include "ir_builder.h" + +#define ir_CONST_STR(str) ir_const_str(ctx, ir_str(ctx, str)) + +static inline decode_t decode_at_address(const Instr_t* prog, uint32_t addr) { + assert(addr < PROGRAM_SIZE); + decode_t result = {0}; + Instr_t raw_instr = prog[addr]; + result.opcode = raw_instr; + switch (raw_instr) { + case Instr_Nop: + case Instr_Halt: + case Instr_Print: + case Instr_Swap: + case Instr_Dup: + case Instr_Inc: + case Instr_Add: + case Instr_Sub: + case Instr_Mul: + case Instr_Rand: + case Instr_Dec: + case Instr_Drop: + case Instr_Over: + case Instr_Mod: + case Instr_And: + case 
Instr_Or: + case Instr_Xor: + case Instr_SHL: + case Instr_SHR: + case Instr_Rot: + case Instr_SQRT: + case Instr_Pick: + result.length = 1; + break; + case Instr_Push: + case Instr_JNE: + case Instr_JE: + case Instr_Jump: + result.length = 2; + if (!(addr+1 < PROGRAM_SIZE)) { + result.length = 1; + result.opcode = Instr_Break; + break; + } + result.immediate = (int32_t)prog[addr+1]; + break; + case Instr_Break: + default: /* Undefined instructions equal to Break */ + result.length = 1; + result.opcode = Instr_Break; + break; + } + return result; +} + +/*** Service routines ***/ +#define BAIL_ON_ERROR() if (cpu.state != Cpu_Running) break; + +static void jit_push(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_overflow, ir_ref v) { + // JIT: if (pcpu->sp >= STACK_CAPACITY-1) { + ir_ref sp_addr = ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp)); + ir_ref sp = ir_LOAD_I32(sp_addr); + ir_ref if_overflow = ir_IF(ir_GE(sp, ir_CONST_I32(STACK_CAPACITY-1))); + + ir_IF_TRUE_cold(if_overflow); + ir_END_list(*stack_overflow); + + ir_IF_FALSE(if_overflow); + + // JIT: pcpu->stack[++pcpu->sp] = v; + sp = ir_ADD_I32(sp, ir_CONST_I32(1)); + ir_STORE(sp_addr, sp); + ir_STORE(ir_ADD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, stack)), + ir_MUL_I32(sp, ir_CONST_I32(sizeof(uint32_t)))), v); +} + +static ir_ref jit_pop(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_underflow) { + // JIT: if (pcpu->sp < 0) { + ir_ref sp_addr = ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp)); + ir_ref sp = ir_LOAD_I32(sp_addr); + ir_ref if_underflow = ir_IF(ir_LT(sp, ir_CONST_I32(0))); + + ir_IF_TRUE_cold(if_underflow); + ir_END_list(*stack_underflow); + + ir_IF_FALSE(if_underflow); + + //JIT: pcpu->stack[pcpu->sp--]; + ir_ref ret = ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, stack)), + ir_MUL_I32(sp, ir_CONST_I32(sizeof(uint32_t))))); + sp = ir_SUB_I32(sp, ir_CONST_I32(1)); + ir_STORE(sp_addr, sp); + + return ret; +} + +static ir_ref jit_pick(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_bound, ir_ref pos) { + // JIT: if (pcpu->sp - 
1 < pos) { + ir_ref sp = ir_LOAD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp))); + ir_ref if_out = ir_IF(ir_LT(ir_SUB_U32(sp, ir_CONST_U32(1)), pos)); + + ir_IF_TRUE_cold(if_out); + ir_END_list(*stack_bound); + + ir_IF_FALSE(if_out); + // JIT: pcpu->stack[pcpu->sp - pos]; + return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, stack)), + ir_MUL_I32(ir_SUB_I32(sp, pos), ir_CONST_I32(sizeof(uint32_t))))); +} + +typedef struct _jit_label { + ir_ref inputs; /* number of input edges */ + ir_ref merge; /* reference of MERGE or "list" of forward inputs */ +} jit_label; + +static void jit_goto_backward(ir_ctx *ctx, jit_label *label) { + ir_set_op(ctx, label->merge, ++label->inputs, ir_END()); +} + +static void jit_goto_forward(ir_ctx *ctx, jit_label *label) { + ir_END_list(label->merge); +} + +static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { + assert(prog); + ir_ref stack_overflow = IR_UNUSED; + ir_ref stack_underflow = IR_UNUSED; + ir_ref stack_bound = IR_UNUSED; + jit_label *labels = calloc(len, sizeof(jit_label)); + + /* mark goto targets */ + for (int i=0; i < len;) { + decode_t decoded = decode_at_address(prog, i); + + i += decoded.length; + switch(decoded.opcode) { + case Instr_JE: + case Instr_JNE: + case Instr_Jump: + labels[i + decoded.immediate].inputs++; + break; + } + } + + ir_START(); + ir_ref tmp1, tmp2, tmp3; + ir_ref cpu = ir_PARAM(IR_ADDR, "cpu", 1); + ir_ref printf_func = + ir_const_func(ctx, ir_str(ctx, "printf"), ir_proto_1(ctx, IR_I32, IR_VARARG_FUNC, IR_ADDR)); + ir_ref rand_func = + ir_const_func(ctx, ir_str(ctx, "rand"), ir_proto_0(ctx, IR_I32, 0)); + ir_ref sqrt_func = + ir_const_func(ctx, ir_str(ctx, "sqrt"), ir_proto_1(ctx, IR_DOUBLE, IR_BUILTIN_FUNC, IR_DOUBLE)); + + for (int i=0; i < len;) { + decode_t decoded = decode_at_address(prog, i); + + if (labels[i].inputs > 0) { + assert(!ctx->control); + if (labels[i].inputs == 1) { + tmp1 = ir_emit1(ctx, IR_BEGIN, IR_UNUSED); + } else { + tmp1 = ir_emit_N(ctx, IR_MERGE, 
labels[i].inputs); + } + tmp2 = 0; + tmp3 = labels[i].merge; + labels[i].merge = ctx->control = tmp1; + + while (tmp3) { + /* Store forward GOTOs into MERGE */ + tmp2++; + assert(tmp2 <= labels[i].inputs); + ir_set_op(ctx, tmp1, tmp2, tmp3); + ir_insn *insn = &ctx->ir_base[tmp3]; + assert(insn->op == IR_END); + tmp3 = insn->op2; + insn->op2 = IR_UNUSED; + } + labels[i].inputs = tmp2; + } + + i += decoded.length; + + switch(decoded.opcode) { + case Instr_Nop: + /* Do nothing */ + break; + case Instr_Halt: + // JIT: cpu.state = Cpu_Halted; + ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Halted)); + ir_RETURN(IR_VOID); + break; + case Instr_Push: + jit_push(ctx, cpu, &stack_overflow, ir_CONST_U32(decoded.immediate)); + break; + case Instr_Print: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + ir_CALL_2(IR_VOID, printf_func, ir_CONST_STR("[%d]\n"), tmp1); + break; + case Instr_Swap: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, tmp1); + jit_push(ctx, cpu, &stack_overflow, tmp2); + break; + case Instr_Dup: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, tmp1); + jit_push(ctx, cpu, &stack_overflow, tmp1); + break; + case Instr_Over: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, tmp2); + jit_push(ctx, cpu, &stack_overflow, tmp1); + jit_push(ctx, cpu, &stack_overflow, tmp2); + break; + case Instr_Inc: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_ADD_U32(tmp1, ir_CONST_U32(1))); + break; + case Instr_Add: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_ADD_U32(tmp1, tmp2)); + break; + case Instr_Sub: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, 
&stack_overflow, ir_SUB_U32(tmp1, tmp2)); + break; + case Instr_Mod: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + // JIT if (tmp2 == 0) + tmp3 = ir_IF(ir_EQ(tmp2, ir_CONST_U32(0))); + + ir_IF_TRUE_cold(tmp3); + // JIT: printf("Division by zero\n"); + ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Division by zero\n")); + // JIT: pcpu->state = Cpu_Break; + ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_RETURN(IR_VOID); + + ir_IF_FALSE(tmp3); + jit_push(ctx, cpu, &stack_overflow, ir_MOD_U32(tmp1, tmp2)); + break; + + case Instr_Mul: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_MUL_U32(tmp1, tmp2)); + break; + case Instr_Rand: + tmp1 = ir_CALL(IR_I32, rand_func); + jit_push(ctx, cpu, &stack_overflow, tmp1); + break; + case Instr_Dec: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_SUB_U32(tmp1, ir_CONST_U32(1))); + break; + case Instr_Drop: + (void)jit_pop(ctx, cpu, &stack_underflow); + break; + case Instr_JE: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + // JIT: if (tmp1 == 0) + tmp3 = ir_IF(ir_EQ(tmp1, ir_CONST_U32(0))); + ir_IF_TRUE(tmp3); + if (decoded.immediate < -decoded.length) { + jit_goto_backward(ctx, &labels[i + decoded.immediate]); + } else { + jit_goto_forward(ctx, &labels[i + decoded.immediate]); + } + ir_IF_FALSE(tmp3); + break; + case Instr_JNE: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + // JIT: if (tmp1 == 0) + tmp3 = ir_IF(ir_NE(tmp1, ir_CONST_U32(0))); + ir_IF_TRUE(tmp3); + if (decoded.immediate < -decoded.length) { + jit_goto_backward(ctx, &labels[i + decoded.immediate]); + } else { + jit_goto_forward(ctx, &labels[i + decoded.immediate]); + } + ir_IF_FALSE(tmp3); + break; + case Instr_Jump: + if (decoded.immediate < -decoded.length) { + jit_goto_backward(ctx, &labels[i + decoded.immediate]); + } else { + jit_goto_forward(ctx, 
&labels[i + decoded.immediate]); + } + break; + case Instr_And: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_AND_U32(tmp1, tmp2)); + break; + case Instr_Or: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_OR_U32(tmp1, tmp2)); + break; + case Instr_Xor: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_XOR_U32(tmp1, tmp2)); + break; + case Instr_SHL: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_SHL_U32(tmp1, tmp2)); + break; + case Instr_SHR: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, ir_SHR_U32(tmp1, tmp2)); + break; + case Instr_Rot: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp2 = jit_pop(ctx, cpu, &stack_underflow); + tmp3 = jit_pop(ctx, cpu, &stack_underflow); + jit_push(ctx, cpu, &stack_overflow, tmp1); + jit_push(ctx, cpu, &stack_overflow, tmp3); + jit_push(ctx, cpu, &stack_overflow, tmp2); + break; + case Instr_SQRT: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = ir_FP2U32(ir_CALL_1(IR_DOUBLE, sqrt_func, ir_INT2D(tmp1))); + jit_push(ctx, cpu, &stack_overflow, tmp1); + break; + case Instr_Pick: + tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = jit_pick(ctx, cpu, &stack_bound, tmp1); + jit_push(ctx, cpu, &stack_overflow, tmp1); + break; + case Instr_Break: + if (ctx->control) { + // JIT: pcpu->state = Cpu_Break; + ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_RETURN(IR_VOID); + } + i = len; + break; + default: + assert(0 && "Unsupported instruction"); + break; + } + if (i < len && labels[i].inputs > 0 && decoded.opcode != Instr_Jump) { + labels[i].inputs++; + jit_goto_forward(ctx, 
&labels[i]); + } + } + + if (stack_overflow) { + ir_MERGE_list(stack_overflow); + // JIT: printf("Stack overflow\n"); + ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Stack overflow\n")); + // JIT: pcpu->state = Cpu_Break; + ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_RETURN(IR_VOID); + } + + if (stack_underflow) { + ir_MERGE_list(stack_underflow); + // JIT: printf("Stack overflow\n"); + ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Stack underflow\n")); + // JIT: pcpu->state = Cpu_Break; + ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_RETURN(IR_VOID); + } + if (stack_bound) { + ir_MERGE_list(stack_bound); + // JIT: printf("Out of bound picking\n"); + ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Stack underflow\n")); + // JIT: pcpu->state = Cpu_Break; + ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_RETURN(IR_VOID); + } +} + +int main(int argc, char **argv) { + uint64_t steplimit = parse_args(argc, argv); + cpu_t cpu = init_cpu(); + ir_ctx ctx; + typedef void (*entry_t)(cpu_t*); + entry_t entry; + size_t size; + + ir_init(&ctx, IR_FUNCTION | IR_OPT_FOLDING | IR_OPT_CFG | IR_OPT_CODEGEN, 256, 1024); + + jit_program(&ctx, cpu.pmem, PROGRAM_SIZE); + ir_save(&ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + + entry = (entry_t)ir_jit_compile(&ctx, 2, &size); + if (!entry) { + printf("Compilation failure\n"); + } + + ir_save(&ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + ir_disasm("prog", entry, size, 0, &ctx, stderr); + + entry(&cpu); + + ir_free(&ctx); + + assert(cpu.state != Cpu_Running || cpu.steps == steplimit); + + /* Print CPU state */ + printf("CPU executed %ld steps. End state \"%s\".\n", + cpu.steps, cpu.state == Cpu_Halted? "Halted": + cpu.state == Cpu_Running? 
"Running": "Break"); + printf("PC = %#x, SP = %d\n", cpu.pc, cpu.sp); + printf("Stack: "); + for (int32_t i=cpu.sp; i >= 0 ; i--) { + printf("%#10x ", cpu.stack[i]); + } + printf("%s\n", cpu.sp == -1? "(empty)": ""); + + free(LoadedProgram); + + return cpu.state == Cpu_Halted || + (cpu.state == Cpu_Running && + cpu.steps == steplimit)?0:1; +} From d4ca3b3a0337a7dd27e0e1909c1449c2c7e18653 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 02:40:30 +0300 Subject: [PATCH 02/17] cleanup --- jited_ir.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/jited_ir.c b/jited_ir.c index 08fb94e..42805f5 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -67,8 +67,6 @@ static inline decode_t decode_at_address(const Instr_t* prog, uint32_t addr) { } /*** Service routines ***/ -#define BAIL_ON_ERROR() if (cpu.state != Cpu_Running) break; - static void jit_push(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_overflow, ir_ref v) { // JIT: if (pcpu->sp >= STACK_CAPACITY-1) { ir_ref sp_addr = ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp)); @@ -140,11 +138,11 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { ir_ref stack_underflow = IR_UNUSED; ir_ref stack_bound = IR_UNUSED; jit_label *labels = calloc(len, sizeof(jit_label)); + decode_t decoded; /* mark goto targets */ for (int i=0; i < len;) { - decode_t decoded = decode_at_address(prog, i); - + decoded = decode_at_address(prog, i); i += decoded.length; switch(decoded.opcode) { case Instr_JE: @@ -165,10 +163,14 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { ir_ref sqrt_func = ir_const_func(ctx, ir_str(ctx, "sqrt"), ir_proto_1(ctx, IR_DOUBLE, IR_BUILTIN_FUNC, IR_DOUBLE)); - for (int i=0; i < len;) { - decode_t decoded = decode_at_address(prog, i); + decoded.opcode = Instr_Nop; + for (int i=0; i < len;) { if (labels[i].inputs > 0) { + if (decoded.opcode != Instr_Jump) { + labels[i].inputs++; + jit_goto_forward(ctx, &labels[i]); + } assert(!ctx->control); if 
(labels[i].inputs == 1) { tmp1 = ir_emit1(ctx, IR_BEGIN, IR_UNUSED); @@ -192,6 +194,7 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { labels[i].inputs = tmp2; } + decoded = decode_at_address(prog, i); i += decoded.length; switch(decoded.opcode) { @@ -361,10 +364,6 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { assert(0 && "Unsupported instruction"); break; } - if (i < len && labels[i].inputs > 0 && decoded.opcode != Instr_Jump) { - labels[i].inputs++; - jit_goto_forward(ctx, &labels[i]); - } } if (stack_overflow) { From 1a0b692280fdd59c77891d5c7e179517f069e54e Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 12:49:59 +0300 Subject: [PATCH 03/17] Source refactoring --- jited_ir.c | 267 ++++++++++++++++++++++++++++------------------------- 1 file changed, 139 insertions(+), 128 deletions(-) diff --git a/jited_ir.c b/jited_ir.c index 42805f5..e5760c2 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -13,7 +13,7 @@ #include "ir.h" #include "ir_builder.h" -#define ir_CONST_STR(str) ir_const_str(ctx, ir_str(ctx, str)) +#define ir_CONST_STR(str) ir_const_str(_ir_CTX, ir_str(_ir_CTX, str)) static inline decode_t decode_at_address(const Instr_t* prog, uint32_t addr) { assert(addr < PROGRAM_SIZE); @@ -67,37 +67,53 @@ static inline decode_t decode_at_address(const Instr_t* prog, uint32_t addr) { } /*** Service routines ***/ -static void jit_push(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_overflow, ir_ref v) { +typedef struct _jit_label { + ir_ref inputs; /* number of input edges */ + ir_ref merge; /* reference of MERGE or "list" of forward inputs */ +} jit_label; + +typedef struct _jit_ctx { + ir_ctx ctx; + ir_ref cpu; + ir_ref stack_overflow; + ir_ref stack_underflow; + ir_ref stack_bound; +} jit_ctx; + +#undef _ir_CTX +#define _ir_CTX (&jit->ctx) + +static void jit_push(jit_ctx *jit, ir_ref v) { // JIT: if (pcpu->sp >= STACK_CAPACITY-1) { - ir_ref sp_addr = ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp)); + ir_ref sp_addr = 
ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); ir_ref sp = ir_LOAD_I32(sp_addr); ir_ref if_overflow = ir_IF(ir_GE(sp, ir_CONST_I32(STACK_CAPACITY-1))); ir_IF_TRUE_cold(if_overflow); - ir_END_list(*stack_overflow); + ir_END_list(jit->stack_overflow); ir_IF_FALSE(if_overflow); // JIT: pcpu->stack[++pcpu->sp] = v; sp = ir_ADD_I32(sp, ir_CONST_I32(1)); ir_STORE(sp_addr, sp); - ir_STORE(ir_ADD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, stack)), + ir_STORE(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(sp, ir_CONST_I32(sizeof(uint32_t)))), v); } -static ir_ref jit_pop(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_underflow) { +static ir_ref jit_pop(jit_ctx *jit) { // JIT: if (pcpu->sp < 0) { - ir_ref sp_addr = ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp)); + ir_ref sp_addr = ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); ir_ref sp = ir_LOAD_I32(sp_addr); ir_ref if_underflow = ir_IF(ir_LT(sp, ir_CONST_I32(0))); ir_IF_TRUE_cold(if_underflow); - ir_END_list(*stack_underflow); + ir_END_list(jit->stack_underflow); ir_IF_FALSE(if_underflow); //JIT: pcpu->stack[pcpu->sp--]; - ir_ref ret = ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, stack)), + ir_ref ret = ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(sp, ir_CONST_I32(sizeof(uint32_t))))); sp = ir_SUB_I32(sp, ir_CONST_I32(1)); ir_STORE(sp_addr, sp); @@ -105,38 +121,30 @@ static ir_ref jit_pop(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_underflow) { return ret; } -static ir_ref jit_pick(ir_ctx *ctx, ir_ref cpu, ir_ref *stack_bound, ir_ref pos) { +static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { // JIT: if (pcpu->sp - 1 < pos) { - ir_ref sp = ir_LOAD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, sp))); + ir_ref sp = ir_LOAD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp))); ir_ref if_out = ir_IF(ir_LT(ir_SUB_U32(sp, ir_CONST_U32(1)), pos)); ir_IF_TRUE_cold(if_out); - ir_END_list(*stack_bound); + ir_END_list(jit->stack_bound); ir_IF_FALSE(if_out); // JIT: pcpu->stack[pcpu->sp - pos]; - 
return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(cpu, offsetof(cpu_t, stack)), + return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(ir_SUB_I32(sp, pos), ir_CONST_I32(sizeof(uint32_t))))); } -typedef struct _jit_label { - ir_ref inputs; /* number of input edges */ - ir_ref merge; /* reference of MERGE or "list" of forward inputs */ -} jit_label; - -static void jit_goto_backward(ir_ctx *ctx, jit_label *label) { - ir_set_op(ctx, label->merge, ++label->inputs, ir_END()); +static void jit_goto_backward(jit_ctx *jit, jit_label *label) { + ir_set_op(_ir_CTX, label->merge, ++label->inputs, ir_END()); } -static void jit_goto_forward(ir_ctx *ctx, jit_label *label) { +static void jit_goto_forward(jit_ctx *jit, jit_label *label) { ir_END_list(label->merge); } -static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { +static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { assert(prog); - ir_ref stack_overflow = IR_UNUSED; - ir_ref stack_underflow = IR_UNUSED; - ir_ref stack_bound = IR_UNUSED; jit_label *labels = calloc(len, sizeof(jit_label)); decode_t decoded; @@ -155,13 +163,18 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { ir_START(); ir_ref tmp1, tmp2, tmp3; - ir_ref cpu = ir_PARAM(IR_ADDR, "cpu", 1); + + jit->cpu = ir_PARAM(IR_ADDR, "cpu", 1); + jit->stack_overflow = IR_UNUSED; + jit->stack_underflow = IR_UNUSED; + jit->stack_bound = IR_UNUSED; + ir_ref printf_func = - ir_const_func(ctx, ir_str(ctx, "printf"), ir_proto_1(ctx, IR_I32, IR_VARARG_FUNC, IR_ADDR)); + ir_const_func(_ir_CTX, ir_str(_ir_CTX, "printf"), ir_proto_1(_ir_CTX, IR_I32, IR_VARARG_FUNC, IR_ADDR)); ir_ref rand_func = - ir_const_func(ctx, ir_str(ctx, "rand"), ir_proto_0(ctx, IR_I32, 0)); + ir_const_func(_ir_CTX, ir_str(_ir_CTX, "rand"), ir_proto_0(_ir_CTX, IR_I32, 0)); ir_ref sqrt_func = - ir_const_func(ctx, ir_str(ctx, "sqrt"), ir_proto_1(ctx, IR_DOUBLE, IR_BUILTIN_FUNC, IR_DOUBLE)); + ir_const_func(_ir_CTX, 
ir_str(_ir_CTX, "sqrt"), ir_proto_1(_ir_CTX, IR_DOUBLE, IR_BUILTIN_FUNC, IR_DOUBLE)); decoded.opcode = Instr_Nop; @@ -169,24 +182,24 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { if (labels[i].inputs > 0) { if (decoded.opcode != Instr_Jump) { labels[i].inputs++; - jit_goto_forward(ctx, &labels[i]); + jit_goto_forward(jit, &labels[i]); } - assert(!ctx->control); + assert(!jit->ctx.control); if (labels[i].inputs == 1) { - tmp1 = ir_emit1(ctx, IR_BEGIN, IR_UNUSED); + tmp1 = ir_emit1(_ir_CTX, IR_BEGIN, IR_UNUSED); } else { - tmp1 = ir_emit_N(ctx, IR_MERGE, labels[i].inputs); + tmp1 = ir_emit_N(_ir_CTX, IR_MERGE, labels[i].inputs); } tmp2 = 0; tmp3 = labels[i].merge; - labels[i].merge = ctx->control = tmp1; + labels[i].merge = jit->ctx.control = tmp1; while (tmp3) { /* Store forward GOTOs into MERGE */ tmp2++; assert(tmp2 <= labels[i].inputs); - ir_set_op(ctx, tmp1, tmp2, tmp3); - ir_insn *insn = &ctx->ir_base[tmp3]; + ir_set_op(_ir_CTX, tmp1, tmp2, tmp3); + ir_insn *insn = &jit->ctx.ir_base[tmp3]; assert(insn->op == IR_END); tmp3 = insn->op2; insn->op2 = IR_UNUSED; @@ -203,51 +216,51 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { break; case Instr_Halt: // JIT: cpu.state = Cpu_Halted; - ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Halted)); + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Halted)); ir_RETURN(IR_VOID); break; case Instr_Push: - jit_push(ctx, cpu, &stack_overflow, ir_CONST_U32(decoded.immediate)); + jit_push(jit, ir_CONST_U32(decoded.immediate)); break; case Instr_Print: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = jit_pop(jit); ir_CALL_2(IR_VOID, printf_func, ir_CONST_STR("[%d]\n"), tmp1); break; case Instr_Swap: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, tmp1); - jit_push(ctx, cpu, &stack_overflow, tmp2); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + 
jit_push(jit, tmp1); + jit_push(jit, tmp2); break; case Instr_Dup: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, tmp1); - jit_push(ctx, cpu, &stack_overflow, tmp1); + tmp1 = jit_pop(jit); + jit_push(jit, tmp1); + jit_push(jit, tmp1); break; case Instr_Over: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, tmp2); - jit_push(ctx, cpu, &stack_overflow, tmp1); - jit_push(ctx, cpu, &stack_overflow, tmp2); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, tmp2); + jit_push(jit, tmp1); + jit_push(jit, tmp2); break; case Instr_Inc: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_ADD_U32(tmp1, ir_CONST_U32(1))); + tmp1 = jit_pop(jit); + jit_push(jit, ir_ADD_U32(tmp1, ir_CONST_U32(1))); break; case Instr_Add: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_ADD_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_ADD_U32(tmp1, tmp2)); break; case Instr_Sub: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_SUB_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_SUB_U32(tmp1, tmp2)); break; case Instr_Mod: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); // JIT if (tmp2 == 0) tmp3 = ir_IF(ir_EQ(tmp2, ir_CONST_U32(0))); @@ -255,107 +268,106 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { // JIT: printf("Division by zero\n"); ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Division by zero\n")); // JIT: pcpu->state = Cpu_Break; - ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), 
ir_CONST_I32(Cpu_Break)); ir_RETURN(IR_VOID); ir_IF_FALSE(tmp3); - jit_push(ctx, cpu, &stack_overflow, ir_MOD_U32(tmp1, tmp2)); + jit_push(jit, ir_MOD_U32(tmp1, tmp2)); break; - case Instr_Mul: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_MUL_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_MUL_U32(tmp1, tmp2)); break; case Instr_Rand: tmp1 = ir_CALL(IR_I32, rand_func); - jit_push(ctx, cpu, &stack_overflow, tmp1); + jit_push(jit, tmp1); break; case Instr_Dec: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_SUB_U32(tmp1, ir_CONST_U32(1))); + tmp1 = jit_pop(jit); + jit_push(jit, ir_SUB_U32(tmp1, ir_CONST_U32(1))); break; case Instr_Drop: - (void)jit_pop(ctx, cpu, &stack_underflow); + (void)jit_pop(jit); break; case Instr_JE: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = jit_pop(jit); // JIT: if (tmp1 == 0) tmp3 = ir_IF(ir_EQ(tmp1, ir_CONST_U32(0))); ir_IF_TRUE(tmp3); - if (decoded.immediate < -decoded.length) { - jit_goto_backward(ctx, &labels[i + decoded.immediate]); + if (decoded.immediate >= 0) { + jit_goto_forward(jit, &labels[i + decoded.immediate]); } else { - jit_goto_forward(ctx, &labels[i + decoded.immediate]); + jit_goto_backward(jit, &labels[i + decoded.immediate]); } ir_IF_FALSE(tmp3); break; case Instr_JNE: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = jit_pop(jit); // JIT: if (tmp1 == 0) tmp3 = ir_IF(ir_NE(tmp1, ir_CONST_U32(0))); ir_IF_TRUE(tmp3); - if (decoded.immediate < -decoded.length) { - jit_goto_backward(ctx, &labels[i + decoded.immediate]); + if (decoded.immediate >= 0) { + jit_goto_forward(jit, &labels[i + decoded.immediate]); } else { - jit_goto_forward(ctx, &labels[i + decoded.immediate]); + jit_goto_backward(jit, &labels[i + decoded.immediate]); } ir_IF_FALSE(tmp3); break; case Instr_Jump: - if (decoded.immediate < -decoded.length) { - jit_goto_backward(ctx, 
&labels[i + decoded.immediate]); + if (decoded.immediate >= 0) { + jit_goto_forward(jit, &labels[i + decoded.immediate]); } else { - jit_goto_forward(ctx, &labels[i + decoded.immediate]); + jit_goto_backward(jit, &labels[i + decoded.immediate]); } break; case Instr_And: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_AND_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_AND_U32(tmp1, tmp2)); break; case Instr_Or: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_OR_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_OR_U32(tmp1, tmp2)); break; case Instr_Xor: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_XOR_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_XOR_U32(tmp1, tmp2)); break; case Instr_SHL: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_SHL_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_SHL_U32(tmp1, tmp2)); break; case Instr_SHR: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, ir_SHR_U32(tmp1, tmp2)); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + jit_push(jit, ir_SHR_U32(tmp1, tmp2)); break; case Instr_Rot: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp2 = jit_pop(ctx, cpu, &stack_underflow); - tmp3 = jit_pop(ctx, cpu, &stack_underflow); - jit_push(ctx, cpu, &stack_overflow, tmp1); - jit_push(ctx, cpu, &stack_overflow, tmp3); - jit_push(ctx, cpu, &stack_overflow, tmp2); + tmp1 = jit_pop(jit); + tmp2 = jit_pop(jit); + tmp3 = jit_pop(jit); + jit_push(jit, tmp1); + jit_push(jit, tmp3); + jit_push(jit, 
tmp2); break; case Instr_SQRT: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); + tmp1 = jit_pop(jit); tmp1 = ir_FP2U32(ir_CALL_1(IR_DOUBLE, sqrt_func, ir_INT2D(tmp1))); - jit_push(ctx, cpu, &stack_overflow, tmp1); + jit_push(jit, tmp1); break; case Instr_Pick: - tmp1 = jit_pop(ctx, cpu, &stack_underflow); - tmp1 = jit_pick(ctx, cpu, &stack_bound, tmp1); - jit_push(ctx, cpu, &stack_overflow, tmp1); + tmp1 = jit_pop(jit); + tmp1 = jit_pick(jit, tmp1); + jit_push(jit, tmp1); break; case Instr_Break: - if (ctx->control) { + if (jit->ctx.control) { // JIT: pcpu->state = Cpu_Break; - ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); ir_RETURN(IR_VOID); } i = len; @@ -366,29 +378,28 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { } } - if (stack_overflow) { - ir_MERGE_list(stack_overflow); + if (jit->stack_overflow) { + ir_MERGE_list(jit->stack_overflow); // JIT: printf("Stack overflow\n"); ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Stack overflow\n")); // JIT: pcpu->state = Cpu_Break; - ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); ir_RETURN(IR_VOID); } - - if (stack_underflow) { - ir_MERGE_list(stack_underflow); + if (jit->stack_underflow) { + ir_MERGE_list(jit->stack_underflow); // JIT: printf("Stack overflow\n"); ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Stack underflow\n")); // JIT: pcpu->state = Cpu_Break; - ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); ir_RETURN(IR_VOID); } - if (stack_bound) { - ir_MERGE_list(stack_bound); + if (jit->stack_bound) { + ir_MERGE_list(jit->stack_bound); // JIT: printf("Out of bound picking\n"); ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Stack 
underflow\n")); // JIT: pcpu->state = Cpu_Break; - ir_STORE(ir_ADD_OFFSET(cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); ir_RETURN(IR_VOID); } } @@ -396,27 +407,27 @@ static void jit_program(ir_ctx *ctx, const Instr_t *prog, int len) { int main(int argc, char **argv) { uint64_t steplimit = parse_args(argc, argv); cpu_t cpu = init_cpu(); - ir_ctx ctx; + jit_ctx jit; typedef void (*entry_t)(cpu_t*); entry_t entry; size_t size; - ir_init(&ctx, IR_FUNCTION | IR_OPT_FOLDING | IR_OPT_CFG | IR_OPT_CODEGEN, 256, 1024); + ir_init(&jit.ctx, IR_FUNCTION | IR_OPT_FOLDING | IR_OPT_CFG | IR_OPT_CODEGEN, 256, 1024); - jit_program(&ctx, cpu.pmem, PROGRAM_SIZE); - ir_save(&ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + jit_program(&jit, cpu.pmem, PROGRAM_SIZE); + ir_save(&jit.ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); - entry = (entry_t)ir_jit_compile(&ctx, 2, &size); + entry = (entry_t)ir_jit_compile(&jit.ctx, 2, &size); if (!entry) { printf("Compilation failure\n"); } - ir_save(&ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); - ir_disasm("prog", entry, size, 0, &ctx, stderr); + ir_save(&jit.ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + ir_disasm("prog", entry, size, 0, &jit.ctx, stderr); entry(&cpu); - ir_free(&ctx); + ir_free(&jit.ctx); assert(cpu.state != Cpu_Running || cpu.steps == steplimit); From f7028141293b0315abbd5c689e3d035b5ceafc6b Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 13:08:17 +0300 Subject: [PATCH 04/17] JIT with compile-time stack resolution (-DJIT_RESOLVE_STACK) Stack resolution is incomplete. It doesn't respect the Control Flow Graph yet. 
--- Makefile | 8 +++++++- jited_ir.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b296d08..734e832 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ COMMON_SRC = common.c COMMON_OBJ := $(COMMON_SRC:.c=.o) COMMON_HEADERS = common.h -ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native jited_ir +ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native jited_ir jited_ir_stack # Must be the first target for the magic below to work all: $(ALL) @@ -116,3 +116,9 @@ jited_ir.o: jited_ir.c jited_ir: jited_ir.o $(CC) $^ -lir -lcapstone -lm -o $@ + +jited_ir_stack.o: jited_ir.c + $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -c $< + +jited_ir_stack: jited_ir_stack.o + $(CC) $^ -lir -lcapstone -lm -o $@ diff --git a/jited_ir.c b/jited_ir.c index e5760c2..b1e3ecd 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -75,6 +75,9 @@ typedef struct _jit_label { typedef struct _jit_ctx { ir_ctx ctx; ir_ref cpu; +#ifdef JIT_RESOLVE_STACK + int sp; +#endif ir_ref stack_overflow; ir_ref stack_underflow; ir_ref stack_bound; @@ -84,6 +87,12 @@ typedef struct _jit_ctx { #define _ir_CTX (&jit->ctx) static void jit_push(jit_ctx *jit, ir_ref v) { +#ifdef JIT_RESOLVE_STACK + assert(jit->sp < STACK_CAPACITY - 1); + int sp = ++jit->sp; + // JIT: pcpu->stack[++pcpu->sp] = v; + ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack) + sp * sizeof(uint32_t)), v); +#else // JIT: if (pcpu->sp >= STACK_CAPACITY-1) { ir_ref sp_addr = ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); ir_ref sp = ir_LOAD_I32(sp_addr); @@ -99,9 +108,16 @@ static void jit_push(jit_ctx *jit, ir_ref v) { ir_STORE(sp_addr, sp); ir_STORE(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(sp, ir_CONST_I32(sizeof(uint32_t)))), v); +#endif } static ir_ref jit_pop(jit_ctx *jit) { +#ifdef JIT_RESOLVE_STACK + assert(jit->sp >= 0); + 
int sp = jit->sp--; + //JIT: pcpu->stack[pcpu->sp--]; + return ir_LOAD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack) + sp * sizeof(uint32_t))); +#else // JIT: if (pcpu->sp < 0) { ir_ref sp_addr = ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); ir_ref sp = ir_LOAD_I32(sp_addr); @@ -119,9 +135,22 @@ static ir_ref jit_pop(jit_ctx *jit) { ir_STORE(sp_addr, sp); return ret; +#endif } static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { +#ifdef JIT_RESOLVE_STACK + // JIT: if (pcpu->sp - 1 < pos) { + ir_ref if_out = ir_IF(ir_LT(ir_CONST_U32(jit->sp - 1), pos)); + + ir_IF_TRUE_cold(if_out); + ir_END_list(jit->stack_bound); + + ir_IF_FALSE(if_out); + // JIT: pcpu->stack[pcpu->sp - pos]; + return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), + ir_MUL_I32(ir_SUB_I32(ir_CONST_U32(jit->sp), pos), ir_CONST_I32(sizeof(uint32_t))))); +#else // JIT: if (pcpu->sp - 1 < pos) { ir_ref sp = ir_LOAD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp))); ir_ref if_out = ir_IF(ir_LT(ir_SUB_U32(sp, ir_CONST_U32(1)), pos)); @@ -133,6 +162,7 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { // JIT: pcpu->stack[pcpu->sp - pos]; return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(ir_SUB_I32(sp, pos), ir_CONST_I32(sizeof(uint32_t))))); +#endif } static void jit_goto_backward(jit_ctx *jit, jit_label *label) { @@ -165,6 +195,9 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { ir_ref tmp1, tmp2, tmp3; jit->cpu = ir_PARAM(IR_ADDR, "cpu", 1); +#ifdef JIT_RESOLVE_STACK + jit->sp = 0; +#endif jit->stack_overflow = IR_UNUSED; jit->stack_underflow = IR_UNUSED; jit->stack_bound = IR_UNUSED; From 83ccd448ab39df4ce8b5e5da38a0ece176e7d3ca Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 13:30:39 +0300 Subject: [PATCH 05/17] Fix Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 734e832..5d53325 100644 --- a/Makefile +++ b/Makefile @@ -118,7 
+118,7 @@ jited_ir: jited_ir.o $(CC) $^ -lir -lcapstone -lm -o $@ jited_ir_stack.o: jited_ir.c - $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -c $< + $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -o $@ -c $< jited_ir_stack: jited_ir_stack.o $(CC) $^ -lir -lcapstone -lm -o $@ From 42358726488273b50d18d816ace862456bc6b4f1 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 13:31:34 +0300 Subject: [PATCH 06/17] Fix incorrect types --- jited_ir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jited_ir.c b/jited_ir.c index b1e3ecd..81fb231 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -116,7 +116,7 @@ static ir_ref jit_pop(jit_ctx *jit) { assert(jit->sp >= 0); int sp = jit->sp--; //JIT: pcpu->stack[pcpu->sp--]; - return ir_LOAD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack) + sp * sizeof(uint32_t))); + return ir_LOAD_U32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack) + sp * sizeof(uint32_t))); #else // JIT: if (pcpu->sp < 0) { ir_ref sp_addr = ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); @@ -129,7 +129,7 @@ static ir_ref jit_pop(jit_ctx *jit) { ir_IF_FALSE(if_underflow); //JIT: pcpu->stack[pcpu->sp--]; - ir_ref ret = ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), + ir_ref ret = ir_LOAD_U32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(sp, ir_CONST_I32(sizeof(uint32_t))))); sp = ir_SUB_I32(sp, ir_CONST_I32(1)); ir_STORE(sp_addr, sp); @@ -148,7 +148,7 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { ir_IF_FALSE(if_out); // JIT: pcpu->stack[pcpu->sp - pos]; - return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), + return ir_LOAD_U32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(ir_SUB_I32(ir_CONST_U32(jit->sp), pos), ir_CONST_I32(sizeof(uint32_t))))); #else // JIT: if (pcpu->sp - 1 < pos) { @@ -160,7 +160,7 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { ir_IF_FALSE(if_out); // JIT: 
pcpu->stack[pcpu->sp - pos]; - return ir_LOAD_I32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), + return ir_LOAD_U32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(ir_SUB_I32(sp, pos), ir_CONST_I32(sizeof(uint32_t))))); #endif } From 070b41eb962b63bdc7ec3c0eb1679613ece88e0c Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 14:50:49 +0300 Subject: [PATCH 07/17] Resolve stack with respect to CFG (without bytecode verification) --- jited_ir.c | 97 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/jited_ir.c b/jited_ir.c index 81fb231..05979c0 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -6,10 +6,11 @@ #include #include #include +#include +#include #include "common.h" -#include #include "ir.h" #include "ir_builder.h" @@ -73,14 +74,16 @@ typedef struct _jit_label { } jit_label; typedef struct _jit_ctx { - ir_ctx ctx; - ir_ref cpu; + ir_ctx ctx; + ir_ref cpu; #ifdef JIT_RESOLVE_STACK - int sp; + int sp; + int *bb_sp; /* SP value at start of basic-block */ #endif - ir_ref stack_overflow; - ir_ref stack_underflow; - ir_ref stack_bound; + jit_label *labels; + ir_ref stack_overflow; + ir_ref stack_underflow; + ir_ref stack_bound; } jit_ctx; #undef _ir_CTX @@ -165,18 +168,41 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { #endif } -static void jit_goto_backward(jit_ctx *jit, jit_label *label) { - ir_set_op(_ir_CTX, label->merge, ++label->inputs, ir_END()); +static void jit_goto_backward(jit_ctx *jit, uint32_t target) { + ir_set_op(_ir_CTX, jit->labels[target].merge, ++jit->labels[target].inputs, ir_END()); +#ifdef JIT_RESOLVE_STACK + assert(jit->bb_sp[target] == -1 || jit->bb_sp[target] == jit->sp); + jit->bb_sp[target] = jit->sp; +#endif } -static void jit_goto_forward(jit_ctx *jit, jit_label *label) { - ir_END_list(label->merge); +static void jit_goto_forward(jit_ctx *jit, uint32_t target) { + ir_END_list(jit->labels[target].merge); +#ifdef 
JIT_RESOLVE_STACK + assert(jit->bb_sp[target] == -1 || jit->bb_sp[target] == jit->sp); + jit->bb_sp[target] = jit->sp; +#endif } static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { assert(prog); - jit_label *labels = calloc(len, sizeof(jit_label)); decode_t decoded; + ir_ref tmp1, tmp2, tmp3; + + ir_START(); + jit->cpu = ir_PARAM(IR_ADDR, "cpu", 1); + +#ifdef JIT_RESOLVE_STACK + jit->sp = 0; + jit->bb_sp = malloc(len * sizeof(int)); + memset(jit->bb_sp, -1, len * sizeof(int)); +#endif + + jit->labels = calloc(len, sizeof(jit_label)); + jit->stack_overflow = IR_UNUSED; + jit->stack_underflow = IR_UNUSED; + jit->stack_bound = IR_UNUSED; + /* mark goto targets */ for (int i=0; i < len;) { @@ -186,22 +212,11 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { case Instr_JE: case Instr_JNE: case Instr_Jump: - labels[i + decoded.immediate].inputs++; + jit->labels[i + decoded.immediate].inputs++; break; } } - ir_START(); - ir_ref tmp1, tmp2, tmp3; - - jit->cpu = ir_PARAM(IR_ADDR, "cpu", 1); -#ifdef JIT_RESOLVE_STACK - jit->sp = 0; -#endif - jit->stack_overflow = IR_UNUSED; - jit->stack_underflow = IR_UNUSED; - jit->stack_bound = IR_UNUSED; - ir_ref printf_func = ir_const_func(_ir_CTX, ir_str(_ir_CTX, "printf"), ir_proto_1(_ir_CTX, IR_I32, IR_VARARG_FUNC, IR_ADDR)); ir_ref rand_func = @@ -212,32 +227,36 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { decoded.opcode = Instr_Nop; for (int i=0; i < len;) { - if (labels[i].inputs > 0) { + if (jit->labels[i].inputs > 0) { if (decoded.opcode != Instr_Jump) { - labels[i].inputs++; - jit_goto_forward(jit, &labels[i]); + jit->labels[i].inputs++; + jit_goto_forward(jit, i); } +#ifdef JIT_RESOLVE_STACK + assert(jit->bb_sp[i] != -1); + jit->sp == jit->bb_sp[i]; +#endif assert(!jit->ctx.control); - if (labels[i].inputs == 1) { + if (jit->labels[i].inputs == 1) { tmp1 = ir_emit1(_ir_CTX, IR_BEGIN, IR_UNUSED); } else { - tmp1 = ir_emit_N(_ir_CTX, IR_MERGE, labels[i].inputs); 
+ tmp1 = ir_emit_N(_ir_CTX, IR_MERGE, jit->labels[i].inputs); } tmp2 = 0; - tmp3 = labels[i].merge; - labels[i].merge = jit->ctx.control = tmp1; + tmp3 = jit->labels[i].merge; + jit->labels[i].merge = jit->ctx.control = tmp1; while (tmp3) { /* Store forward GOTOs into MERGE */ tmp2++; - assert(tmp2 <= labels[i].inputs); + assert(tmp2 <= jit->labels[i].inputs); ir_set_op(_ir_CTX, tmp1, tmp2, tmp3); ir_insn *insn = &jit->ctx.ir_base[tmp3]; assert(insn->op == IR_END); tmp3 = insn->op2; insn->op2 = IR_UNUSED; } - labels[i].inputs = tmp2; + jit->labels[i].inputs = tmp2; } decoded = decode_at_address(prog, i); @@ -329,9 +348,9 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp3 = ir_IF(ir_EQ(tmp1, ir_CONST_U32(0))); ir_IF_TRUE(tmp3); if (decoded.immediate >= 0) { - jit_goto_forward(jit, &labels[i + decoded.immediate]); + jit_goto_forward(jit, i + decoded.immediate); } else { - jit_goto_backward(jit, &labels[i + decoded.immediate]); + jit_goto_backward(jit, i + decoded.immediate); } ir_IF_FALSE(tmp3); break; @@ -341,17 +360,17 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp3 = ir_IF(ir_NE(tmp1, ir_CONST_U32(0))); ir_IF_TRUE(tmp3); if (decoded.immediate >= 0) { - jit_goto_forward(jit, &labels[i + decoded.immediate]); + jit_goto_forward(jit, i + decoded.immediate); } else { - jit_goto_backward(jit, &labels[i + decoded.immediate]); + jit_goto_backward(jit, i + decoded.immediate); } ir_IF_FALSE(tmp3); break; case Instr_Jump: if (decoded.immediate >= 0) { - jit_goto_forward(jit, &labels[i + decoded.immediate]); + jit_goto_forward(jit, i + decoded.immediate); } else { - jit_goto_backward(jit, &labels[i + decoded.immediate]); + jit_goto_backward(jit, i + decoded.immediate); } break; case Instr_And: From 7bf6851751796a536f2edc09537731393520f14b Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 15:55:17 +0300 Subject: [PATCH 08/17] Map resolved stack slots to local variables (-DJIT_RESOLVE_STACK 
-DJIT_USE_VARS) --- Makefile | 8 +++- jited_ir.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 119 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 5d53325..a3384ec 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ COMMON_SRC = common.c COMMON_OBJ := $(COMMON_SRC:.c=.o) COMMON_HEADERS = common.h -ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native jited_ir jited_ir_stack +ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native jited_ir jited_ir_stack jited_ir_var # Must be the first target for the magic below to work all: $(ALL) @@ -122,3 +122,9 @@ jited_ir_stack.o: jited_ir.c jited_ir_stack: jited_ir_stack.o $(CC) $^ -lir -lcapstone -lm -o $@ + +jited_ir_var.o: jited_ir.c + $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -DJIT_USE_VARS -O0 -g -o $@ -c $< + +jited_ir_var: jited_ir_var.o + $(CC) $^ -lir -lcapstone -lm -o $@ diff --git a/jited_ir.c b/jited_ir.c index 05979c0..a8d8a05 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -77,6 +77,10 @@ typedef struct _jit_ctx { ir_ctx ctx; ir_ref cpu; #ifdef JIT_RESOLVE_STACK +# ifdef JIT_USE_VARS + int stack_limit; + ir_ref *vars; +#endif int sp; int *bb_sp; /* SP value at start of basic-block */ #endif @@ -91,10 +95,16 @@ typedef struct _jit_ctx { static void jit_push(jit_ctx *jit, ir_ref v) { #ifdef JIT_RESOLVE_STACK - assert(jit->sp < STACK_CAPACITY - 1); int sp = ++jit->sp; +# ifdef JIT_USE_VARS + assert(sp < jit->stack_limit); + // JIT: pcpu->stack[++pcpu->sp] = v; + ir_VSTORE(jit->vars[sp], v); +# else + assert(jit->sp < STACK_CAPACITY); // JIT: pcpu->stack[++pcpu->sp] = v; ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack) + sp * sizeof(uint32_t)), v); +# endif #else // JIT: if (pcpu->sp >= STACK_CAPACITY-1) { ir_ref sp_addr = ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); @@ -116,10 +126,15 @@ static void jit_push(jit_ctx *jit, ir_ref v) { static 
ir_ref jit_pop(jit_ctx *jit) { #ifdef JIT_RESOLVE_STACK - assert(jit->sp >= 0); int sp = jit->sp--; + assert(sp >= 0); +# ifdef JIT_USE_VARS + //JIT: pcpu->stack[pcpu->sp--]; + return ir_VLOAD_U32(jit->vars[sp]); +# else //JIT: pcpu->stack[pcpu->sp--]; return ir_LOAD_U32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack) + sp * sizeof(uint32_t))); +# endif #else // JIT: if (pcpu->sp < 0) { ir_ref sp_addr = ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp)); @@ -143,6 +158,12 @@ static ir_ref jit_pop(jit_ctx *jit) { static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { #ifdef JIT_RESOLVE_STACK +# ifdef JIT_USE_VARS + assert(IR_IS_CONST_REF(pos)); + int sp = jit->ctx.ir_base[pos].val.i32; + assert(sp >= 0 && sp < jit->stack_limit); + return ir_VLOAD_U32(jit->vars[sp]); +# else // JIT: if (pcpu->sp - 1 < pos) { ir_ref if_out = ir_IF(ir_LT(ir_CONST_U32(jit->sp - 1), pos)); @@ -153,6 +174,7 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { // JIT: pcpu->stack[pcpu->sp - pos]; return ir_LOAD_U32(ir_ADD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, stack)), ir_MUL_I32(ir_SUB_I32(ir_CONST_U32(jit->sp), pos), ir_CONST_I32(sizeof(uint32_t))))); +# endif #else // JIT: if (pcpu->sp - 1 < pos) { ir_ref sp = ir_LOAD_I32(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, sp))); @@ -192,18 +214,11 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { ir_START(); jit->cpu = ir_PARAM(IR_ADDR, "cpu", 1); -#ifdef JIT_RESOLVE_STACK - jit->sp = 0; - jit->bb_sp = malloc(len * sizeof(int)); - memset(jit->bb_sp, -1, len * sizeof(int)); -#endif - jit->labels = calloc(len, sizeof(jit_label)); jit->stack_overflow = IR_UNUSED; jit->stack_underflow = IR_UNUSED; jit->stack_bound = IR_UNUSED; - /* mark goto targets */ for (int i=0; i < len;) { decoded = decode_at_address(prog, i); @@ -214,9 +229,97 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { case Instr_Jump: jit->labels[i + decoded.immediate].inputs++; break; + case Instr_Break: + i = len; + break; + } + } + +#ifdef 
JIT_RESOLVE_STACK + jit->sp = -1; + jit->bb_sp = malloc(len * sizeof(int)); + memset(jit->bb_sp, -1, len * sizeof(int)); + +# ifdef JIT_USE_VARS + /* calculate stack_limit */ + jit->stack_limit = 0; + decoded.opcode = Instr_Nop; + + for (int i=0; i < len;) { + if (jit->labels[i].inputs > 0) { + if (decoded.opcode != Instr_Jump) { + assert(jit->bb_sp[i] == -1 || jit->bb_sp[i] == jit->sp); + jit->bb_sp[i] = jit->sp; + } + assert(jit->bb_sp[i] != -1); + jit->sp == jit->bb_sp[i]; + } + + decoded = decode_at_address(prog, i); + i += decoded.length; + + switch(decoded.opcode) { + case Instr_Nop: + case Instr_Halt: + case Instr_Swap: + case Instr_Inc: + case Instr_Dec: + case Instr_Rot: + case Instr_SQRT: + case Instr_Pick: + /* Do nothing */ + break; + case Instr_Push: + case Instr_Dup: + case Instr_Over: + case Instr_Rand: + jit->sp++; + if (jit->sp >= jit->stack_limit) { + jit->stack_limit = jit->sp + 1; + } + break; + case Instr_Print: + case Instr_Add: + case Instr_Sub: + case Instr_Mod: + case Instr_Mul: + case Instr_Drop: + case Instr_And: + case Instr_Or: + case Instr_Xor: + case Instr_SHL: + case Instr_SHR: + jit->sp--; + break; + case Instr_JE: + case Instr_JNE: + jit->sp--; + assert(jit->bb_sp[i + decoded.immediate] == -1 || jit->bb_sp[i + decoded.immediate] == jit->sp); + jit->bb_sp[i + decoded.immediate] = jit->sp; + break; + case Instr_Jump: + assert(jit->bb_sp[i + decoded.immediate] == -1 || jit->bb_sp[i + decoded.immediate] == jit->sp); + jit->bb_sp[i + decoded.immediate] = jit->sp; + break; + case Instr_Break: + i = len; + break; + default: + assert(0 && "Unsupported instruction"); + break; } } + jit->sp = -1; + jit->vars = malloc(jit->stack_limit * sizeof(ir_ref)); + for (int i = 0; i < jit->stack_limit; i++) { + char s[16]; + sprintf(s, "t%d", i); + jit->vars[i] = ir_var(_ir_CTX, IR_U32, 1, s); + } +# endif +#endif + ir_ref printf_func = ir_const_func(_ir_CTX, ir_str(_ir_CTX, "printf"), ir_proto_1(_ir_CTX, IR_I32, IR_VARARG_FUNC, IR_ADDR)); ir_ref 
rand_func = From a8a6bd623715026e6cbed3f0d275e4532d3d4377 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 14 Nov 2024 15:59:39 +0300 Subject: [PATCH 09/17] Remove "-g" --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a3384ec..0857499 100644 --- a/Makefile +++ b/Makefile @@ -124,7 +124,7 @@ jited_ir_stack: jited_ir_stack.o $(CC) $^ -lir -lcapstone -lm -o $@ jited_ir_var.o: jited_ir.c - $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -DJIT_USE_VARS -O0 -g -o $@ -c $< + $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -DJIT_USE_VARS -o $@ -c $< jited_ir_var: jited_ir_var.o $(CC) $^ -lir -lcapstone -lm -o $@ From c701de7c02dda581b921c2f72e2c9672611c9dda Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Fri, 15 Nov 2024 03:33:24 +0300 Subject: [PATCH 10/17] Add PHP version of the benchmarks (PHP VM is 30% faster than asmopt) native 0.601 asmopt 2.986 predecoded 21.943 subroutined 15.425 switched 24.787 tailrecursive 14.670 threaded 25.444 threaded-cached 18.190 translated 20.212 php -d opcache.enable=0 2.188 php -d opcache.jit=0 1.968 php -d opcache.jit=function 1.005 php -d opcache.jit=tracing 1.011 jited_ir 6.277 jited_ir_stack 0.616 jited_ir_var 0.613 --- primes.php | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 primes.php diff --git a/primes.php b/primes.php new file mode 100644 index 0000000..31c2430 --- /dev/null +++ b/primes.php @@ -0,0 +1,16 @@ + Date: Fri, 15 Nov 2024 04:27:42 +0300 Subject: [PATCH 11/17] Fix indentation --- jited_ir.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/jited_ir.c b/jited_ir.c index a8d8a05..94bd439 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -241,8 +241,8 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { memset(jit->bb_sp, -1, len * sizeof(int)); # ifdef JIT_USE_VARS - /* calculate stack_limit */ - jit->stack_limit = 0; + /* calculate stack_limit */ + 
jit->stack_limit = 0; decoded.opcode = Instr_Nop; for (int i=0; i < len;) { @@ -311,12 +311,12 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { } jit->sp = -1; - jit->vars = malloc(jit->stack_limit * sizeof(ir_ref)); - for (int i = 0; i < jit->stack_limit; i++) { - char s[16]; - sprintf(s, "t%d", i); - jit->vars[i] = ir_var(_ir_CTX, IR_U32, 1, s); - } + jit->vars = malloc(jit->stack_limit * sizeof(ir_ref)); + for (int i = 0; i < jit->stack_limit; i++) { + char s[16]; + sprintf(s, "t%d", i); + jit->vars[i] = ir_var(_ir_CTX, IR_U32, 1, s); + } # endif #endif From dad76de36e57b99dc2fe20372e86b728b67c9ca9 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Fri, 15 Nov 2024 04:28:36 +0300 Subject: [PATCH 12/17] Fix indentation --- jited_ir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jited_ir.c b/jited_ir.c index 94bd439..543332f 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -74,8 +74,8 @@ typedef struct _jit_label { } jit_label; typedef struct _jit_ctx { - ir_ctx ctx; - ir_ref cpu; + ir_ctx ctx; + ir_ref cpu; #ifdef JIT_RESOLVE_STACK # ifdef JIT_USE_VARS int stack_limit; From 10554f05bf94469aa9f15b9045a91b032501709d Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Fri, 15 Nov 2024 12:02:06 +0300 Subject: [PATCH 13/17] Dump IR and disassembly only if jited_ir* are run with --debug option --- common.c | 4 ++++ common.h | 2 ++ jited_ir.c | 11 ++++++++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/common.c b/common.c index 85d4d50..f768274 100644 --- a/common.c +++ b/common.c @@ -35,6 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "common.h" +int debug = 0; + /* Program to print all prime numbers < 10000 */ const Instr_t Primes[PROGRAM_SIZE] = { Instr_Push, 100000, // nmax (maximal number to test) @@ -198,6 +200,8 @@ uint64_t parse_args(int argc, char** argv) { for (int i = 1; i < argc; ++i) { if (!strcmp(argv[i], "--help")) report_usage_and_exit(argv[0], 0); + if (!strcmp(argv[i], "--debug")) + debug = 1; else if (!strncmp(argv[i], steplimit_opt, strlen(steplimit_opt))) { char *endptr = NULL; steplimit = strtoll(argv[i] + strlen(steplimit_opt), &endptr, 10); diff --git a/common.h b/common.h index 05c1392..daa4605 100644 --- a/common.h +++ b/common.h @@ -33,6 +33,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef COMMON_H_ #define COMMON_H_ +extern int debug; + /* Instruction Set Architecture: opcodes and arguments for individual instructions. Those marked with "imm" use the next machine word diff --git a/jited_ir.c b/jited_ir.c index 543332f..0fca37e 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -570,15 +570,20 @@ int main(int argc, char **argv) { ir_init(&jit.ctx, IR_FUNCTION | IR_OPT_FOLDING | IR_OPT_CFG | IR_OPT_CODEGEN, 256, 1024); jit_program(&jit, cpu.pmem, PROGRAM_SIZE); - ir_save(&jit.ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + + if (debug) { + ir_save(&jit.ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + } entry = (entry_t)ir_jit_compile(&jit.ctx, 2, &size); if (!entry) { printf("Compilation failure\n"); } - ir_save(&jit.ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); - ir_disasm("prog", entry, size, 0, &jit.ctx, stderr); + if (debug) { + ir_save(&jit.ctx, IR_SAVE_CFG | IR_SAVE_RULES | IR_SAVE_REGS, stderr); + ir_disasm("prog", entry, size, 0, &jit.ctx, stderr); + } entry(&cpu); From ba5081950f823946d807d7e0e24b88f782357fa5 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Tue, 19 Nov 2024 12:56:38 +0300 Subject: [PATCH 14/17] Construct SSA for resolved stack slots (-DJIT_RESOLVE_STACK -DJIT_USE_SSA) 
--- Makefile | 9 +- jited_ir.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 224 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 0857499..4a57331 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,8 @@ COMMON_SRC = common.c COMMON_OBJ := $(COMMON_SRC:.c=.o) COMMON_HEADERS = common.h -ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native jited_ir jited_ir_stack jited_ir_var +ALL = switched threaded predecoded subroutined threaded-cached tailrecursive asmopt translated native \ + jited_ir jited_ir_stack jited_ir_var jited_ir_ssa # Must be the first target for the magic below to work all: $(ALL) @@ -128,3 +129,9 @@ jited_ir_var.o: jited_ir.c jited_ir_var: jited_ir_var.o $(CC) $^ -lir -lcapstone -lm -o $@ + +jited_ir_ssa.o: jited_ir.c + $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -DJIT_USE_SSA -o $@ -c $< + +jited_ir_ssa: jited_ir_ssa.o + $(CC) $^ -lir -lcapstone -lm -o $@ diff --git a/jited_ir.c b/jited_ir.c index 0fca37e..cdf76e6 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -71,16 +71,25 @@ static inline decode_t decode_at_address(const Instr_t* prog, uint32_t addr) { typedef struct _jit_label { ir_ref inputs; /* number of input edges */ ir_ref merge; /* reference of MERGE or "list" of forward inputs */ +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + int b; +#endif } jit_label; typedef struct _jit_ctx { ir_ctx ctx; ir_ref cpu; #ifdef JIT_RESOLVE_STACK -# ifdef JIT_USE_VARS +# if defined(JIT_USE_SSA) + int b; /* current block */ + int blocks_count; + int stack_limit; + ir_ref *ssa_vars; + ir_ref *incomplete_phis; +# elif defined(JIT_USE_VARS) int stack_limit; ir_ref *vars; -#endif +# endif int sp; int *bb_sp; /* SP value at start of basic-block */ #endif @@ -93,10 +102,123 @@ typedef struct _jit_ctx { #undef _ir_CTX #define _ir_CTX (&jit->ctx) +#ifdef JIT_USE_SSA +static ir_ref jit_ssa_get_var(jit_ctx *jit, int b, int var, ir_ref control); + 
+static ir_ref jit_ssa_try_remove_trivial_phi(jit_ctx *jit, ir_ref phi) { + ir_ref i, n = jit->ctx.ir_base[phi].inputs_count; + ir_ref same, op; + + assert(n > 2); + same = ir_get_op(_ir_CTX, phi, 2); + for (i = 3; i <= n; i++) { + op = ir_get_op(_ir_CTX, phi, i); + if (op != same && op != phi) { + return phi; + } + } + + // Remember all users except the phi itself + // users = phi.users.remove(phi) + // Reroute all uses of phi to same and remove phi + // phi.replaceBy(same) + // Try to recursively remove all phi users, which might have become trivial + // for use in users: f use is a Phi: tryRemoveTrivialPhi(use) + + return same; +} + +static void jit_ssa_set_var(jit_ctx *jit, int b, int var, ir_ref val) { + jit->ssa_vars[var * jit->blocks_count + b] = val; +} + +static ir_ref jit_ssa_get_var(jit_ctx *jit, int b, int var, ir_ref control) { + ir_ref val = jit->ssa_vars[var * jit->blocks_count + b]; + ir_ref ref; + ir_insn *insn; + + if (val) { + return val; + } + + ref = control; + assert(ref); + insn = &jit->ctx.ir_base[ref]; + + /* go up to the start of basic-block through control links */ + while (insn->op < IR_START || insn->op > IR_LOOP_BEGIN) { + ref = insn->op1; + insn = &jit->ctx.ir_base[ref]; + } + + assert(insn->op != IR_START); + if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) { + bool incomplete = 0; + uint32_t i, n = insn->inputs_count; + val = ir_emit_N(_ir_CTX, IR_OPT(IR_PHI, IR_U32), n + 1); + ir_set_op(_ir_CTX, val, 1, ref); + jit->ssa_vars[var * jit->blocks_count + b] = val; + for (i = 1; i <= n; i++) { + ir_ref end = ir_get_op(_ir_CTX, ref, i); + if (end) { + ir_insn *end_insn = &jit->ctx.ir_base[end]; + assert(end_insn->op >= IR_END && end_insn->op <= IR_SWITCH); + assert(end_insn->op3 >= 1000); + ir_ref op = jit_ssa_get_var(jit, end_insn->op3 - 1000, var, end); + ir_set_op(_ir_CTX, val, i + 1, op); + } else { + incomplete = 1; + } + } + if (incomplete) { + jit->incomplete_phis[var * jit->blocks_count + b] = val; + } else { + val = 
jit_ssa_try_remove_trivial_phi(jit, val); + } + } else { + ir_ref end = insn->op1; + assert(end); + ir_insn *end_insn = &jit->ctx.ir_base[end]; + assert(end_insn->op >= IR_END && end_insn->op <= IR_SWITCH); + assert(end_insn->op3 >= 1000); + val = jit_ssa_get_var(jit, end_insn->op3 - 1000, var, end); + } + jit->ssa_vars[var * jit->blocks_count + b] = val; + return val; +} + +static void jit_ssa_fix_incomplete_phis(jit_ctx *jit, uint32_t target) +{ + int dst_block = jit->labels[target].b; + int var; + + for (var = 0; var < jit->stack_limit; var++) { + ir_ref phi = jit->incomplete_phis[var * jit->blocks_count + dst_block]; + if (phi) { + ir_ref val = jit_ssa_get_var(jit, jit->b, var, jit->ctx.control); + ir_set_op(_ir_CTX, phi, jit->labels[target].inputs + 2, val); + } + } +} + +static void jit_ssa_end_block(jit_ctx *jit) { + ir_ref end = jit->ctx.insns_count - 1; + ir_insn *insn = &jit->ctx.ir_base[end]; + assert(insn->op >= IR_END && insn->op <= IR_SWITCH); + /* Use END->op3 to store the corresponding BB index */ + insn->op3 = 1000 + jit->b; +} + +#endif + static void jit_push(jit_ctx *jit, ir_ref v) { #ifdef JIT_RESOLVE_STACK int sp = ++jit->sp; -# ifdef JIT_USE_VARS +# ifdef JIT_USE_SSA + assert(sp < jit->stack_limit); + // JIT: pcpu->stack[++pcpu->sp] = v; + jit_ssa_set_var(jit, jit->b, sp, v); +# elif defined(JIT_USE_VARS) assert(sp < jit->stack_limit); // JIT: pcpu->stack[++pcpu->sp] = v; ir_VSTORE(jit->vars[sp], v); @@ -128,7 +250,10 @@ static ir_ref jit_pop(jit_ctx *jit) { #ifdef JIT_RESOLVE_STACK int sp = jit->sp--; assert(sp >= 0); -# ifdef JIT_USE_VARS +# ifdef JIT_USE_SSA + // JIT: pcpu->stack[++pcpu->sp] = v; + return jit_ssa_get_var(jit, jit->b, sp, jit->ctx.control); +# elif defined(JIT_USE_VARS) //JIT: pcpu->stack[pcpu->sp--]; return ir_VLOAD_U32(jit->vars[sp]); # else @@ -158,7 +283,12 @@ static ir_ref jit_pop(jit_ctx *jit) { static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { #ifdef JIT_RESOLVE_STACK -# ifdef JIT_USE_VARS +# ifdef JIT_USE_SSA + 
assert(IR_IS_CONST_REF(pos)); + int sp = jit->ctx.ir_base[pos].val.i32; + assert(sp >= 0 && sp < jit->stack_limit); + return jit_ssa_get_var(jit, jit->b, sp, jit->ctx.control); +# elif defined(JIT_USE_VARS) assert(IR_IS_CONST_REF(pos)); int sp = jit->ctx.ir_base[pos].val.i32; assert(sp >= 0 && sp < jit->stack_limit); @@ -191,10 +321,16 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { } static void jit_goto_backward(jit_ctx *jit, uint32_t target) { +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + jit_ssa_fix_incomplete_phis(jit, target); +#endif ir_set_op(_ir_CTX, jit->labels[target].merge, ++jit->labels[target].inputs, ir_END()); #ifdef JIT_RESOLVE_STACK assert(jit->bb_sp[target] == -1 || jit->bb_sp[target] == jit->sp); jit->bb_sp[target] = jit->sp; +# ifdef JIT_USE_SSA + jit_ssa_end_block(jit); +# endif #endif } @@ -203,6 +339,9 @@ static void jit_goto_forward(jit_ctx *jit, uint32_t target) { #ifdef JIT_RESOLVE_STACK assert(jit->bb_sp[target] == -1 || jit->bb_sp[target] == jit->sp); jit->bb_sp[target] = jit->sp; +# ifdef JIT_USE_SSA + jit_ssa_end_block(jit); +# endif #endif } @@ -226,6 +365,17 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { switch(decoded.opcode) { case Instr_JE: case Instr_JNE: +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + if (!jit->labels[i + decoded.immediate].inputs && i + decoded.immediate != 0) { + jit->blocks_count++; + } + if (!jit->labels[i].inputs) { + jit->blocks_count++; + } +#endif + jit->labels[i + decoded.immediate].inputs++; + jit->labels[i].inputs++; + break; case Instr_Jump: jit->labels[i + decoded.immediate].inputs++; break; @@ -240,11 +390,16 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { jit->bb_sp = malloc(len * sizeof(int)); memset(jit->bb_sp, -1, len * sizeof(int)); -# ifdef JIT_USE_VARS +# if defined(JIT_USE_SSA) || defined(JIT_USE_VARS) /* calculate stack_limit */ jit->stack_limit = 0; decoded.opcode = Instr_Nop; +# ifdef JIT_USE_SSA + 
jit->blocks_count = 1; + jit->b = 0; +# endif + for (int i=0; i < len;) { if (jit->labels[i].inputs > 0) { if (decoded.opcode != Instr_Jump) { @@ -253,6 +408,11 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { } assert(jit->bb_sp[i] != -1); jit->sp == jit->bb_sp[i]; +# ifdef JIT_USE_SSA + if (i != 0) { + jit->blocks_count++; + } +# endif } decoded = decode_at_address(prog, i); @@ -311,12 +471,17 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { } jit->sp = -1; +# ifdef JIT_USE_SSA + jit->ssa_vars = calloc(jit->stack_limit * jit->blocks_count, sizeof(ir_ref)); + jit->incomplete_phis = calloc(jit->stack_limit * jit->blocks_count, sizeof(ir_ref)); +# else jit->vars = malloc(jit->stack_limit * sizeof(ir_ref)); for (int i = 0; i < jit->stack_limit; i++) { char s[16]; sprintf(s, "t%d", i); jit->vars[i] = ir_var(_ir_CTX, IR_U32, 1, s); } +# endif # endif #endif @@ -332,34 +497,50 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { for (int i=0; i < len;) { if (jit->labels[i].inputs > 0) { if (decoded.opcode != Instr_Jump) { - jit->labels[i].inputs++; + if (decoded.opcode != Instr_JE + && decoded.opcode != Instr_JNE) { + jit->labels[i].inputs++; + } jit_goto_forward(jit, i); } -#ifdef JIT_RESOLVE_STACK - assert(jit->bb_sp[i] != -1); - jit->sp == jit->bb_sp[i]; -#endif assert(!jit->ctx.control); if (jit->labels[i].inputs == 1) { - tmp1 = ir_emit1(_ir_CTX, IR_BEGIN, IR_UNUSED); + tmp3 = jit->labels[i].merge; + assert(tmp3); + ir_insn *insn = &jit->ctx.ir_base[tmp3]; + assert(insn->op == IR_END && !insn->op2); + insn->op2 = IR_UNUSED; + ir_BEGIN(tmp3); + jit->labels[i].merge = IR_UNUSED; } else { tmp1 = ir_emit_N(_ir_CTX, IR_MERGE, jit->labels[i].inputs); + tmp2 = 0; + tmp3 = jit->labels[i].merge; + jit->labels[i].merge = jit->ctx.control = tmp1; + + while (tmp3) { + /* Store forward GOTOs into MERGE */ + tmp2++; + assert(tmp2 <= jit->labels[i].inputs); + ir_set_op(_ir_CTX, tmp1, tmp2, tmp3); + ir_insn *insn 
= &jit->ctx.ir_base[tmp3]; + assert(insn->op == IR_END); + tmp3 = insn->op2; + insn->op2 = IR_UNUSED; + } + jit->labels[i].inputs = tmp2; } - tmp2 = 0; - tmp3 = jit->labels[i].merge; - jit->labels[i].merge = jit->ctx.control = tmp1; - - while (tmp3) { - /* Store forward GOTOs into MERGE */ - tmp2++; - assert(tmp2 <= jit->labels[i].inputs); - ir_set_op(_ir_CTX, tmp1, tmp2, tmp3); - ir_insn *insn = &jit->ctx.ir_base[tmp3]; - assert(insn->op == IR_END); - tmp3 = insn->op2; - insn->op2 = IR_UNUSED; + +#ifdef JIT_RESOLVE_STACK + assert(jit->bb_sp[i] != -1); + jit->sp == jit->bb_sp[i]; +# ifdef JIT_USE_SSA + if (i != 0) { + jit->b++; } - jit->labels[i].inputs = tmp2; + jit->labels[i].b = jit->b; +# endif +#endif } decoded = decode_at_address(prog, i); @@ -418,7 +599,9 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp2 = jit_pop(jit); // JIT if (tmp2 == 0) tmp3 = ir_IF(ir_EQ(tmp2, ir_CONST_U32(0))); - +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + jit_ssa_end_block(jit); +#endif ir_IF_TRUE_cold(tmp3); // JIT: printf("Division by zero\n"); ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Division by zero\n")); @@ -449,6 +632,9 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp1 = jit_pop(jit); // JIT: if (tmp1 == 0) tmp3 = ir_IF(ir_EQ(tmp1, ir_CONST_U32(0))); +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + jit_ssa_end_block(jit); +#endif ir_IF_TRUE(tmp3); if (decoded.immediate >= 0) { jit_goto_forward(jit, i + decoded.immediate); @@ -461,6 +647,9 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp1 = jit_pop(jit); // JIT: if (tmp1 == 0) tmp3 = ir_IF(ir_NE(tmp1, ir_CONST_U32(0))); +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + jit_ssa_end_block(jit); +#endif ir_IF_TRUE(tmp3); if (decoded.immediate >= 0) { jit_goto_forward(jit, i + decoded.immediate); From 240b44c83d2a5fbfcc4914305e9c68a3038ba6d0 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 28 Nov 2024 
12:52:52 +0300 Subject: [PATCH 15/17] Use LDFLAGS --- Makefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 4a57331..17c982b 100644 --- a/Makefile +++ b/Makefile @@ -44,46 +44,46 @@ $(ALL): $(COMMON_OBJ) # Note that some of them use customized CFLAGS switched: switched.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ threaded: CFLAGS += -fno-gcse -fno-function-cse -fno-thread-jumps -fno-cse-follow-jumps -fno-crossjumping -fno-cse-skip-blocks -fomit-frame-pointer threaded: threaded.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ predecoded: predecoded.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ tailrecursive: CFLAGS += -foptimize-sibling-calls tailrecursive: tailrecursive.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ asmoptll: asmoptll.o $(CC) -g -pg -c $< -o $@ asmopt: CFLAGS += -foptimize-sibling-calls asmopt: asmoptll.o asmopt.o - $(CC) -g -pg $^ -lm -o $@ + $(CC) -g -pg $^ $(LDFLAGS) -o $@ prof: gprof -b asmopt gmon.out threaded-cached: CFLAGS += -fno-gcse -fno-thread-jumps -fno-cse-follow-jumps -fno-crossjumping -fno-cse-skip-blocks -fomit-frame-pointer threaded-cached: threaded-cached.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ subroutined: subroutined.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ translated: CFLAGS += -std=gnu11 translated: translated.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ translated-inline: CFLAGS += -std=gnu11 translated-inline: translated-inline.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ native: native.o - $(CC) $^ -lm -o $@ + $(CC) $^ $(LDFLAGS) -o $@ ######################## ### Maintainance targets @@ -116,22 +116,22 @@ jited_ir.o: jited_ir.c $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -c $< jited_ir: jited_ir.o - $(CC) $^ -lir -lcapstone -lm -o $@ + $(CC) $^ -lir -lcapstone $(LDFLAGS) -o $@ jited_ir_stack.o: jited_ir.c $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -o $@ -c $< jited_ir_stack: 
jited_ir_stack.o - $(CC) $^ -lir -lcapstone -lm -o $@ + $(CC) $^ -lir -lcapstone $(LDFLAGS) -o $@ jited_ir_var.o: jited_ir.c $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -DJIT_USE_VARS -o $@ -c $< jited_ir_var: jited_ir_var.o - $(CC) $^ -lir -lcapstone -lm -o $@ + $(CC) $^ -lir -lcapstone $(LDFLAGS) -o $@ jited_ir_ssa.o: jited_ir.c $(CC) $(DEPFLAGS) $(CFLAGS) $(CPPFLAGS) -DJIT_RESOLVE_STACK -DJIT_USE_SSA -o $@ -c $< jited_ir_ssa: jited_ir_ssa.o - $(CC) $^ -lir -lcapstone -lm -o $@ + $(CC) $^ -lir -lcapstone $(LDFLAGS) -o $@ From c57d5ef2d37eb5f9096926d5644be36ec713d6cc Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 28 Nov 2024 12:53:25 +0300 Subject: [PATCH 16/17] Clean memory --- jited_ir.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/jited_ir.c b/jited_ir.c index cdf76e6..b66246b 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -746,6 +746,17 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { ir_STORE(ir_ADD_OFFSET(jit->cpu, offsetof(cpu_t, state)), ir_CONST_I32(Cpu_Break)); ir_RETURN(IR_VOID); } + +#ifdef JIT_RESOLVE_STACK +# if defined(JIT_USE_SSA) + free(jit->ssa_vars); + free(jit->incomplete_phis); +# elif defined(JIT_USE_SSA) + free(jit->vars); +# endif + free(jit->bb_sp); +#endif + free(jit->labels); } int main(int argc, char **argv) { From b617707ad90b30708a428c36fd6795ae1b62d502 Mon Sep 17 00:00:00 2001 From: Dmitry Stogov Date: Thu, 19 Dec 2024 19:52:56 +0300 Subject: [PATCH 17/17] Use MEM2SSA pass for SSA construction --- Makefile | 2 +- jited_ir.c | 199 ++++------------------------------------------------- 2 files changed, 13 insertions(+), 188 deletions(-) diff --git a/Makefile b/Makefile index 17c982b..194b7a6 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ # Copyright (c) 2015, 2016 Grigory Rechistov. All rights reserved. 
# -CFLAGS=-std=c11 -O2 -Wextra -Werror -gdwarf-3 +CFLAGS=-std=c11 -O0 -g -Wextra -Werror -gdwarf-3 LDFLAGS = -lm COMMON_SRC = common.c diff --git a/jited_ir.c b/jited_ir.c index b66246b..0b7d6ca 100644 --- a/jited_ir.c +++ b/jited_ir.c @@ -71,22 +71,13 @@ static inline decode_t decode_at_address(const Instr_t* prog, uint32_t addr) { typedef struct _jit_label { ir_ref inputs; /* number of input edges */ ir_ref merge; /* reference of MERGE or "list" of forward inputs */ -#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) - int b; -#endif } jit_label; typedef struct _jit_ctx { ir_ctx ctx; ir_ref cpu; #ifdef JIT_RESOLVE_STACK -# if defined(JIT_USE_SSA) - int b; /* current block */ - int blocks_count; - int stack_limit; - ir_ref *ssa_vars; - ir_ref *incomplete_phis; -# elif defined(JIT_USE_VARS) +# if defined(JIT_USE_VARS) || defined(JIT_USE_SSA) int stack_limit; ir_ref *vars; # endif @@ -102,123 +93,10 @@ typedef struct _jit_ctx { #undef _ir_CTX #define _ir_CTX (&jit->ctx) -#ifdef JIT_USE_SSA -static ir_ref jit_ssa_get_var(jit_ctx *jit, int b, int var, ir_ref control); - -static ir_ref jit_ssa_try_remove_trivial_phi(jit_ctx *jit, ir_ref phi) { - ir_ref i, n = jit->ctx.ir_base[phi].inputs_count; - ir_ref same, op; - - assert(n > 2); - same = ir_get_op(_ir_CTX, phi, 2); - for (i = 3; i <= n; i++) { - op = ir_get_op(_ir_CTX, phi, i); - if (op != same && op != phi) { - return phi; - } - } - - // Remember all users except the phi itself - // users = phi.users.remove(phi) - // Reroute all uses of phi to same and remove phi - // phi.replaceBy(same) - // Try to recursively remove all phi users, which might have become trivial - // for use in users: f use is a Phi: tryRemoveTrivialPhi(use) - - return same; -} - -static void jit_ssa_set_var(jit_ctx *jit, int b, int var, ir_ref val) { - jit->ssa_vars[var * jit->blocks_count + b] = val; -} - -static ir_ref jit_ssa_get_var(jit_ctx *jit, int b, int var, ir_ref control) { - ir_ref val = jit->ssa_vars[var * jit->blocks_count + b]; 
- ir_ref ref; - ir_insn *insn; - - if (val) { - return val; - } - - ref = control; - assert(ref); - insn = &jit->ctx.ir_base[ref]; - - /* go up to the start of basic-block through control links */ - while (insn->op < IR_START || insn->op > IR_LOOP_BEGIN) { - ref = insn->op1; - insn = &jit->ctx.ir_base[ref]; - } - - assert(insn->op != IR_START); - if (insn->op == IR_MERGE || insn->op == IR_LOOP_BEGIN) { - bool incomplete = 0; - uint32_t i, n = insn->inputs_count; - val = ir_emit_N(_ir_CTX, IR_OPT(IR_PHI, IR_U32), n + 1); - ir_set_op(_ir_CTX, val, 1, ref); - jit->ssa_vars[var * jit->blocks_count + b] = val; - for (i = 1; i <= n; i++) { - ir_ref end = ir_get_op(_ir_CTX, ref, i); - if (end) { - ir_insn *end_insn = &jit->ctx.ir_base[end]; - assert(end_insn->op >= IR_END && end_insn->op <= IR_SWITCH); - assert(end_insn->op3 >= 1000); - ir_ref op = jit_ssa_get_var(jit, end_insn->op3 - 1000, var, end); - ir_set_op(_ir_CTX, val, i + 1, op); - } else { - incomplete = 1; - } - } - if (incomplete) { - jit->incomplete_phis[var * jit->blocks_count + b] = val; - } else { - val = jit_ssa_try_remove_trivial_phi(jit, val); - } - } else { - ir_ref end = insn->op1; - assert(end); - ir_insn *end_insn = &jit->ctx.ir_base[end]; - assert(end_insn->op >= IR_END && end_insn->op <= IR_SWITCH); - assert(end_insn->op3 >= 1000); - val = jit_ssa_get_var(jit, end_insn->op3 - 1000, var, end); - } - jit->ssa_vars[var * jit->blocks_count + b] = val; - return val; -} - -static void jit_ssa_fix_incomplete_phis(jit_ctx *jit, uint32_t target) -{ - int dst_block = jit->labels[target].b; - int var; - - for (var = 0; var < jit->stack_limit; var++) { - ir_ref phi = jit->incomplete_phis[var * jit->blocks_count + dst_block]; - if (phi) { - ir_ref val = jit_ssa_get_var(jit, jit->b, var, jit->ctx.control); - ir_set_op(_ir_CTX, phi, jit->labels[target].inputs + 2, val); - } - } -} - -static void jit_ssa_end_block(jit_ctx *jit) { - ir_ref end = jit->ctx.insns_count - 1; - ir_insn *insn = &jit->ctx.ir_base[end]; - 
assert(insn->op >= IR_END && insn->op <= IR_SWITCH); - /* Use END->op3 to store the corresponding BB index */ - insn->op3 = 1000 + jit->b; -} - -#endif - static void jit_push(jit_ctx *jit, ir_ref v) { #ifdef JIT_RESOLVE_STACK int sp = ++jit->sp; -# ifdef JIT_USE_SSA - assert(sp < jit->stack_limit); - // JIT: pcpu->stack[++pcpu->sp] = v; - jit_ssa_set_var(jit, jit->b, sp, v); -# elif defined(JIT_USE_VARS) +# if defined(JIT_USE_VARS) || defined(JIT_USE_SSA) assert(sp < jit->stack_limit); // JIT: pcpu->stack[++pcpu->sp] = v; ir_VSTORE(jit->vars[sp], v); @@ -250,10 +128,7 @@ static ir_ref jit_pop(jit_ctx *jit) { #ifdef JIT_RESOLVE_STACK int sp = jit->sp--; assert(sp >= 0); -# ifdef JIT_USE_SSA - // JIT: pcpu->stack[++pcpu->sp] = v; - return jit_ssa_get_var(jit, jit->b, sp, jit->ctx.control); -# elif defined(JIT_USE_VARS) +# if defined(JIT_USE_VARS) || defined(JIT_USE_SSA) //JIT: pcpu->stack[pcpu->sp--]; return ir_VLOAD_U32(jit->vars[sp]); # else @@ -283,12 +158,7 @@ static ir_ref jit_pop(jit_ctx *jit) { static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { #ifdef JIT_RESOLVE_STACK -# ifdef JIT_USE_SSA - assert(IR_IS_CONST_REF(pos)); - int sp = jit->ctx.ir_base[pos].val.i32; - assert(sp >= 0 && sp < jit->stack_limit); - return jit_ssa_get_var(jit, jit->b, sp, jit->ctx.control); -# elif defined(JIT_USE_VARS) +# if defined(JIT_USE_VARS) || defined(JIT_USE_SSA) assert(IR_IS_CONST_REF(pos)); int sp = jit->ctx.ir_base[pos].val.i32; assert(sp >= 0 && sp < jit->stack_limit); @@ -321,16 +191,10 @@ static ir_ref jit_pick(jit_ctx *jit, ir_ref pos) { } static void jit_goto_backward(jit_ctx *jit, uint32_t target) { -#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) - jit_ssa_fix_incomplete_phis(jit, target); -#endif ir_set_op(_ir_CTX, jit->labels[target].merge, ++jit->labels[target].inputs, ir_END()); #ifdef JIT_RESOLVE_STACK assert(jit->bb_sp[target] == -1 || jit->bb_sp[target] == jit->sp); jit->bb_sp[target] = jit->sp; -# ifdef JIT_USE_SSA - jit_ssa_end_block(jit); -# endif 
#endif } @@ -339,9 +203,6 @@ static void jit_goto_forward(jit_ctx *jit, uint32_t target) { #ifdef JIT_RESOLVE_STACK assert(jit->bb_sp[target] == -1 || jit->bb_sp[target] == jit->sp); jit->bb_sp[target] = jit->sp; -# ifdef JIT_USE_SSA - jit_ssa_end_block(jit); -# endif #endif } @@ -365,14 +226,6 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { switch(decoded.opcode) { case Instr_JE: case Instr_JNE: -#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) - if (!jit->labels[i + decoded.immediate].inputs && i + decoded.immediate != 0) { - jit->blocks_count++; - } - if (!jit->labels[i].inputs) { - jit->blocks_count++; - } -#endif jit->labels[i + decoded.immediate].inputs++; jit->labels[i].inputs++; break; @@ -390,16 +243,11 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { jit->bb_sp = malloc(len * sizeof(int)); memset(jit->bb_sp, -1, len * sizeof(int)); -# if defined(JIT_USE_SSA) || defined(JIT_USE_VARS) +# if defined(JIT_USE_VARS) || defined(JIT_USE_SSA) /* calculate stack_limit */ jit->stack_limit = 0; decoded.opcode = Instr_Nop; -# ifdef JIT_USE_SSA - jit->blocks_count = 1; - jit->b = 0; -# endif - for (int i=0; i < len;) { if (jit->labels[i].inputs > 0) { if (decoded.opcode != Instr_Jump) { @@ -408,11 +256,6 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { } assert(jit->bb_sp[i] != -1); jit->sp == jit->bb_sp[i]; -# ifdef JIT_USE_SSA - if (i != 0) { - jit->blocks_count++; - } -# endif } decoded = decode_at_address(prog, i); @@ -471,17 +314,12 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { } jit->sp = -1; -# ifdef JIT_USE_SSA - jit->ssa_vars = calloc(jit->stack_limit * jit->blocks_count, sizeof(ir_ref)); - jit->incomplete_phis = calloc(jit->stack_limit * jit->blocks_count, sizeof(ir_ref)); -# else jit->vars = malloc(jit->stack_limit * sizeof(ir_ref)); for (int i = 0; i < jit->stack_limit; i++) { char s[16]; sprintf(s, "t%d", i); jit->vars[i] = ir_var(_ir_CTX, IR_U32, 1, 
s); } -# endif # endif #endif @@ -534,12 +372,6 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { #ifdef JIT_RESOLVE_STACK assert(jit->bb_sp[i] != -1); jit->sp == jit->bb_sp[i]; -# ifdef JIT_USE_SSA - if (i != 0) { - jit->b++; - } - jit->labels[i].b = jit->b; -# endif #endif } @@ -599,9 +431,6 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp2 = jit_pop(jit); // JIT if (tmp2 == 0) tmp3 = ir_IF(ir_EQ(tmp2, ir_CONST_U32(0))); -#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) - jit_ssa_end_block(jit); -#endif ir_IF_TRUE_cold(tmp3); // JIT: printf("Division by zero\n"); ir_CALL_1(IR_VOID, printf_func, ir_CONST_STR("Division by zero\n")); @@ -632,9 +461,6 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp1 = jit_pop(jit); // JIT: if (tmp1 == 0) tmp3 = ir_IF(ir_EQ(tmp1, ir_CONST_U32(0))); -#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) - jit_ssa_end_block(jit); -#endif ir_IF_TRUE(tmp3); if (decoded.immediate >= 0) { jit_goto_forward(jit, i + decoded.immediate); @@ -647,9 +473,6 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { tmp1 = jit_pop(jit); // JIT: if (tmp1 == 0) tmp3 = ir_IF(ir_NE(tmp1, ir_CONST_U32(0))); -#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) - jit_ssa_end_block(jit); -#endif ir_IF_TRUE(tmp3); if (decoded.immediate >= 0) { jit_goto_forward(jit, i + decoded.immediate); @@ -748,10 +571,7 @@ static void jit_program(jit_ctx *jit, const Instr_t *prog, int len) { } #ifdef JIT_RESOLVE_STACK -# if defined(JIT_USE_SSA) - free(jit->ssa_vars); - free(jit->incomplete_phis); -# elif defined(JIT_USE_SSA) +# if defined(JIT_USE_VARS) || defined(JIT_USE_SSA) free(jit->vars); # endif free(jit->bb_sp); @@ -766,8 +586,12 @@ int main(int argc, char **argv) { typedef void (*entry_t)(cpu_t*); entry_t entry; size_t size; + uint32_t flags = IR_FUNCTION | IR_OPT_FOLDING | IR_OPT_CFG | IR_OPT_CODEGEN; - ir_init(&jit.ctx, IR_FUNCTION | IR_OPT_FOLDING | 
IR_OPT_CFG | IR_OPT_CODEGEN, 256, 1024); +#if defined(JIT_RESOLVE_STACK) && defined(JIT_USE_SSA) + flags |= IR_OPT_MEM2SSA; +#endif + ir_init(&jit.ctx, flags, 256, 1024); jit_program(&jit, cpu.pmem, PROGRAM_SIZE); @@ -778,6 +602,7 @@ int main(int argc, char **argv) { entry = (entry_t)ir_jit_compile(&jit.ctx, 2, &size); if (!entry) { printf("Compilation failure\n"); + exit(-1); } if (debug) {