From 23bbc25085ceac827e1da9bebead058f436f66a6 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 27 Jul 2013 08:42:51 -1000 Subject: [PATCH 1/8] tcg-arm: Use ldrd/strd for appropriate qemu_ld/st64 Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 48 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index eb0e84ce44..ea0d9b45d3 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -320,6 +320,9 @@ typedef enum { INSN_STRB_REG = 0x06400000, INSN_LDRD_IMM = 0x004000d0, + INSN_LDRD_REG = 0x000000d0, + INSN_STRD_IMM = 0x004000f0, + INSN_STRD_REG = 0x000000f0, } ARMInsn; #define SHIFT_IMM_LSL(im) (((im) << 7) | 0x00) @@ -810,6 +813,30 @@ static inline void tcg_out_st32_r(TCGContext *s, int cond, TCGReg rt, tcg_out_memop_r(s, cond, INSN_STR_REG, rt, rn, rm, 1, 1, 0); } +static inline void tcg_out_ldrd_8(TCGContext *s, int cond, TCGReg rt, + TCGReg rn, int imm8) +{ + tcg_out_memop_8(s, cond, INSN_LDRD_IMM, rt, rn, imm8, 1, 0); +} + +static inline void tcg_out_ldrd_r(TCGContext *s, int cond, TCGReg rt, + TCGReg rn, TCGReg rm) +{ + tcg_out_memop_r(s, cond, INSN_LDRD_REG, rt, rn, rm, 1, 1, 0); +} + +static inline void tcg_out_strd_8(TCGContext *s, int cond, TCGReg rt, + TCGReg rn, int imm8) +{ + tcg_out_memop_8(s, cond, INSN_STRD_IMM, rt, rn, imm8, 1, 0); +} + +static inline void tcg_out_strd_r(TCGContext *s, int cond, TCGReg rt, + TCGReg rn, TCGReg rm) +{ + tcg_out_memop_r(s, cond, INSN_STRD_REG, rt, rn, rm, 1, 1, 0); +} + /* Register pre-increment with base writeback. */ static inline void tcg_out_ld32_rwb(TCGContext *s, int cond, TCGReg rt, TCGReg rn, TCGReg rm) @@ -1397,6 +1424,9 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4); tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2); tcg_out_bswap32(s, COND_AL, data_reg, data_reg); + } else if (use_armv6_instructions + && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { + tcg_out_ldrd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); } else { tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); @@ -1450,9 +1480,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) } break; case 3: - /* TODO: use block load - - * check that data_reg2 > data_reg or the other way */ - if (data_reg == addr_reg) { + if (use_armv6_instructions && !bswap + && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { + tcg_out_ldrd_8(s, COND_AL, data_reg, addr_reg, 0); + } else if (use_armv6_instructions && bswap + && (data_reg2 & 1) == 0 && data_reg == data_reg2 + 1) { + tcg_out_ldrd_8(s, COND_AL, data_reg2, addr_reg, 0); + } else if (data_reg == addr_reg) { tcg_out_ld32_12(s, COND_AL, data_reg2, addr_reg, bswap ? 0 : 4); tcg_out_ld32_12(s, COND_AL, data_reg, addr_reg, bswap ? 4 : 0); } else { @@ -1529,6 +1563,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg); tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4); + } else if (use_armv6_instructions + && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { + tcg_out_strd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); } else { tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); @@ -1576,13 +1613,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) } break; case 3: - /* TODO: use block store - - * check that data_reg2 > data_reg or the other way */ if (bswap) { tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2); tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 0); tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); tcg_out_st32_12(s, COND_AL, TCG_REG_R0, addr_reg, 4); + } else if (use_armv6_instructions + && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { + tcg_out_strd_8(s, COND_AL, data_reg, addr_reg, 0); } else { tcg_out_st32_12(s, COND_AL, data_reg, addr_reg, 0); tcg_out_st32_12(s, COND_AL, data_reg2, addr_reg, 4); From d9f4dde4a6d34f14509664edc262016f21be5aac Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 27 Jul 2013 14:09:47 -1000 Subject: [PATCH 2/8] tcg-arm: Rearrange slow-path qemu_ld/st Use the new helper_ret_*_mmu routines. Use a conditional call to arrange for a tail-call from the store path, and to load the return address for the helper for the load path. Signed-off-by: Richard Henderson --- include/exec/exec-all.h | 16 +--- tcg/arm/tcg-target.c | 175 ++++++++++++++++++++-------------------- 2 files changed, 87 insertions(+), 104 deletions(-) diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index dc27f33152..8dd15948d8 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -324,21 +324,7 @@ extern uintptr_t tci_tb_ptr; In some implementations, we pass the "logical" return address manually; in others, we must infer the logical return from the true return. */ #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU) -# if defined(__arm__) -/* We define two insns between the return address and the branch back to - straight-line. Find and decode that branch insn. */ -# define GETRA_LDST(RA) tcg_getra_ldst(RA) -static inline uintptr_t tcg_getra_ldst(uintptr_t ra) -{ - int32_t b; - ra += 8; /* skip the two insns */ - b = *(int32_t *)ra; /* load the branch insn */ - b = (b << 8) >> (8 - 2); /* extract the displacement */ - ra += 8; /* branches are relative to pc+8 */ - ra += b; /* apply the displacement */ - return ra; -} -# elif defined(__aarch64__) +# if defined(__aarch64__) # define GETRA_LDST(RA) tcg_getra_ldst(RA) static inline uintptr_t tcg_getra_ldst(uintptr_t ra) { diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index ea0d9b45d3..9d2fe8a0f7 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -175,11 +175,12 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) ct->ct |= TCG_CT_REG; tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1); #ifdef CONFIG_SOFTMMU - /* r0-r2 will be overwritten when reading the tlb entry, + /* r0-r2,lr will be overwritten when reading the tlb entry, so don't use these. */ tcg_regset_reset_reg(ct->u.regs, TCG_REG_R0); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2); + tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14); #endif break; case 'L': @@ -207,6 +208,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) /* Avoid clashes with registers being used for helper args */ tcg_regset_reset_reg(ct->u.regs, TCG_REG_R3); #endif + tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14); #endif break; @@ -382,13 +384,17 @@ static inline void tcg_out_b_noaddr(TCGContext *s, int cond) /* We pay attention here to not modify the branch target by skipping the corresponding bytes. This ensure that caches and memory are kept coherent during retranslation. */ -#ifdef HOST_WORDS_BIGENDIAN - tcg_out8(s, (cond << 4) | 0x0a); - s->code_ptr += 3; -#else s->code_ptr += 3; tcg_out8(s, (cond << 4) | 0x0a); -#endif +} + +static inline void tcg_out_bl_noaddr(TCGContext *s, int cond) +{ + /* We pay attention here to not modify the branch target by skipping + the corresponding bytes. This ensure that caches and memory are + kept coherent during retranslation. */ + s->code_ptr += 3; + tcg_out8(s, (cond << 4) | 0x0b); } static inline void tcg_out_bl(TCGContext *s, int cond, int32_t offset) @@ -1002,34 +1008,27 @@ static inline void tcg_out_st8(TCGContext *s, int cond, tcg_out_st8_12(s, cond, rd, rn, offset); } -/* The _goto case is normally between TBs within the same code buffer, - * and with the code buffer limited to 16MB we shouldn't need the long - * case. - * - * .... except to the prologue that is in its own buffer. +/* The _goto case is normally between TBs within the same code buffer, and + * with the code buffer limited to 16MB we wouldn't need the long case. + * But we also use it for the tail-call to the qemu_ld/st helpers, which does. */ static inline void tcg_out_goto(TCGContext *s, int cond, uint32_t addr) { - int32_t val; + int32_t disp = addr - (tcg_target_long) s->code_ptr; - if (addr & 1) { - /* goto to a Thumb destination isn't supported */ - tcg_abort(); + if ((addr & 1) == 0 && disp - 8 < 0x01fffffd && disp - 8 > -0x01fffffd) { + tcg_out_b(s, cond, disp); + return; } - val = addr - (tcg_target_long) s->code_ptr; - if (val - 8 < 0x01fffffd && val - 8 > -0x01fffffd) - tcg_out_b(s, cond, val); - else { - if (cond == COND_AL) { - tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4); - tcg_out32(s, addr); - } else { - tcg_out_movi32(s, cond, TCG_REG_TMP, val - 8); - tcg_out_dat_reg(s, cond, ARITH_ADD, - TCG_REG_PC, TCG_REG_PC, - TCG_REG_TMP, SHIFT_IMM_LSL(0)); + tcg_out_movi32(s, cond, TCG_REG_TMP, addr); + if (use_armv5t_instructions) { + tcg_out_bx(s, cond, TCG_REG_TMP); + } else { + if (addr & 1) { + tcg_abort(); } + tcg_out_mov_reg(s, cond, TCG_REG_PC, TCG_REG_TMP); } } @@ -1084,23 +1083,29 @@ static inline void tcg_out_goto_label(TCGContext *s, int cond, int label_index) } #ifdef CONFIG_SOFTMMU +/* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr, + * int mmu_idx, uintptr_t ra) + */ +static const void * const qemu_ld_helpers[8] = { + helper_ret_ldub_mmu, + helper_ret_lduw_mmu, + helper_ret_ldul_mmu, + helper_ret_ldq_mmu, -/* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr, - int mmu_idx) */ -static const void * const qemu_ld_helpers[4] = { - helper_ldb_mmu, - helper_ldw_mmu, - helper_ldl_mmu, - helper_ldq_mmu, + helper_ret_ldsb_mmu, + helper_ret_ldsw_mmu, + helper_ret_ldul_mmu, + helper_ret_ldq_mmu, }; -/* helper signature: helper_st_mmu(CPUState *env, target_ulong addr, - uintxx_t val, int mmu_idx) */ +/* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr, + * uintxx_t val, int mmu_idx, uintptr_t ra) + */ static const void * const qemu_st_helpers[4] = { - helper_stb_mmu, - helper_stw_mmu, - helper_stl_mmu, - helper_stq_mmu, + helper_ret_stb_mmu, + helper_ret_stw_mmu, + helper_ret_stl_mmu, + helper_ret_stq_mmu, }; /* Helper routines for marshalling helper function arguments into @@ -1259,7 +1264,8 @@ static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc, static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) { TCGReg argreg, data_reg, data_reg2; - uint8_t *start; + int opc = lb->opc; + uintptr_t func; reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr); @@ -1270,22 +1276,30 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg); } argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index); - tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]); + argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14); + + /* For armv6 we can use the canonical unsigned helpers and minimize + icache usage. For pre-armv6, use the signed helpers since we do + not have a single insn sign-extend. */ + if (use_armv6_instructions) { + func = (uintptr_t)qemu_ld_helpers[opc & 3]; + } else { + func = (uintptr_t)qemu_ld_helpers[opc]; + if (opc & 4) { + opc = 2; + } + } + tcg_out_call(s, func); data_reg = lb->datalo_reg; data_reg2 = lb->datahi_reg; - - start = s->code_ptr; - switch (lb->opc) { + switch (opc) { case 0 | 4: tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0); break; case 1 | 4: tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0); break; - case 0: - case 1: - case 2: default: tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); break; @@ -1295,23 +1309,6 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) break; } - /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between - the call and the branch back to straight-line code. Note that the - moves above could be elided by register allocation, nor do we know - which code alternative we chose for extension. */ - switch (s->code_ptr - start) { - case 0: - tcg_out_nop(s); - /* FALLTHRU */ - case 4: - tcg_out_nop(s); - /* FALLTHRU */ - case 8: - break; - default: - abort(); - } - tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr); } @@ -1347,13 +1344,10 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) } argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index); - tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]); + argreg = tcg_out_arg_reg32(s, argreg, TCG_REG_R14); - /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between - the call and the branch back to straight-line code. */ - tcg_out_nop(s); - tcg_out_nop(s); - tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr); + /* Tail-call to the helper, which will return to the fast path. */ + tcg_out_goto(s, COND_AL, (tcg_target_long) qemu_st_helpers[lb->opc & 3]); } #endif /* SOFTMMU */ @@ -1383,8 +1377,10 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)); + /* This a conditional BL only to load a pointer within this opcode into LR + for the slow path. We will not be using the value for a tail call. */ label_ptr = s->code_ptr; - tcg_out_b_noaddr(s, COND_NE); + tcg_out_bl_noaddr(s, COND_NE); tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, offsetof(CPUTLBEntry, addend) @@ -1529,50 +1525,51 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) offsetof(CPUArchState, tlb_table[mem_index][0].addr_write)); - label_ptr = s->code_ptr; - tcg_out_b_noaddr(s, COND_NE); - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, offsetof(CPUTLBEntry, addend) - offsetof(CPUTLBEntry, addr_write)); switch (opc) { case 0: - tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); break; case 1: if (bswap) { - tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg); - tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1); + tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg); + tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1); } else { - tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); } break; case 2: default: if (bswap) { - tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); - tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1); + tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg); + tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1); } else { - tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); } break; case 3: if (bswap) { - tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2); - tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg); - tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); - tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4); + tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2); + tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg); + tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg); + tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4); } else if (use_armv6_instructions && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { - tcg_out_strd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); } else { - tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); - tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); + tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg); + tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4); } break; } + /* The conditional call must come last, as we're going to return here. */ + label_ptr = s->code_ptr; + tcg_out_bl_noaddr(s, COND_NE); + add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2, mem_index, s->code_ptr, label_ptr); #else /* !CONFIG_SOFTMMU */ From e5e2e4a74b75b41f72e1e3b3bac8c2a6b02896c2 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 28 Aug 2013 11:16:16 -0700 Subject: [PATCH 3/8] tcg-arm: Use strd for tcg_out_arg_reg64 Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 9d2fe8a0f7..f953f4e556 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1149,9 +1149,16 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg, if (argreg & 1) { argreg++; } - argreg = tcg_out_arg_reg32(s, argreg, arglo); - argreg = tcg_out_arg_reg32(s, argreg, arghi); - return argreg; + if (use_armv6_instructions && argreg >= 4 + && (arglo & 1) == 0 && arghi == arglo + 1) { + tcg_out_strd_8(s, COND_AL, arglo, + TCG_REG_CALL_STACK, (argreg - 4) * 4); + return argreg + 2; + } else { + argreg = tcg_out_arg_reg32(s, argreg, arglo); + argreg = tcg_out_arg_reg32(s, argreg, arghi); + return argreg; + } } #define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS) From f2488736371ae902f345cf9270d141f0a6797731 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 28 Aug 2013 14:40:52 -0700 Subject: [PATCH 4/8] tcg-arm: Use QEMU_BUILD_BUG_ON to verify constraints on tlb One of the two constraints we already checked via #if, but the tlb offset distance was only checked at runtime. Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index f953f4e556..1f7bbe1195 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1163,6 +1163,15 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg, #define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS) +/* We're expecting to use an 8-bit immediate and to mask. */ +QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8); + +/* We're expecting to use an 8-bit immediate add + 8-bit ldrd offset. + Using the offset of the second entry in the last tlb table ensures + that we can index all of the elements of the first entry. */ +QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1]) + > 0xffff); + /* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing to the tlb entry. Clobbers R1 and TMP. */ @@ -1190,14 +1199,10 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, * ldr r0, [r2, r0]! (3) * cmp r0, tmp (4) */ -# if CPU_TLB_BITS > 8 -# error -# endif tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS)); - /* We assume that the offset is contained within 16 bits. */ - assert((tlb_offset & ~0xffff) == 0); + /* We checked that the offset is contained within 16 bits above. */ if (tlb_offset > 0xff) { tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base, (24 << 7) | (tlb_offset >> 8)); From d0ebde228415c6d89ad61270a461717fbb04915c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 30 Aug 2013 08:16:00 -0700 Subject: [PATCH 5/8] tcg-arm: Move load of tlb addend into tcg_out_tlb_read This allows us to make more intelligent decisions about the relative offsets of the tlb comparator and the addend, avoiding any need of writeback addressing. Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 60 +++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 1f7bbe1195..b9ec4f6bd6 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1172,42 +1172,39 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8); QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1]) > 0xffff); -/* Load and compare a TLB entry, leaving the flags set. Leaves R2 pointing - to the tlb entry. Clobbers R1 and TMP. */ +/* Load and compare a TLB entry, leaving the flags set. Leaves R1 containing + the addend of the tlb entry. Clobbers R0, R2, TMP. */ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, - int s_bits, int tlb_offset) + int s_bits, int mem_index, bool is_load) { TCGReg base = TCG_AREG0; + int cmp_off = + (is_load + ? offsetof(CPUArchState, tlb_table[mem_index][0].addr_read) + : offsetof(CPUArchState, tlb_table[mem_index][0].addr_write)); + int add_off = offsetof(CPUArchState, tlb_table[mem_index][0].addend); /* Should generate something like the following: - * pre-v7: * shr tmp, addr_reg, #TARGET_PAGE_BITS (1) - * add r2, env, #off & 0xff00 + * add r2, env, #high * and r0, tmp, #(CPU_TLB_SIZE - 1) (2) * add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3) - * ldr r0, [r2, #off & 0xff]! (4) + * ldr r0, [r2, #cmp] (4) * tst addr_reg, #s_mask * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5) - * - * v7 (not implemented yet): - * ubfx r2, addr_reg, #TARGET_PAGE_BITS, #CPU_TLB_BITS (1) - * movw tmp, #~TARGET_PAGE_MASK & ~s_mask - * movw r0, #off - * add r2, env, r2, lsl #CPU_TLB_ENTRY_BITS (2) - * bic tmp, addr_reg, tmp - * ldr r0, [r2, r0]! (3) - * cmp r0, tmp (4) + * ldr r1, [r2, #add] */ tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS)); /* We checked that the offset is contained within 16 bits above. */ - if (tlb_offset > 0xff) { + if (add_off > 0xfff || (use_armv6_instructions && cmp_off > 0xff)) { tcg_out_dat_imm(s, COND_AL, ARITH_ADD, TCG_REG_R2, base, - (24 << 7) | (tlb_offset >> 8)); - tlb_offset &= 0xff; + (24 << 7) | (cmp_off >> 8)); base = TCG_REG_R2; + add_off -= cmp_off & 0xff00; + cmp_off &= 0xff; } tcg_out_dat_imm(s, COND_AL, ARITH_AND, @@ -1219,14 +1216,11 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, but due to how the pointer needs setting up, ldm isn't useful. Base arm5 doesn't have ldrd, but armv5te does. */ if (use_armv6_instructions && TARGET_LONG_BITS == 64) { - tcg_out_memop_8(s, COND_AL, INSN_LDRD_IMM, TCG_REG_R0, - TCG_REG_R2, tlb_offset, 1, 1); + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off); } else { - tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R0, - TCG_REG_R2, tlb_offset, 1, 1); + tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R2, cmp_off); if (TARGET_LONG_BITS == 64) { - tcg_out_memop_12(s, COND_AL, INSN_LDR_IMM, TCG_REG_R1, - TCG_REG_R2, 4, 1, 0); + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, cmp_off + 4); } } @@ -1243,6 +1237,9 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0)); } + + /* Load the tlb addend. */ + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, add_off); } /* Record the context of a call to the out of line helper code for the slow @@ -1386,18 +1383,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) mem_index = *args; s_bits = opc & 3; - tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, - offsetof(CPUArchState, tlb_table[mem_index][0].addr_read)); + tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1); /* This a conditional BL only to load a pointer within this opcode into LR for the slow path. We will not be using the value for a tail call. */ label_ptr = s->code_ptr; tcg_out_bl_noaddr(s, COND_NE); - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, - offsetof(CPUTLBEntry, addend) - - offsetof(CPUTLBEntry, addr_read)); - switch (opc) { case 0: tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); @@ -1533,13 +1525,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) mem_index = *args; s_bits = opc & 3; - tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, - offsetof(CPUArchState, - tlb_table[mem_index][0].addr_write)); - - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, - offsetof(CPUTLBEntry, addend) - - offsetof(CPUTLBEntry, addr_write)); + tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0); switch (opc) { case 0: From d3e440bef2783b7b2ebc210a0717c36351506b8c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 30 Aug 2013 08:45:53 -0700 Subject: [PATCH 6/8] tcg-arm: Return register containing tlb addend Preparatory to rescheduling the tlb load, and changing said register. Continues to use R1 for now. Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 59 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index b9ec4f6bd6..86e02c4d31 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1172,11 +1172,11 @@ QEMU_BUILD_BUG_ON(CPU_TLB_BITS > 8); QEMU_BUILD_BUG_ON(offsetof(CPUArchState, tlb_table[NB_MMU_MODES - 1][1]) > 0xffff); -/* Load and compare a TLB entry, leaving the flags set. Leaves R1 containing - the addend of the tlb entry. Clobbers R0, R2, TMP. */ +/* Load and compare a TLB entry, leaving the flags set. Returns the register + containing the addend of the tlb entry. Clobbers R0, R1, R2, TMP. */ -static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, - int s_bits, int mem_index, bool is_load) +static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, + int s_bits, int mem_index, bool is_load) { TCGReg base = TCG_AREG0; int cmp_off = @@ -1240,6 +1240,7 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, /* Load the tlb addend. */ tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, add_off); + return TCG_REG_R1; } /* Record the context of a call to the out of line helper code for the slow @@ -1366,7 +1367,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) bool bswap; #ifdef CONFIG_SOFTMMU int mem_index, s_bits; - TCGReg addr_reg2; + TCGReg addr_reg2, addend; uint8_t *label_ptr; #endif #ifdef TARGET_WORDS_BIGENDIAN @@ -1383,7 +1384,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) mem_index = *args; s_bits = opc & 3; - tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1); + addend = tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 1); /* This a conditional BL only to load a pointer within this opcode into LR for the slow path. We will not be using the value for a tail call. */ @@ -1392,44 +1393,44 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) switch (opc) { case 0: - tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, addend); break; case 0 | 4: - tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, addend); break; case 1: - tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, addend); if (bswap) { tcg_out_bswap16(s, COND_AL, data_reg, data_reg); } break; case 1 | 4: if (bswap) { - tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, addend); tcg_out_bswap16s(s, COND_AL, data_reg, data_reg); } else { - tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, addend); } break; case 2: default: - tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, addend); if (bswap) { tcg_out_bswap32(s, COND_AL, data_reg, data_reg); } break; case 3: if (bswap) { - tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg); - tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4); + tcg_out_ld32_rwb(s, COND_AL, data_reg2, addend, addr_reg); + tcg_out_ld32_12(s, COND_AL, data_reg, addend, 4); tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2); tcg_out_bswap32(s, COND_AL, data_reg, data_reg); } else if (use_armv6_instructions && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { - tcg_out_ldrd_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); + tcg_out_ldrd_r(s, COND_AL, data_reg, addr_reg, addend); } else { - tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); - tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); + tcg_out_ld32_rwb(s, COND_AL, data_reg, addend, addr_reg); + tcg_out_ld32_12(s, COND_AL, data_reg2, addend, 4); } break; } @@ -1508,7 +1509,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) bool bswap; #ifdef CONFIG_SOFTMMU int mem_index, s_bits; - TCGReg addr_reg2; + TCGReg addr_reg2, addend; uint8_t *label_ptr; #endif #ifdef TARGET_WORDS_BIGENDIAN @@ -1525,41 +1526,41 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) mem_index = *args; s_bits = opc & 3; - tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0); + addend = tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, mem_index, 0); switch (opc) { case 0: - tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); + tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, addend); break; case 1: if (bswap) { tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg); - tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1); + tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, addend); } else { - tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); + tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, addend); } break; case 2: default: if (bswap) { tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg); - tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1); + tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, addend); } else { - tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); + tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, addend); } break; case 3: if (bswap) { tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2); - tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg); + tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, addend, addr_reg); tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg); - tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4); + tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, addend, 4); } else if (use_armv6_instructions && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { - tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); + tcg_out_strd_r(s, COND_EQ, data_reg, addr_reg, addend); } else { - tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg); - tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4); + tcg_out_st32_rwb(s, COND_EQ, data_reg, addend, addr_reg); + tcg_out_st32_12(s, COND_EQ, data_reg2, addend, 4); } break; } From 66c2056fb83b873df0a3a4bda3a679bf53d082a2 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 30 Aug 2013 09:12:32 -0700 Subject: [PATCH 7/8] tcg-arm: Remove restriction on qemu_ld output register The main intent of the patch is to allow the tlb addend register to be changed, without tying that change to the constraint. But the most common side-effect seems to be to enable usage of ldrd with the r0,r1 pair. Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 58 ++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 86e02c4d31..0b09672a48 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -181,15 +181,6 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R2); tcg_regset_reset_reg(ct->u.regs, TCG_REG_R14); -#endif - break; - case 'L': - ct->ct |= TCG_CT_REG; - tcg_regset_set32(ct->u.regs, 0, (1 << TCG_TARGET_NB_REGS) - 1); -#ifdef CONFIG_SOFTMMU - /* r1 is still needed to load data_reg or data_reg2, - so don't use it. */ - tcg_regset_reset_reg(ct->u.regs, TCG_REG_R1); #endif break; @@ -1314,8 +1305,17 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); break; case 3: - tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); - tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); + if (data_reg != TCG_REG_R1) { + tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); + tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); + } else if (data_reg2 != TCG_REG_R0) { + tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); + tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); + } else { + tcg_out_mov_reg(s, COND_AL, TCG_REG_TMP, TCG_REG_R0); + tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); + tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_TMP); + } break; } @@ -1420,17 +1420,27 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) } break; case 3: - if (bswap) { - tcg_out_ld32_rwb(s, COND_AL, data_reg2, addend, addr_reg); - tcg_out_ld32_12(s, COND_AL, data_reg, addend, 4); - tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2); - tcg_out_bswap32(s, COND_AL, data_reg, data_reg); - } else if (use_armv6_instructions - && (data_reg & 1) == 0 && data_reg2 == data_reg + 1) { - tcg_out_ldrd_r(s, COND_AL, data_reg, addr_reg, addend); - } else { - tcg_out_ld32_rwb(s, COND_AL, data_reg, addend, addr_reg); - tcg_out_ld32_12(s, COND_AL, data_reg2, addend, 4); + { + /* Be careful not to modify data_reg and data_reg2 + for the slow path below. */ + TCGReg dl = (bswap ? data_reg2 : data_reg); + TCGReg dh = (bswap ? data_reg : data_reg2); + + if (use_armv6_instructions && (dl & 1) == 0 && dh == dl + 1) { + tcg_out_ldrd_r(s, COND_AL, dl, addr_reg, addend); + } else if (dl != addend) { + tcg_out_ld32_rwb(s, COND_AL, dl, addend, addr_reg); + tcg_out_ld32_12(s, COND_AL, dh, addend, 4); + } else { + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, TCG_REG_TMP, + addend, addr_reg, SHIFT_IMM_LSL(0)); + tcg_out_ld32_12(s, COND_AL, dl, TCG_REG_TMP, 0); + tcg_out_ld32_12(s, COND_AL, dh, TCG_REG_TMP, 4); + } + if (bswap) { + tcg_out_bswap32(s, COND_AL, dh, dh); + tcg_out_bswap32(s, COND_AL, dl, dl); + } } break; } @@ -2025,7 +2035,7 @@ static const TCGTargetOpDef arm_op_defs[] = { { INDEX_op_qemu_ld16u, { "r", "l" } }, { INDEX_op_qemu_ld16s, { "r", "l" } }, { INDEX_op_qemu_ld32, { "r", "l" } }, - { INDEX_op_qemu_ld64, { "L", "L", "l" } }, + { INDEX_op_qemu_ld64, { "r", "r", "l" } }, { INDEX_op_qemu_st8, { "s", "s" } }, { INDEX_op_qemu_st16, { "s", "s" } }, @@ -2037,7 +2047,7 @@ static const TCGTargetOpDef arm_op_defs[] = { { INDEX_op_qemu_ld16u, { "r", "l", "l" } }, { INDEX_op_qemu_ld16s, { "r", "l", "l" } }, { INDEX_op_qemu_ld32, { "r", "l", "l" } }, - { INDEX_op_qemu_ld64, { "L", "L", "l", "l" } }, + { INDEX_op_qemu_ld64, { "r", "r", "l", "l" } }, { INDEX_op_qemu_st8, { "s", "s", "s" } }, { INDEX_op_qemu_st16, { "s", "s", "s" } }, From ee06e23051251c00778edf54fb930198df0e873a Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 30 Aug 2013 09:48:56 -0700 Subject: [PATCH 8/8] tcg-arm: Move the tlb addend load earlier There are free scheduling slots between the sequence of comparison instructions. This requires changing the register in use to avoid conflict with those compares. Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 0b09672a48..622cc49aa7 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1183,8 +1183,8 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, * add r2, r2, r0, lsl #CPU_TLB_ENTRY_BITS (3) * ldr r0, [r2, #cmp] (4) * tst addr_reg, #s_mask - * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS (5) - * ldr r1, [r2, #add] + * ldr r1, [r2, #add] (5) + * cmpeq r0, tmp, lsl #TARGET_PAGE_BITS */ tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, addrlo, SHIFT_IMM_LSR(TARGET_PAGE_BITS)); @@ -1221,6 +1221,9 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, 0, addrlo, (1 << s_bits) - 1); } + /* Load the tlb addend. */ + tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R2, add_off); + tcg_out_dat_reg(s, (s_bits ? COND_EQ : COND_AL), ARITH_CMP, 0, TCG_REG_R0, TCG_REG_TMP, SHIFT_IMM_LSL(TARGET_PAGE_BITS)); @@ -1229,9 +1232,7 @@ static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi, TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0)); } - /* Load the tlb addend. */ - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, add_off); - return TCG_REG_R1; + return TCG_REG_R2; } /* Record the context of a call to the out of line helper code for the slow