From 5b2b573243fd49fd7fc0e1ccd7542990687f1980 Mon Sep 17 00:00:00 2001
From: Igor Mitsyanko
Date: Mon, 25 Jun 2012 16:29:41 +0400
Subject: [PATCH] target-i386: fix ld/st optimization

Load/store TCG optimization for the i386 target was broken after merging
with upstream QEMU. This patch fixes it.

Signed-off-by: Igor Mitsyanko
---
 exec-all.h            |  16 +++---
 softmmu_template.h    |  46 +++++++--------
 tcg/i386/tcg-target.c | 155 +++++++++++++++++++++++++++++---------------------
 3 files changed, 118 insertions(+), 99 deletions(-)

diff --git a/exec-all.h b/exec-all.h
index 67888c5..6c222c0 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -357,14 +357,14 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr);
 /* Extended versions of MMU helpers for qemu_ld/st optimization.
    They get return address arguments because the caller PCs are not
    where helpers return to.  */
 #if defined(__i386__) || defined(__x86_64__)
-uint8_t __ldextb_mmu(target_ulong addr, int mmu_idx, void *ra);
-void __stextb_mmu(target_ulong addr, uint8_t val, int mmu_idx, void *ra);
-uint16_t __ldextw_mmu(target_ulong addr, int mmu_idx, void *ra);
-void __stextw_mmu(target_ulong addr, uint16_t val, int mmu_idx, void *ra);
-uint32_t __ldextl_mmu(target_ulong addr, int mmu_idx, void *ra);
-void __stextl_mmu(target_ulong addr, uint32_t val, int mmu_idx, void *ra);
-uint64_t __ldextq_mmu(target_ulong addr, int mmu_idx, void *ra);
-void __stextq_mmu(target_ulong addr, uint64_t val, int mmu_idx, void *ra);
+uint8_t __ldextb_mmu(target_ulong addr, int mmu_idx, uintptr_t retaddr);
+void __stextb_mmu(target_ulong addr, uint8_t val, int mmu_idx, uintptr_t retaddr);
+uint16_t __ldextw_mmu(target_ulong addr, int mmu_idx, uintptr_t retaddr);
+void __stextw_mmu(target_ulong addr, uint16_t val, int mmu_idx, uintptr_t retaddr);
+uint32_t __ldextl_mmu(target_ulong addr, int mmu_idx, uintptr_t retaddr);
+void __stextl_mmu(target_ulong addr, uint32_t val, int mmu_idx, uintptr_t retaddr);
+uint64_t __ldextq_mmu(target_ulong addr, int mmu_idx, uintptr_t retaddr);
+void __stextq_mmu(target_ulong addr, uint64_t val, int mmu_idx, uintptr_t retaddr);
 #endif
 
 #endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
diff --git a/softmmu_template.h b/softmmu_template.h
index 684dbfa..0b403ce 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -219,16 +219,14 @@ glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(ENV_PARAM
    They get return address arguments because the caller PCs are not
    where helpers return to.
    !defined(SOFTMMU_CODE_ACCESS) suppress warnings from exec.c */
 #if defined(__i386__) || defined(__x86_64__)
-DATA_TYPE glue(glue(__ldext, SUFFIX), MMUSUFFIX)(target_ulong addr,
+DATA_TYPE glue(glue(__ldext, SUFFIX), MMUSUFFIX)(ENV_PARAM target_ulong addr,
                                                  int mmu_idx,
-                                                 void *ra)
+                                                 uintptr_t retaddr)
 {
     DATA_TYPE res;
     int index;
     target_ulong tlb_addr;
     target_phys_addr_t ioaddr;
-    unsigned long addend;
-    void *retaddr;
 
     /* test if there is match for unaligned or IO access */
     /* XXX: could done more in memory macro in a non portable way */
@@ -240,33 +238,33 @@ DATA_TYPE glue(glue(__ldext, SUFFIX), MMUSUFFIX)(target_ulong addr,
             /* IO access */
             if ((addr & (DATA_SIZE - 1)) != 0)
                 goto do_unaligned_access;
-            retaddr = ra;
             ioaddr = env->iotlb[mmu_idx][index];
-            res = glue(io_read, SUFFIX)(ioaddr, addr, retaddr);
+            res = glue(io_read, SUFFIX)(ENV_VAR ioaddr, addr, retaddr);
         } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) {
             /* slow unaligned access (it spans two pages or IO) */
         do_unaligned_access:
-            retaddr = ra;
 #ifdef ALIGNED_ONLY
-            do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+            do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
 #endif
-            res = glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(addr,
+            res = glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(ENV_VAR addr,
                                                          mmu_idx, retaddr);
         } else {
             /* unaligned/aligned access in the same page */
+            uintptr_t addend;
 #ifdef ALIGNED_ONLY
             if ((addr & (DATA_SIZE - 1)) != 0) {
-                retaddr = ra;
-                do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+                do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
             }
 #endif
+            addend = env->tlb_table[mmu_idx][index].addend;
+            res = glue(glue(ld, USUFFIX), _raw)((uint8_t *)(intptr_t)
+                                                (addr + addend));
         }
     } else {
         /* the page is not in the TLB : fill it */
-        retaddr = ra;
 #ifdef ALIGNED_ONLY
         if ((addr & (DATA_SIZE - 1)) != 0)
-            do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
+            do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
 #endif
         tlb_fill(env, addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
         goto redo;
@@ -423,15 +421,13 @@ static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_PARAM
 #if defined(__i386__) || defined(__x86_64__)
 /* Extended versions of MMU helpers for qemu_st IR optimization.
    They get return address arguments because the caller PCs are not
    where helpers return to.  */
-void glue(glue(__stext, SUFFIX), MMUSUFFIX)(target_ulong addr,
+void glue(glue(__stext, SUFFIX), MMUSUFFIX)(ENV_PARAM target_ulong addr,
                                             DATA_TYPE val,
                                             int mmu_idx,
-                                            void *ra)
+                                            uintptr_t retaddr)
 {
     target_phys_addr_t ioaddr;
-    unsigned long addend;
     target_ulong tlb_addr;
-    void *retaddr;
     int index;
 
     index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
@@ -442,32 +438,32 @@ void glue(glue(__stext, SUFFIX), MMUSUFFIX)(target_ulong addr,
             /* IO access */
             if ((addr & (DATA_SIZE - 1)) != 0)
                 goto do_unaligned_access;
-            retaddr = ra;
             ioaddr = env->iotlb[mmu_idx][index];
-            glue(io_write, SUFFIX)(ioaddr, val, addr, retaddr);
+            glue(io_write, SUFFIX)(ENV_VAR ioaddr, val, addr, retaddr);
         } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) {
         do_unaligned_access:
-            retaddr = ra;
 #ifdef ALIGNED_ONLY
-            do_unaligned_access(addr, 1, mmu_idx, retaddr);
+            do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
 #endif
-            glue(glue(slow_st, SUFFIX), MMUSUFFIX)(addr, val,
+            glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_VAR addr, val,
                                                    mmu_idx, retaddr);
         } else {
             /* aligned/unaligned access in the same page */
+            uintptr_t addend;
 #ifdef ALIGNED_ONLY
             if ((addr & (DATA_SIZE - 1)) != 0) {
-                retaddr = ra;
                 do_unaligned_access(addr, 1, mmu_idx, retaddr);
             }
 #endif
+            addend = env->tlb_table[mmu_idx][index].addend;
+            glue(glue(st, SUFFIX), _raw)((uint8_t *)(intptr_t)
+                                         (addr + addend), val);
         }
     } else {
         /* the page is not in the TLB : fill it */
-        retaddr = ra;
 #ifdef ALIGNED_ONLY
         if ((addr & (DATA_SIZE - 1)) != 0)
-            do_unaligned_access(addr, 1, mmu_idx, retaddr);
+            do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
 #endif
         tlb_fill(env, addr, 1, mmu_idx, retaddr);
         goto redo;
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 7834115..6217b83 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -1559,14 +1559,20 @@ static void add_qemu_ldst_label(TCGContext *s,
 /* generates slow case of qemu_ld at the end of TB */
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
 {
-    int s_bits, arg_idx;
+    int s_bits;
     int opc = label->opc_ext & HL_OPC_MASK;
     int mem_index = label->mem_index;
     int data_reg = label->datalo_reg;
     int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
     int addrhi_reg = label->addrhi_reg;
     uint8_t *raddr = label->raddr;
     uint32_t **label_ptr = &label->label_ptr[0];
+#if TCG_TARGET_REG_BITS == 64
+    int arg_idx;
+#else
+    int stack_adjust;
+#endif
 
     s_bits = opc & 3;
 
@@ -1576,27 +1582,50 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         *label_ptr[1] = (uint32_t)(s->code_ptr - (uint8_t *)label_ptr[1] - 4);
     }
 
-    /* 1st parameter(vaddr) has been alreay set in %eax */
-    arg_idx = 1;
-    if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) {
-        tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
-                    addrhi_reg);
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
+    tcg_out_pushi(s, mem_index);
+    stack_adjust = 8;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        /* 4 bytes addrhi_reg and +4 bytes (raddr - 1) */
+        stack_adjust += 8;
     }
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+    tcg_out_push(s, addrlo_reg);
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    /* The first argument is already loaded with addrlo.  */
+    arg_idx = 1;
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
                  mem_index);
-    /* return address should indicate qemu_ld IR codes */
-    if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) {
-        /* 4 word parameters */
-        tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
-    } else {
-        /* 3 word parameters */
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, (tcg_target_long)(raddr - 1));
-    }
+    tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, (tcg_target_long)(raddr - 1));
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
+
     tcg_out_calli(s, (tcg_target_long)qemu_ldext_helpers[s_bits]);
-    if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) {
+
+#if TCG_TARGET_REG_BITS == 32
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
         /* Pop and discard.  This is 2 bytes smaller than the add.  */
         tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
     }
+#endif
 
     switch(opc) {
     case 0 | 4:
@@ -1634,9 +1663,8 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
     default:
         tcg_abort();
     }
-
     /* jump back to original code */
-    tcg_out_jmp(s, (tcg_target_long) raddr);
+    tcg_out_jmp(s, (tcg_target_long)raddr);
 }
 
 /* generates slow case of qemu_st at the end of TB */
@@ -1648,6 +1676,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
     int mem_index = label->mem_index;
     int data_reg = label->datalo_reg;
     int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
     int addrhi_reg = label->addrhi_reg;
     uint8_t *raddr = label->raddr;
     uint32_t **label_ptr = &label->label_ptr[0];
@@ -1660,54 +1689,46 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         *label_ptr[1] = (uint32_t)(s->code_ptr - (uint8_t *)label_ptr[1] - 4);
     }
 
-    /* 1st parameter(vaddr) has been already set */
-    /* return address should indicate qemu_st IR codes */
-    if (TCG_TARGET_REG_BITS == 64) {
-        tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-                    TCG_REG_RSI, data_reg);
-        tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_RDX, mem_index);
-        /* return address should indicate qemu_st IR codes */
-        /* stack growth: 1word * 64bit */
-        tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
-        stack_adjust = 8;
-    } else if (TARGET_LONG_BITS == 32) {
-        tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, data_reg);
-        if (opc == 3) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_ECX, data_reg2);
-            tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
-            tcg_out_pushi(s, mem_index);
-            stack_adjust = 8;
-        } else {
-            tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, mem_index);
-            tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
-            stack_adjust = 4;
-        }
-    } else {
-        if (opc == 3) {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, addrhi_reg);
-            tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
-            tcg_out_pushi(s, mem_index);
-            tcg_out_push(s, data_reg2);
-            tcg_out_push(s, data_reg);
-            stack_adjust = 16;
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, addrhi_reg);
-            switch(opc) {
-            case 0:
-                tcg_out_ext8u(s, TCG_REG_ECX, data_reg);
-                break;
-            case 1:
-                tcg_out_ext16u(s, TCG_REG_ECX, data_reg);
-                break;
-            case 2:
-                tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_ECX, data_reg);
-                break;
-            }
-            tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
-            tcg_out_pushi(s, mem_index);
-            stack_adjust = 8;
-        }
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
+    tcg_out_pushi(s, mem_index);
+    stack_adjust = 8;
+    if (opc == 3) {
+        tcg_out_push(s, data_reg2);
+        stack_adjust += 4;
     }
+    tcg_out_push(s, data_reg);
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg);
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+                tcg_target_call_iarg_regs[1], data_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+    /* return address should indicate qemu_st IR codes */
+    /* stack growth: 1word * 64bit */
+    tcg_out_pushi(s, (tcg_target_long)(raddr - 1));
+    stack_adjust = 8;
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
 
     tcg_out_calli(s, (tcg_target_long)qemu_stext_helpers[s_bits]);
 
@@ -1715,7 +1736,7 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         /* Pop and discard.  This is 2 bytes smaller than the add.  */
         tcg_out_pop(s, TCG_REG_ECX);
     } else if (stack_adjust != 0) {
-        tcg_out_addi(s, TCG_REG_ESP, stack_adjust);
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
     }
 
     /* jump back to original code */
@@ -1816,6 +1837,7 @@ static void tcg_out_qemu_ld_opt(TCGContext *s, const TCGArg *args,
     uint32_t *label_ptr[2];
 
     data_reg = args[0];
+    label_ptr[1] = 0;
     addrlo_idx = 1;
     if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
         data_reg2 = args[1];
@@ -1854,6 +1876,7 @@ static void tcg_out_qemu_st_opt(TCGContext *s, const TCGArg *args,
     uint32_t *label_ptr[2];
 
     data_reg = args[0];
+    label_ptr[1] = 0;
     addrlo_idx = 1;
     if (TCG_TARGET_REG_BITS == 32 && opc == 3) {
         data_reg2 = args[1];
-- 
2.7.4