From a8c0c6b90d5d6471f1591a5db30fdb5efb6ae84d Mon Sep 17 00:00:00 2001
From: Yeongkyoon Lee
Date: Thu, 8 Nov 2012 21:25:52 +0900
Subject: [PATCH] tcg: Patch qemu_ld/st optimization according to QEMU 1.2

Patch the qemu_ld/st optimization according to the QEMU 1.2 rebase. It
should be revised again after a future rebase onto QEMU 1.3, which will
carry the final qemu_ld/st optimization patch; at that point, these
changes need to be forcibly overwritten by the mainstream code.

Signed-off-by: Yeongkyoon Lee
---
 configure              |  15 +--
 exec-all.h             |  36 +++++
 exec.c                 |  11 ++
 softmmu_template.h     |  47 ++-----
 target-arm/op_helper.c |  23 ----
 tcg/i386/tcg-target.c  | 347 ++++++++++++-------------------------------------
 tcg/tcg.c              |  14 +-
 tcg/tcg.h              |  37 +++---
 8 files changed, 168 insertions(+), 362 deletions(-)

diff --git a/configure b/configure
index 8ac06fe..5bebb8d 100755
--- a/configure
+++ b/configure
@@ -196,7 +196,6 @@ bsd="no"
 linux="no"
 solaris="no"
 profiler="no"
-ldst_optimization="no"
 cocoa="no"
 softmmu="yes"
 linux_user="no"
@@ -783,8 +782,6 @@ for opt do
   ;;
   --disable-cocoa) cocoa="no"
   ;;
-  --enable-ldst-optimization) ldst_optimization="yes"
-  ;;
   --enable-cocoa) cocoa="yes" ;
       sdl="no" ;
@@ -3207,7 +3204,6 @@ echo "gprof enabled     $gprof"
 echo "sparse enabled    $sparse"
 echo "strip binaries    $strip_opt"
 echo "profiler          $profiler"
-echo "Fast TCG ld/st    $ldst_optimization"
 echo "static build      $static"
 echo "-Werror enabled   $werror"
 if test "$darwin" = "yes" ; then
@@ -3989,6 +3985,12 @@ upper() {
     echo "$@"| LC_ALL=C tr '[a-z]' '[A-Z]'
 }

+case "$cpu" in
+    i386|x86_64)
+        echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_target_mak
+    ;;
+esac
+
 echo "TARGET_SHORT_ALIGNMENT=$target_short_alignment" >> $config_target_mak
 echo "TARGET_INT_ALIGNMENT=$target_int_alignment" >> $config_target_mak
 echo "TARGET_LONG_ALIGNMENT=$target_long_alignment" >> $config_target_mak
@@ -4053,11 +4055,6 @@ if test "$hax" = "yes" ; then
     echo "CONFIG_NO_HAX=y" >> $config_target_mak
   fi
 fi
-if test "$ldst_optimization" = "yes" ; then
-  if test "$target_arch2" = "i386" -o "$target_arch2" = "x86_64"; then
-    echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_target_mak
-  fi
-fi
 if test "$gl" = "yes" ; then
   case "$target_arch2" in
   i386|x86_64)
diff --git a/exec-all.h b/exec-all.h
index f9899a8..2d35a60 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -308,6 +308,42 @@ extern uintptr_t tci_tb_ptr;
 # define GETPC() ((uintptr_t)__builtin_return_address(0) - 1)
 #endif

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* The qemu_ld/st optimization splits code generation into a fast and a slow
+   path; thus, it needs special handling for an MMU helper called from the
+   slow path, to get the fast path's pc without any additional argument.
+   It uses a trick that embeds the fast path pc into the slow path.
+
+   Code flow in slow path:
+   (1) pre-process
+   (2) call MMU helper
+   (3) jump to (5)
+   (4) fast path information (implementation specific)
+   (5) post-process (e.g. stack adjust)
+   (6) jump to the code following the fast path
+ */
+# if defined(__i386__) || defined(__x86_64__)
+/* To keep the disassembly well-formed, a long jmp is used to embed the fast
+   path pc; its destination is the code following the fast path, though this
+   jmp is never executed.
+
+   call MMU helper
+   jmp POST_PROC (2 bytes)    <- GETRA()
+   jmp NEXT_CODE (5 bytes)
+   POST_PROCESS ...           <- GETRA() + 7
+ */
+# define GETRA() ((uintptr_t)__builtin_return_address(0))
+# define GETPC_LDST() ((uintptr_t)(GETRA() + 7 + \
+                                   *(int32_t *)((void *)GETRA() + 3) - 1))
+# else
+# error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
+# endif
+bool is_tcg_gen_code(uintptr_t pc_ptr);
+# define GETPC_EXT() (is_tcg_gen_code(GETRA()) ? GETPC_LDST() : GETPC())
+#else
+# define GETPC_EXT() GETPC()
+#endif
+
 #if !defined(CONFIG_USER_ONLY)

 struct MemoryRegion *iotlb_to_region(target_phys_addr_t index);
diff --git a/exec.c b/exec.c
index 1a72296..a64e70b 100644
--- a/exec.c
+++ b/exec.c
@@ -1382,6 +1382,17 @@ void tb_link_page(TranslationBlock *tb,
     mmap_unlock();
 }

+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
+/* Check whether the given addr is in the TCG-generated code buffer or not */
+bool is_tcg_gen_code(uintptr_t tc_ptr)
+{
+    /* This can be called during code generation, so code_gen_buffer_max_size
+       is used instead of code_gen_ptr for the upper boundary check */
+    return (tc_ptr >= (uintptr_t)code_gen_buffer &&
+            tc_ptr < (uintptr_t)(code_gen_buffer + code_gen_buffer_max_size));
+}
+#endif
+
 /* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr <
    tb[1].tc_ptr. Return NULL if not found */
 TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
diff --git a/softmmu_template.h b/softmmu_template.h
index 18ded0b..0366e67 100644
--- a/softmmu_template.h
+++ b/softmmu_template.h
@@ -66,27 +66,6 @@
 #define HELPER_PREFIX helper_
 #endif

-#ifndef CONFIG_TCG_PASS_AREG0
-#ifdef USE_EXTENDED_HELPER
-/* Exteneded helper funtions have one more argument of address
-   to which pc is returned after setting TLB entry */
-#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
-#error You need CONFIG_QEMU_LDST_OPTIMIZATION!
-#endif
-#undef HELPER_PREFIX
-#define HELPER_PREFIX __ext_
-#define RET_PARAM , uintptr_t raddr
-#define RET_VAR raddr
-#define GET_RET_ADDR() RET_VAR
-#else
-#define RET_PARAM
-#define RET_VAR
-#define GET_RET_ADDR() GETPC()
-#endif /* USE_EXTENDED_HELPER */
-#endif /* !CONFIG_TCG_PASS_AREG0 */
-
-
-#ifndef USE_EXTENDED_HELPER
 static DATA_TYPE glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(ENV_PARAM
                                                         target_ulong addr,
                                                         int mmu_idx,
@@ -122,7 +101,6 @@ static inline DATA_TYPE glue(io_read, SUFFIX)(ENV_PARAM
 #endif /* SHIFT > 2 */
     return res;
 }
-#endif /* !USE_EXTENDED_HELPER */

 /* handle all cases except unaligned access which span two pages */
 DATA_TYPE
@@ -146,13 +124,13 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
             /* IO access */
             if ((addr & (DATA_SIZE - 1)) != 0)
                 goto do_unaligned_access;
-            retaddr = GETPC();
+            retaddr = GETPC_EXT();
             ioaddr = env->iotlb[mmu_idx][index];
             res = glue(io_read, SUFFIX)(ENV_VAR ioaddr, addr, retaddr);
         } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) {
             /* slow unaligned access (it spans two pages or IO) */
         do_unaligned_access:
-            retaddr = GETPC();
+            retaddr = GETPC_EXT();
 #ifdef ALIGNED_ONLY
             do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
 #endif
@@ -163,7 +141,7 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
             uintptr_t addend;
 #ifdef ALIGNED_ONLY
             if ((addr & (DATA_SIZE - 1)) != 0) {
-                retaddr = GETPC();
+                retaddr = GETPC_EXT();
                 do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
             }
 #endif
@@ -173,7 +151,7 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
         }
     } else {
         /* the page is not in the TLB : fill it */
-        retaddr = GETPC();
+        retaddr = GETPC_EXT();
 #ifdef ALIGNED_ONLY
         if ((addr & (DATA_SIZE - 1)) != 0)
             do_unaligned_access(ENV_VAR addr, READ_ACCESS_TYPE, mmu_idx, retaddr);
@@ -184,7 +162,6 @@ glue(glue(glue(HELPER_PREFIX, ld), SUFFIX), MMUSUFFIX)(ENV_PARAM
     return res;
 }

-#ifndef USE_EXTENDED_HELPER
 /* handle all unaligned cases */
 static DATA_TYPE
 glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(ENV_PARAM
@@ -236,11 +213,9 @@ glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(ENV_PARAM
     }
     return res;
 }
-#endif /* !USE_EXTENDED_HELPER */

 #ifndef SOFTMMU_CODE_ACCESS

-#ifndef USE_EXTENDED_HELPER
 static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_PARAM
                                                    target_ulong addr,
                                                    DATA_TYPE val,
@@ -277,7 +252,6 @@ static inline void glue(io_write, SUFFIX)(ENV_PARAM
 #endif
 #endif /* SHIFT > 2 */
 }
-#endif /* !USE_EXTENDED_HELPER */

 void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
                                                             target_ulong addr,
@@ -297,12 +271,12 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
             /* IO access */
             if ((addr & (DATA_SIZE - 1)) != 0)
                 goto do_unaligned_access;
-            retaddr = GETPC();
+            retaddr = GETPC_EXT();
             ioaddr = env->iotlb[mmu_idx][index];
             glue(io_write, SUFFIX)(ENV_VAR ioaddr, val, addr, retaddr);
         } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) {
         do_unaligned_access:
-            retaddr = GETPC();
+            retaddr = GETPC_EXT();
 #ifdef ALIGNED_ONLY
             do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
 #endif
@@ -313,7 +287,7 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
             uintptr_t addend;
 #ifdef ALIGNED_ONLY
             if ((addr & (DATA_SIZE - 1)) != 0) {
-                retaddr = GETPC();
+                retaddr = GETPC_EXT();
                 do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
             }
 #endif
@@ -323,7 +297,7 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
         }
     } else {
         /* the page is not in the TLB : fill it */
-        retaddr = GETPC();
+        retaddr = GETPC_EXT();
 #ifdef ALIGNED_ONLY
         if ((addr & (DATA_SIZE - 1)) != 0)
             do_unaligned_access(ENV_VAR addr, 1, mmu_idx, retaddr);
@@ -333,7 +307,6 @@ void glue(glue(glue(HELPER_PREFIX, st), SUFFIX), MMUSUFFIX)(ENV_PARAM
     }
 }

-#ifndef USE_EXTENDED_HELPER
 /* handles all unaligned cases */
 static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_PARAM
                                                    target_ulong addr,
@@ -383,7 +356,6 @@ static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_PARAM
             goto redo;
         }
     }
 }
-#endif /* !USE_EXTENDED_HELPER */

 #endif /* !defined(SOFTMMU_CODE_ACCESS) */

@@ -398,6 +370,3 @@ static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(ENV_PARAM
 #undef ENV_VAR
 #undef CPU_PREFIX
 #undef HELPER_PREFIX
-#undef RET_PARAM
-#undef RET_VAR
-#undef GET_RET_ADDR
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index 92f9fda..d77bfab 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -69,29 +69,6 @@ uint32_t HELPER(neon_tbl)(uint32_t ireg, uint32_t def,
 #define SHIFT 3
 #include "softmmu_template.h"

-
-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-/* Exteneded MMU helper funtions for qemu_ld/st optimization
-   Note that normal helper functions should be defined above
-   to avoid duplication of common functions, slow_ld/st and io_read/write.
- */
-#define USE_EXTENDED_HELPER
-
-#define SHIFT 0
-#include "softmmu_template.h"
-
-#define SHIFT 1
-#include "softmmu_template.h"
-
-#define SHIFT 2
-#include "softmmu_template.h"
-
-#define SHIFT 3
-#include "softmmu_template.h"
-
-#undef USE_EXTENDED_HELPER
-#endif /* CONFIG_QEMU_LDST_OPTIMIZATION && CONFIG_SOFTMMU */
-
 /* try to fill the TLB and return an exception if error. If retaddr is
    NULL, it means that the function was called in C code (i.e. not
    from generated code or from helper.c) */
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index ec7a561..88ab27e 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -984,8 +984,6 @@ static const void *qemu_st_helpers[4] = {
     helper_stq_mmu,
 };
 #else
-
-#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
 /* legacy helper signature: __ld_mmu(target_ulong addr, int
    mmu_idx) */
 static void *qemu_ld_helpers[4] = {
@@ -1003,27 +1001,11 @@ static void *qemu_st_helpers[4] = {
     __stl_mmu,
     __stq_mmu,
 };
-#else
-/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr, int
-   mmu_idx, uintptr raddr) */
-static void *qemu_ld_helpers[4] = {
-    __ext_ldb_mmu,
-    __ext_ldw_mmu,
-    __ext_ldl_mmu,
-    __ext_ldq_mmu,
-};
-
-/* extended legacy helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
-   int mmu_idx) */
-static void *qemu_st_helpers[4] = {
-    __ext_stb_mmu,
-    __ext_stw_mmu,
-    __ext_stl_mmu,
-    __ext_stq_mmu,
-};
+#endif

 static void add_qemu_ldst_label(TCGContext *s,
-                                int opc_ext,
+                                int is_ld,
+                                int opc,
                                 int data_reg,
                                 int data_reg2,
                                 int addrlo_reg,
@@ -1031,8 +1013,6 @@ static void add_qemu_ldst_label(TCGContext *s,
                                 int mem_index,
                                 uint8_t *raddr,
                                 uint8_t **label_ptr);
-#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
-#endif

 /* Perform the TLB load and compare.
@@ -1092,36 +1072,19 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,

     tcg_out_mov(s, type, r0, addrlo);

-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
     /* jne slow_path */
     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
-    if (!label_ptr) {
-        tcg_abort();
-    }
     label_ptr[0] = s->code_ptr;
     s->code_ptr += 4;
-#else
-    /* jne label1 */
-    tcg_out8(s, OPC_JCC_short + JCC_JNE);
-    label_ptr[0] = s->code_ptr;
-    s->code_ptr++;
-#endif

     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);

-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
         /* jne slow_path */
         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
         label_ptr[1] = s->code_ptr;
         s->code_ptr += 4;
-#else
-        /* jne label1 */
-        tcg_out8(s, OPC_JCC_short + JCC_JNE);
-        label_ptr[1] = s->code_ptr;
-        s->code_ptr++;
-#endif
     }

     /* TLB Hit.  */
@@ -1219,14 +1182,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
-#if TCG_TARGET_REG_BITS == 64
-    int arg_idx;
-#else
-    int stack_adjust;
-#endif
-#endif /* !defined(CONFIG_QEMU_LDST_OPTIMIZATION) */
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif

     data_reg = args[0];
@@ -1247,9 +1203,9 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);

-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
-    /* helper stub will be jumped back here */
+    /* Record the current context of a load into ldst label */
     add_qemu_ldst_label(s,
+                        1,
                         opc,
                         data_reg,
                         data_reg2,
@@ -1259,103 +1215,6 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
                         s->code_ptr,
                         label_ptr);
 #else
-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
-
-    /* TLB Miss.  */
-
-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
-    }
-
-    /* XXX: move that code at the end of the TB */
-#if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, mem_index);
-    stack_adjust = 4;
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
-        stack_adjust += 4;
-    }
-    tcg_out_push(s, args[addrlo_idx]);
-    stack_adjust += 4;
-#ifdef CONFIG_TCG_PASS_AREG0
-    tcg_out_push(s, TCG_AREG0);
-    stack_adjust += 4;
-#endif
-#else
-    /* The first argument is already loaded with addrlo.  */
-    arg_idx = 1;
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx],
-                 mem_index);
-#ifdef CONFIG_TCG_PASS_AREG0
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
-#endif
-#endif
-
-    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
-
-#if TCG_TARGET_REG_BITS == 32
-    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
-        /* Pop and discard.  This is 2 bytes smaller than the add.  */
-        tcg_out_pop(s, TCG_REG_ECX);
-    } else if (stack_adjust != 0) {
-        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
-    }
-#endif
-
-    switch(opc) {
-    case 0 | 4:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 1 | 4:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case 0:
-        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 1:
-        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
-        break;
-    case 2:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case 2 | 4:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
-#endif
-    case 3:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
-#endif /* defined(CONFIG_QEMU_LDST_OPTIMIZATION) */
-#else
     {
         int32_t offset = GUEST_BASE;
         int base = args[addrlo_idx];
@@ -1448,10 +1307,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
-#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
-    int stack_adjust;
-#endif
-    uint8_t *label_ptr[3];
+    uint8_t *label_ptr[2];
 #endif

     data_reg = args[0];
@@ -1472,10 +1328,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_st_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);

-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
-    /* helper stub will be jumped back here */
+    /* Record the current context of a store into ldst label */
     add_qemu_ldst_label(s,
-                        opc | HL_ST_MASK,
+                        0,
+                        opc,
                         data_reg,
                         data_reg2,
                         args[addrlo_idx],
@@ -1484,70 +1340,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
                         s->code_ptr,
                         label_ptr);
 #else
-    /* jmp label2 */
-    tcg_out8(s, OPC_JMP_short);
-    label_ptr[2] = s->code_ptr;
-    s->code_ptr++;
-
-    /* TLB Miss.  */
-
-    /* label1: */
-    *label_ptr[0] = s->code_ptr - label_ptr[0] - 1;
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        *label_ptr[1] = s->code_ptr - label_ptr[1] - 1;
-    }
-
-    /* XXX: move that code at the end of the TB */
-#if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, mem_index);
-    stack_adjust = 4;
-    if (opc == 3) {
-        tcg_out_push(s, data_reg2);
-        stack_adjust += 4;
-    }
-    tcg_out_push(s, data_reg);
-    stack_adjust += 4;
-    if (TARGET_LONG_BITS == 64) {
-        tcg_out_push(s, args[addrlo_idx + 1]);
-        stack_adjust += 4;
-    }
-    tcg_out_push(s, args[addrlo_idx]);
-    stack_adjust += 4;
-#ifdef CONFIG_TCG_PASS_AREG0
-    tcg_out_push(s, TCG_AREG0);
-    stack_adjust += 4;
-#endif
-#else
-    tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-                tcg_target_call_iarg_regs[1], data_reg);
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
-    stack_adjust = 0;
-#ifdef CONFIG_TCG_PASS_AREG0
-    /* XXX/FIXME: suboptimal */
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
-                tcg_target_call_iarg_regs[2]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
-                tcg_target_call_iarg_regs[1]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
-                tcg_target_call_iarg_regs[0]);
-    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
-                TCG_AREG0);
-#endif
-#endif
-
-    tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
-
-    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
-        /* Pop and discard.  This is 2 bytes smaller than the add.  */
-        tcg_out_pop(s, TCG_REG_ECX);
-    } else if (stack_adjust != 0) {
-        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
-    }
-
-    /* label2: */
-    *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
-#endif /* defined(CONFIG_QEMU_LDST_OPTIMIZATION) */
-#else
     {
         int32_t offset = GUEST_BASE;
         int base = args[addrlo_idx];
@@ -1574,15 +1366,14 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 #endif
 }

-#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
-/* optimization to reduce jump overheads for qemu_ld/st IRs */
-
+#if defined(CONFIG_SOFTMMU)
 /*
- * qemu_ld/st code generator call add_qemu_ldst_label,
- * so that slow case(TLB miss or I/O rw) is handled at the end of TB
+ * Record the context of a call to the out of line helper code for the slow path
+ * for a load or store, so that we can later generate the correct helper code
  */
 static void add_qemu_ldst_label(TCGContext *s,
-                                int opc_ext,
+                                int is_ld,
+                                int opc,
                                 int data_reg,
                                 int data_reg2,
                                 int addrlo_reg,
@@ -1594,34 +1385,35 @@ static void add_qemu_ldst_label(TCGContext *s,
     int idx;
     TCGLabelQemuLdst *label;

-    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST)
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
         tcg_abort();
+    }

     idx = s->nb_qemu_ldst_labels++;
     label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
-    label->opc_ext = opc_ext;
+    label->is_ld = is_ld;
+    label->opc = opc;
     label->datalo_reg = data_reg;
     label->datahi_reg = data_reg2;
     label->addrlo_reg = addrlo_reg;
     label->addrhi_reg = addrhi_reg;
     label->mem_index = mem_index;
     label->raddr = raddr;
-    if (!label_ptr) {
-        tcg_abort();
-    }
     label->label_ptr[0] = label_ptr[0];
-    label->label_ptr[1] = label_ptr[1];
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        label->label_ptr[1] = label_ptr[1];
+    }
 }

-/* generates slow case of qemu_ld at the end of TB */
+/*
+ * Generate code for the slow path for a load at the end of block
+ */
 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
 {
     int s_bits;
-    int opc = label->opc_ext & HL_OPC_MASK;
+    int opc = label->opc;
     int mem_index = label->mem_index;
-#if TCG_TARGET_REG_BITS == 64
-    int arg_idx;
-#else
+#if TCG_TARGET_REG_BITS == 32
     int stack_adjust;
     int addrlo_reg = label->addrlo_reg;
     int addrhi_reg = label->addrhi_reg;
@@ -1639,18 +1431,14 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
     }

-    /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,
-       uintptr_t raddr) */
 #if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+    tcg_out_pushi(s, mem_index);
     stack_adjust = 4;
-    tcg_out_pushi(s, mem_index); /* mmu index */
-    stack_adjust += 4;
     if (TARGET_LONG_BITS == 64) {
         tcg_out_push(s, addrhi_reg);
         stack_adjust += 4;
     }
-    tcg_out_push(s, addrlo_reg); /* guest addr */
+    tcg_out_push(s, addrlo_reg);
     stack_adjust += 4;
 #ifdef CONFIG_TCG_PASS_AREG0
     tcg_out_push(s, TCG_AREG0);
@@ -1658,11 +1446,8 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
 #endif
 #else
     /* The first argument is already loaded with addrlo.  */
-    arg_idx = 1;
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[1],
                  mem_index);
-    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
-                 (uintptr_t)(raddr - 1));
 #ifdef CONFIG_TCG_PASS_AREG0
     /* XXX/FIXME: suboptimal */
     tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
@@ -1676,8 +1461,26 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
 #endif
 #endif

+    /* Code generation of qemu_ld/st's slow path calling MMU helper
+
+       PRE_PROC ...
+       call MMU helper
+       jmp POST_PROC (2b) : short forward jump <- GETRA()
+       jmp next_code (5b) : dummy long backward jump which is never executed
+       POST_PROC ... : do post-processing <- GETRA() + 7
+       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
+     */
+
     tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);

+    /* Jump to post-processing code */
+    tcg_out8(s, OPC_JMP_short);
+    tcg_out8(s, 5);
+    /* Dummy backward jump carrying the fast path's pc for the MMU helpers */
+    tcg_out8(s, OPC_JMP_long);
+    *(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4);
+    s->code_ptr += 4;
+
 #if TCG_TARGET_REG_BITS == 32
     if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
         /* Pop and discard.  This is 2 bytes smaller than the add.  */
@@ -1724,16 +1527,18 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         tcg_abort();
     }

-    /* jump back to original code */
-    tcg_out_jmp(s, (tcg_target_long) raddr);
+    /* Jump to the code corresponding to next IR of qemu_ld */
+    tcg_out_jmp(s, (tcg_target_long)raddr);
 }

-/* generates slow case of qemu_st at the end of TB */
+/*
+ * Generate code for the slow path for a store at the end of block
+ */
 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
 {
     int s_bits;
     int stack_adjust;
-    int opc = label->opc_ext & HL_OPC_MASK;
+    int opc = label->opc;
     int mem_index = label->mem_index;
     int data_reg = label->datalo_reg;
 #if TCG_TARGET_REG_BITS == 32
@@ -1752,24 +1557,20 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
     }

-    /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
-       int mmu_idx, uintptr_t raddr) */
 #if TCG_TARGET_REG_BITS == 32
-    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+    tcg_out_pushi(s, mem_index);
     stack_adjust = 4;
-    tcg_out_pushi(s, mem_index); /* mmu index */
-    stack_adjust += 4;
     if (opc == 3) {
         tcg_out_push(s, data_reg2);
         stack_adjust += 4;
     }
-    tcg_out_push(s, data_reg); /* guest data */
+    tcg_out_push(s, data_reg);
     stack_adjust += 4;
     if (TARGET_LONG_BITS == 64) {
         tcg_out_push(s, addrhi_reg);
         stack_adjust += 4;
     }
-    tcg_out_push(s, addrlo_reg); /* guest addr */
+    tcg_out_push(s, addrlo_reg);
     stack_adjust += 4;
 #ifdef CONFIG_TCG_PASS_AREG0
     tcg_out_push(s, TCG_AREG0);
@@ -1779,7 +1580,6 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
     tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
                 tcg_target_call_iarg_regs[1], data_reg);
     tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
-    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3], (uintptr_t)(raddr - 1));
     stack_adjust = 0;
 #ifdef CONFIG_TCG_PASS_AREG0
     /* XXX/FIXME: suboptimal */
@@ -1794,8 +1594,26 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
 #endif
 #endif

+    /* Code generation of qemu_ld/st's slow path calling MMU helper
+
+       PRE_PROC ...
+       call MMU helper
+       jmp POST_PROC (2b) : short forward jump <- GETRA()
+       jmp next_code (5b) : dummy long backward jump which is never executed
+       POST_PROC ... : do post-processing <- GETRA() + 7
+       jmp next_code : jump to the code corresponding to next IR of qemu_ld/st
+     */
+
     tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);

+    /* Jump to post-processing code */
+    tcg_out8(s, OPC_JMP_short);
+    tcg_out8(s, 5);
+    /* Dummy backward jump carrying the fast path's pc for the MMU helpers */
+    tcg_out8(s, OPC_JMP_long);
+    *(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4);
+    s->code_ptr += 4;
+
     if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
         /* Pop and discard.  This is 2 bytes smaller than the add.  */
         tcg_out_pop(s, TCG_REG_ECX);
     } else if (stack_adjust != 0) {
@@ -1803,26 +1621,29 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
         tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
     }

-    /* jump back to original code */
-    tcg_out_jmp(s, (tcg_target_long) raddr);
+    /* Jump to the code corresponding to next IR of qemu_st */
+    tcg_out_jmp(s, (tcg_target_long)raddr);
 }

-/* generates all of the slow cases of qemu_ld/st at the end of TB */
-void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+/*
+ * Generate TB finalization at the end of block
+ */
+void tcg_out_tb_finalize(TCGContext *s)
 {
     int i;
     TCGLabelQemuLdst *label;

+    /* qemu_ld/st slow paths */
     for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
         label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
-        if (IS_QEMU_LD_LABEL(label)) {
+        if (label->is_ld) {
             tcg_out_qemu_ld_slow_path(s, label);
         } else {
             tcg_out_qemu_st_slow_path(s, label);
         }
     }
 }
-#endif /* defined(CONFIG_QEMU_LDST_OPTIMIZATION) */
+#endif /* CONFIG_SOFTMMU */

 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
                               const int *const_args)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 3ab4190..5eac190 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,12 +301,12 @@ void tcg_func_start(TCGContext *s)
     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+
 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-    /* initialize qemu_ld/st labels which help to generate TLB miss case codes at the end of TB */
-    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) * TCG_MAX_QEMU_LDST);
-    if (!s->qemu_ldst_labels) {
-        tcg_abort();
-    }
+    /* Initialize qemu_ld/st labels to assist code generation for TLB miss
+       cases at the end of TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+                                     TCG_MAX_QEMU_LDST);
     s->nb_qemu_ldst_labels = 0;
 #endif
 }
@@ -2178,8 +2178,8 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
     }
  the_end:
 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-    /* Generate MMU call helpers at the end of block (currently only for qemu_ld/st) */
-    tcg_out_qemu_ldst_slow_path(s);
+    /* Generate TB finalization at the end of block */
+    tcg_out_tb_finalize(s);
 #endif
     return -1;
 }
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 880f35a..08bbbd8 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -188,27 +188,22 @@ typedef tcg_target_ulong TCGArg;
  */

 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-/* Macros and structures for qemu_ld/st IR code optimization:
-   It looks good for TCG_MAX_HELPER_LABELS to be half of OPC_BUF_SIZE in exec-all.h. */
-#define TCG_MAX_QEMU_LDST 320
-#define HL_LDST_SHIFT 4
-#define HL_LDST_MASK (1 << HL_LDST_SHIFT)
-#define HL_ST_MASK HL_LDST_MASK
-#define HL_OPC_MASK (HL_LDST_MASK - 1)
-#define IS_QEMU_LD_LABEL(L) (!((L)->opc_ext & HL_LDST_MASK))
-#define IS_QEMU_ST_LABEL(L) ((L)->opc_ext & HL_LDST_MASK)
+/* Macros/structures for qemu_ld/st IR code optimization:
+   TCG_MAX_QEMU_LDST is defined to match OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST 640

 typedef struct TCGLabelQemuLdst {
-    int opc_ext; /* | 27bit (reserved) | 1bit (ld/st flag) | 4bit (opc) | */
-    int addrlo_reg; /* reg index for the low word of guest virtual address */
-    int addrhi_reg; /* reg index for the high word of guest virtual address */
-    int datalo_reg; /* reg index for the low word to be loaded or to be stored */
-    int datahi_reg; /* reg index for the high word to be loaded or to be stored */
-    int mem_index; /* soft MMU memory index */
-    uint8_t *raddr; /* return address (located end of TB) */
+    int is_ld:1;     /* qemu_ld: 1, qemu_st: 0 */
+    int opc:4;
+    int addrlo_reg;  /* reg index for low word of guest virtual addr */
+    int addrhi_reg;  /* reg index for high word of guest virtual addr */
+    int datalo_reg;  /* reg index for low word to be loaded or stored */
+    int datahi_reg;  /* reg index for high word to be loaded or stored */
+    int mem_index;   /* soft MMU memory index */
+    uint8_t *raddr;  /* gen code addr of the next IR of qemu_ld/st IR */
     uint8_t *label_ptr[2]; /* label pointers to be updated */
 } TCGLabelQemuLdst;
-#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+#endif

 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
@@ -617,9 +612,9 @@ extern uint8_t code_gen_prologue[];
     ((tcg_target_ulong (*)(void *, void *))code_gen_prologue)(env, tb_ptr)
 #endif

+void tcg_register_jit(void *buf, size_t buf_size);
+
 #if defined(CONFIG_QEMU_LDST_OPTIMIZATION) && defined(CONFIG_SOFTMMU)
-/* qemu_ld/st generation at the end of TB */
-void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+/* Generate TB finalization at the end of block */
+void tcg_out_tb_finalize(TCGContext *s);
 #endif
-
-void tcg_register_jit(void *buf, size_t buf_size);
-- 
2.7.4
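
The GETPC_LDST() macro added in exec-all.h is easiest to follow with the byte
layout in front of you. The stand-alone C sketch below (not part of the patch;
the buffer, offsets, and names are illustrative only) builds the two-jump
sequence that the slow path emits into a plain byte buffer, then recovers the
fast path pc with the same "GETRA() + 7 + rel32 - 1" arithmetic the macro uses:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Recover the fast path pc the way GETPC_LDST() does: ra points just
       after the "call MMU helper"; ra[0..1] hold "jmp short POST_PROC"
       (EB 05) and ra[2..6] hold "jmp rel32 NEXT_CODE" (E9 xx xx xx xx).
       The rel32 is relative to the end of the 5-byte jmp, i.e. ra + 7. */
    static uintptr_t getpc_ldst(uintptr_t ra)
    {
        int32_t disp;
        memcpy(&disp, (const void *)(ra + 3), 4); /* rel32 of the long jmp */
        return ra + 7 + disp - 1;       /* jmp target, minus 1 like GETPC() */
    }

    int main(void)
    {
        uint8_t buf[64] = { 0 };
        uintptr_t ra = (uintptr_t)&buf[8];         /* call returns here */
        uintptr_t next_code = (uintptr_t)&buf[40]; /* code after fast path */
        int32_t disp = (int32_t)(next_code - (ra + 7));

        buf[8] = 0xEB; buf[9] = 0x05;              /* jmp POST_PROC (2 bytes) */
        buf[10] = 0xE9;                            /* jmp NEXT_CODE (5 bytes) */
        memcpy(&buf[11], &disp, 4);

        assert(getpc_ldst(ra) == next_code - 1);
        return 0;
    }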
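GETPC_EXT() then chooses between the two pc-recovery schemes with nothing more
than the range check added in exec.c. A minimal model of that dispatch, using
a stand-in array in place of the real code_gen_buffer globals (assumed names,
not QEMU's), would look like:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for QEMU's code_gen_buffer bookkeeping */
    static uint8_t fake_code_gen_buffer[4096];
    static const size_t fake_code_gen_buffer_max_size =
        sizeof(fake_code_gen_buffer);

    /* Same shape as the patch's is_tcg_gen_code(): the upper bound uses the
       buffer capacity, not the current fill pointer, so it stays valid while
       code generation is still in progress. */
    static bool is_tcg_gen_code(uintptr_t tc_ptr)
    {
        return tc_ptr >= (uintptr_t)fake_code_gen_buffer &&
               tc_ptr < (uintptr_t)(fake_code_gen_buffer +
                                    fake_code_gen_buffer_max_size);
    }

    int main(void)
    {
        uintptr_t inside = (uintptr_t)&fake_code_gen_buffer[128];
        uintptr_t outside = (uintptr_t)&main;

        /* GETPC_EXT() would use GETPC_LDST() for the first address and
           plain GETPC() for the second. */
        printf("inside  -> %s\n", is_tcg_gen_code(inside) ? "GETPC_LDST()" : "GETPC()");
        printf("outside -> %s\n", is_tcg_gen_code(outside) ? "GETPC_LDST()" : "GETPC()");
        return 0;
    }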
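Finally, the tcg/i386/tcg-target.c changes follow a record-then-finalize
pattern: each qemu_ld/st fast path records a TCGLabelQemuLdst entry, and
tcg_out_tb_finalize() walks the array once at the end of the block. A
stripped-down model of that bookkeeping (hypothetical types; the real labels
also store registers, mem_index, raddr, and the jump patch locations) is:

    #include <assert.h>
    #include <stdbool.h>

    #define MAX_LDST 640                /* mirrors TCG_MAX_QEMU_LDST */

    typedef struct {
        bool is_ld;                     /* load or store slow path */
        int opc;                        /* operand size/sign, as in the patch */
    } LdstLabel;

    typedef struct {
        LdstLabel labels[MAX_LDST];
        int nb_labels;
    } Ctx;

    /* Fast-path emission records a label... */
    static void add_label(Ctx *s, bool is_ld, int opc)
    {
        assert(s->nb_labels < MAX_LDST);
        s->labels[s->nb_labels++] = (LdstLabel){ .is_ld = is_ld, .opc = opc };
    }

    /* ...and finalization walks them, emitting each slow path at the end of
       the translation block, the same shape as tcg_out_tb_finalize(). */
    static void tb_finalize(Ctx *s)
    {
        for (int i = 0; i < s->nb_labels; i++) {
            if (s->labels[i].is_ld) {
                /* tcg_out_qemu_ld_slow_path(s, &s->labels[i]); */
            } else {
                /* tcg_out_qemu_st_slow_path(s, &s->labels[i]); */
            }
        }
    }

    int main(void)
    {
        Ctx s = { .nb_labels = 0 };
        add_label(&s, true, 2);         /* a 32-bit load */
        add_label(&s, false, 3);        /* a 64-bit store */
        tb_finalize(&s);
        return 0;
    }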