From: yeongkyoon.lee Date: Wed, 30 Nov 2011 06:59:15 +0000 (+0900) Subject: [Title] Add x86 host TCG optimization and some profiling codes X-Git-Tag: TizenStudio_2.0_p2.3~1694^2~238^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d3b33ec4703c1251842cf4ed310e4770572a3a80;p=sdk%2Femulator%2Fqemu.git [Title] Add x86 host TCG optimization and some profiling codes [Type] Enhancement [Module] TCG [Priority] major [CQ#] [Redmine#] [Problem] [Cause] TCG-generated code for qemu_ld/st IR has some jump overheads [Solution] Reduce jump overheads for qemu_ld/st IR code [TestCase] CoreMark, booting --- diff --git a/configure b/configure index b5de2cc..caac636 100755 --- a/configure +++ b/configure @@ -629,6 +629,8 @@ for opt do ;; --enable-profiler) profiler="yes" ;; + --enable-tcg-x86-opt) tcg_x86_opt="yes" + ;; --enable-cocoa) cocoa="yes" ; sdl="no" ; @@ -2431,6 +2433,7 @@ echo "gprof enabled $gprof" echo "sparse enabled $sparse" echo "strip binaries $strip_opt" echo "profiler $profiler" +echo "TCG optimization $tcg_x86_opt" echo "static build $static" echo "-Werror enabled $werror" if test "$darwin" = "yes" ; then @@ -2570,6 +2573,9 @@ fi if test $profiler = "yes" ; then echo "CONFIG_PROFILER=y" >> $config_host_mak fi +if test $tcg_x86_opt = "yes" ; then + echo "CONFIG_TCG_TARGET_X86_OPT=y" >> $config_host_mak +fi if test "$slirp" = "yes" ; then echo "CONFIG_SLIRP=y" >> $config_host_mak QEMU_INCLUDES="-I\$(SRC_PATH)/slirp $QEMU_INCLUDES" diff --git a/exec-all.h b/exec-all.h index e3a82bc..8871d9a 100644 --- a/exec-all.h +++ b/exec-all.h @@ -88,7 +88,7 @@ int cpu_restore_state(struct TranslationBlock *tb, void *puc); void cpu_resume_from_signal(CPUState *env1, void *puc); void cpu_io_recompile(CPUState *env, void *retaddr); -TranslationBlock *tb_gen_code(CPUState *env, +TranslationBlock *tb_gen_code(CPUState *env, target_ulong pc, target_ulong cs_base, int flags, int cflags); void cpu_exec_init(CPUState *env); @@ -158,6 +158,9 @@ struct TranslationBlock { struct TranslationBlock *jmp_next[2]; struct TranslationBlock *jmp_first; uint32_t icount; +#ifdef CONFIG_EXEC_PROFILE + uint32_t tbexec_count[2]; +#endif }; static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) @@ -334,6 +337,18 @@ static inline tb_page_addr_t get_page_addr_code(CPUState *env1, target_ulong add + env1->tlb_table[mmu_idx][page_index].addend; return qemu_ram_addr_from_host_nofail(p); } + +#if defined(CONFIG_TCG_TARGET_X86_OPT) +/* extended versions of MMU helpers for x86 TCG target optimization */ +uint8_t REGPARM __ldextb_mmu(target_ulong addr, int mmu_idx, void *ra); +void REGPARM __stextb_mmu(target_ulong addr, uint8_t val, int mmu_idx, void *ra); +uint16_t REGPARM __ldextw_mmu(target_ulong addr, int mmu_idx, void *ra); +void REGPARM __stextw_mmu(target_ulong addr, uint16_t val, int mmu_idx, void *ra); +uint32_t REGPARM __ldextl_mmu(target_ulong addr, int mmu_idx, void *ra); +void REGPARM __stextl_mmu(target_ulong addr, uint32_t val, int mmu_idx, void *ra); +uint64_t REGPARM __ldextq_mmu(target_ulong addr, int mmu_idx, void *ra); +void REGPARM __stextq_mmu(target_ulong addr, uint64_t val, int mmu_idx, void *ra); +#endif /* CONFIG_TCG_TARGET_X86_OPT */ #endif typedef void (CPUDebugExcpHandler)(CPUState *env); diff --git a/qemu_configure.sh b/qemu_configure.sh index 370e862..31bc7dd 100755 --- a/qemu_configure.sh +++ b/qemu_configure.sh @@ -13,6 +13,8 @@ exec ./configure \ --enable-mixemu \ --disable-vnc-tls \ --extra-ldflags="-lv4l2 -lv4lconvert" +# --enable-tcg-x86-opt \ +# --enable-debug \ 
#--enable-profiler \ # --enable-gles2 --gles2dir=/usr ;; diff --git a/softmmu_template.h b/softmmu_template.h index c2df9ec..73136ae 100644 --- a/softmmu_template.h +++ b/softmmu_template.h @@ -137,6 +137,69 @@ DATA_TYPE REGPARM glue(glue(__ld, SUFFIX), MMUSUFFIX)(target_ulong addr, return res; } +#if defined(CONFIG_TCG_TARGET_X86_OPT) && !defined(SOFTMMU_CODE_ACCESS) +/* + * extended versions of MMU helpers for x86 TCG target optimization + * !defined(SOFTMMU_CODE_ACCESS) suppress warnings from exec.c + */ +DATA_TYPE REGPARM glue(glue(__ldext, SUFFIX), MMUSUFFIX)(target_ulong addr, + int mmu_idx, + void *ra) +{ + DATA_TYPE res; + int index; + target_ulong tlb_addr; + target_phys_addr_t ioaddr; + unsigned long addend; + void *retaddr; + + /* test if there is match for unaligned or IO access */ + /* XXX: could done more in memory macro in a non portable way */ + index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + redo: + tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ; + if ((addr & TARGET_PAGE_MASK) == (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (tlb_addr & ~TARGET_PAGE_MASK) { + /* IO access */ + if ((addr & (DATA_SIZE - 1)) != 0) + goto do_unaligned_access; + retaddr = ra; + ioaddr = env->iotlb[mmu_idx][index]; + res = glue(io_read, SUFFIX)(ioaddr, addr, retaddr); + } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) { + /* slow unaligned access (it spans two pages or IO) */ + do_unaligned_access: + retaddr = ra; +#ifdef ALIGNED_ONLY + do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +#endif + res = glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(addr, + mmu_idx, retaddr); + } else { + /* unaligned/aligned access in the same page */ +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) { + retaddr = ra; + do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); + } +#endif + addend = env->tlb_table[mmu_idx][index].addend; + res = glue(glue(ld, USUFFIX), _raw)((uint8_t *)(long)(addr+addend)); + } + } else { + /* the page is not in the TLB : fill it */ + retaddr = ra; +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) + do_unaligned_access(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); +#endif + tlb_fill(addr, READ_ACCESS_TYPE, mmu_idx, retaddr); + goto redo; + } + return res; +} +#endif /* CONFIG_TCG_TARGET_X86_OPT */ + /* handle all unaligned cases */ static DATA_TYPE glue(glue(slow_ld, SUFFIX), MMUSUFFIX)(target_ulong addr, int mmu_idx, @@ -274,6 +337,65 @@ void REGPARM glue(glue(__st, SUFFIX), MMUSUFFIX)(target_ulong addr, } } +#if defined(CONFIG_TCG_TARGET_X86_OPT) +/* + * extended versions of MMU helpers for x86 TCG target optimization + * !defined(SOFTMMU_CODE_ACCESS) suppress warnings from exec.c + */ +void REGPARM glue(glue(__stext, SUFFIX), MMUSUFFIX)(target_ulong addr, + DATA_TYPE val, + int mmu_idx, + void *ra) +{ + target_phys_addr_t ioaddr; + unsigned long addend; + target_ulong tlb_addr; + void *retaddr; + int index; + + index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); + redo: + tlb_addr = env->tlb_table[mmu_idx][index].addr_write; + if ((addr & TARGET_PAGE_MASK) == (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { + if (tlb_addr & ~TARGET_PAGE_MASK) { + /* IO access */ + if ((addr & (DATA_SIZE - 1)) != 0) + goto do_unaligned_access; + retaddr = ra; + ioaddr = env->iotlb[mmu_idx][index]; + glue(io_write, SUFFIX)(ioaddr, val, addr, retaddr); + } else if (((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1) >= TARGET_PAGE_SIZE) { + do_unaligned_access: + retaddr = ra; +#ifdef ALIGNED_ONLY + 
do_unaligned_access(addr, 1, mmu_idx, retaddr); +#endif + glue(glue(slow_st, SUFFIX), MMUSUFFIX)(addr, val, + mmu_idx, retaddr); + } else { + /* aligned/unaligned access in the same page */ +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) { + retaddr = ra; + do_unaligned_access(addr, 1, mmu_idx, retaddr); + } +#endif + addend = env->tlb_table[mmu_idx][index].addend; + glue(glue(st, SUFFIX), _raw)((uint8_t *)(long)(addr+addend), val); + } + } else { + /* the page is not in the TLB : fill it */ + retaddr = ra; +#ifdef ALIGNED_ONLY + if ((addr & (DATA_SIZE - 1)) != 0) + do_unaligned_access(addr, 1, mmu_idx, retaddr); +#endif + tlb_fill(addr, 1, mmu_idx, retaddr); + goto redo; + } +} +#endif /* CONFIG_TCG_TARGET_X86_OPT */ + /* handles all unaligned cases */ static void glue(glue(slow_st, SUFFIX), MMUSUFFIX)(target_ulong addr, DATA_TYPE val, diff --git a/target-i386/translate.c b/target-i386/translate.c index 0162da7..df66f72 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -389,7 +389,7 @@ static inline void gen_op_addq_A0_im(int64_t val) tcg_gen_addi_tl(cpu_A0, cpu_A0, val); } #endif - + static void gen_add_A0_im(DisasContext *s, int val) { #ifdef TARGET_X86_64 @@ -665,7 +665,7 @@ static inline void gen_string_movl_A0_EDI(DisasContext *s) } } -static inline void gen_op_movl_T0_Dshift(int ot) +static inline void gen_op_movl_T0_Dshift(int ot) { tcg_gen_ld32s_tl(cpu_T[0], cpu_env, offsetof(CPUState, df)); tcg_gen_shli_tl(cpu_T[0], cpu_T[0], ot); @@ -962,7 +962,7 @@ static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1) case CC_OP_SUBW: case CC_OP_SUBL: case CC_OP_SUBQ: - + size = cc_op - CC_OP_SUBB; switch(jcc_op) { case JCC_Z: @@ -993,28 +993,28 @@ static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1) switch(size) { case 0: tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x80); - tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, 0, l1); break; case 1: tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x8000); - tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, 0, l1); break; #ifdef TARGET_X86_64 case 2: tcg_gen_andi_tl(cpu_tmp0, cpu_cc_dst, 0x80000000); - tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_tmp0, 0, l1); break; #endif default: - tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, cpu_cc_dst, + tcg_gen_brcondi_tl(inv ? TCG_COND_GE : TCG_COND_LT, cpu_cc_dst, 0, l1); break; } break; - + case JCC_B: cond = inv ? TCG_COND_GEU : TCG_COND_LTU; goto fast_jcc_b; @@ -1046,7 +1046,7 @@ static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1) } tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1); break; - + case JCC_L: cond = inv ? 
TCG_COND_GE : TCG_COND_LT; goto fast_jcc_l; @@ -1078,48 +1078,48 @@ static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1) } tcg_gen_brcond_tl(cond, cpu_tmp4, t0, l1); break; - + default: goto slow_jcc; } break; - + /* some jumps are easy to compute */ case CC_OP_ADDB: case CC_OP_ADDW: case CC_OP_ADDL: case CC_OP_ADDQ: - + case CC_OP_ADCB: case CC_OP_ADCW: case CC_OP_ADCL: case CC_OP_ADCQ: - + case CC_OP_SBBB: case CC_OP_SBBW: case CC_OP_SBBL: case CC_OP_SBBQ: - + case CC_OP_LOGICB: case CC_OP_LOGICW: case CC_OP_LOGICL: case CC_OP_LOGICQ: - + case CC_OP_INCB: case CC_OP_INCW: case CC_OP_INCL: case CC_OP_INCQ: - + case CC_OP_DECB: case CC_OP_DECW: case CC_OP_DECL: case CC_OP_DECQ: - + case CC_OP_SHLB: case CC_OP_SHLW: case CC_OP_SHLL: case CC_OP_SHLQ: - + case CC_OP_SARB: case CC_OP_SARW: case CC_OP_SARL: @@ -1138,7 +1138,7 @@ static inline void gen_jcc1(DisasContext *s, int cc_op, int b, int l1) default: slow_jcc: gen_setcc_slow_T0(s, jcc_op); - tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, + tcg_gen_brcondi_tl(inv ? TCG_COND_EQ : TCG_COND_NE, cpu_T[0], 0, l1); break; } @@ -1430,7 +1430,7 @@ static void gen_inc(DisasContext *s1, int ot, int d, int c) tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); } -static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, +static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, int is_right, int is_arith) { target_ulong mask; @@ -1472,7 +1472,7 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, gen_op_st_T0_A0(ot + s->mem_index); else gen_op_mov_reg_T0(ot, op1); - + /* update eflags if non zero shift */ if (s->cc_op != CC_OP_DYNAMIC) gen_op_set_cc_op(s->cc_op); @@ -1493,7 +1493,7 @@ static void gen_shift_rm_T1(DisasContext *s, int ot, int op1, tcg_gen_movi_i32(cpu_cc_op, CC_OP_SARB + ot); else tcg_gen_movi_i32(cpu_cc_op, CC_OP_SHLB + ot); - + gen_set_label(shift_label); s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ @@ -1505,7 +1505,7 @@ static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2, int is_right, int is_arith) { int mask; - + if (ot == OT_QUAD) mask = 0x3f; else @@ -1540,7 +1540,7 @@ static void gen_shift_rm_im(DisasContext *s, int ot, int op1, int op2, gen_op_st_T0_A0(ot + s->mem_index); else gen_op_mov_reg_T0(ot, op1); - + /* update eflags if non zero shift */ if (op2 != 0) { tcg_gen_mov_tl(cpu_cc_src, cpu_tmp4); @@ -1560,7 +1560,7 @@ static inline void tcg_gen_lshift(TCGv ret, TCGv arg1, target_long arg2) tcg_gen_shri_tl(ret, arg1, -arg2); } -static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, +static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, int is_right) { target_ulong mask; @@ -1594,12 +1594,12 @@ static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, shifts. 
*/ label1 = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, t1, 0, label1); - + if (ot <= OT_WORD) tcg_gen_andi_tl(cpu_tmp0, t1, (1 << (3 + ot)) - 1); else tcg_gen_mov_tl(cpu_tmp0, t1); - + gen_extu(ot, t0); tcg_gen_mov_tl(t2, t0); @@ -1624,7 +1624,7 @@ static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, } else { gen_op_mov_reg_v(ot, op1, t0); } - + /* update eflags */ if (s->cc_op != CC_OP_DYNAMIC) gen_op_set_cc_op(s->cc_op); @@ -1643,10 +1643,10 @@ static void gen_rot_rm_T1(DisasContext *s, int ot, int op1, } tcg_gen_andi_tl(t0, t0, CC_C); tcg_gen_or_tl(cpu_cc_src, cpu_cc_src, t0); - + tcg_gen_discard_tl(cpu_cc_dst); tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS); - + gen_set_label(label2); s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ @@ -1734,7 +1734,7 @@ static void gen_rot_rm_im(DisasContext *s, int ot, int op1, int op2, } /* XXX: add faster immediate = 1 case */ -static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, +static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, int is_right) { int label1; @@ -1747,7 +1747,7 @@ static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, gen_op_ld_T0_A0(ot + s->mem_index); else gen_op_mov_TN_reg(ot, 0, op1); - + if (is_right) { switch (ot) { case 0: gen_helper_rcrb(cpu_T[0], cpu_T[0], cpu_T[1]); break; @@ -1780,13 +1780,13 @@ static void gen_rotc_rm_T1(DisasContext *s, int ot, int op1, tcg_gen_mov_tl(cpu_cc_src, cpu_cc_tmp); tcg_gen_discard_tl(cpu_cc_dst); tcg_gen_movi_i32(cpu_cc_op, CC_OP_EFLAGS); - + gen_set_label(label1); s->cc_op = CC_OP_DYNAMIC; /* cannot predict flags after */ } /* XXX: add faster immediate case */ -static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, +static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, int is_right) { int label1, label2, data_bits; @@ -1820,7 +1820,7 @@ static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, shifts. 
*/ label1 = gen_new_label(); tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, label1); - + tcg_gen_addi_tl(cpu_tmp5, t2, -1); if (ot == OT_WORD) { /* Note: we implement the Intel behaviour for shift count > 16 */ @@ -1831,7 +1831,7 @@ static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, tcg_gen_ext32u_tl(t0, t0); tcg_gen_shr_tl(cpu_tmp4, t0, cpu_tmp5); - + /* only needed if count > 16, but a test would complicate */ tcg_gen_subfi_tl(cpu_tmp5, 32, t2); tcg_gen_shl_tl(cpu_tmp0, t0, cpu_tmp5); @@ -1845,7 +1845,7 @@ static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, tcg_gen_shli_tl(t1, t1, 16); tcg_gen_or_tl(t1, t1, t0); tcg_gen_ext32u_tl(t1, t1); - + tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5); tcg_gen_subfi_tl(cpu_tmp0, 32, cpu_tmp5); tcg_gen_shr_tl(cpu_tmp5, t1, cpu_tmp0); @@ -1868,13 +1868,13 @@ static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2); tcg_gen_shl_tl(t1, t1, cpu_tmp5); tcg_gen_or_tl(t0, t0, t1); - + } else { if (ot == OT_LONG) tcg_gen_ext32u_tl(t1, t1); tcg_gen_shl_tl(cpu_tmp4, t0, cpu_tmp5); - + tcg_gen_shl_tl(t0, t0, t2); tcg_gen_subfi_tl(cpu_tmp5, data_bits, t2); tcg_gen_shr_tl(t1, t1, cpu_tmp5); @@ -1890,7 +1890,7 @@ static void gen_shiftd_rm_T1_T3(DisasContext *s, int ot, int op1, } else { gen_op_mov_reg_v(ot, op1, t0); } - + /* update eflags */ if (s->cc_op != CC_OP_DYNAMIC) gen_op_set_cc_op(s->cc_op); @@ -2289,6 +2289,21 @@ static inline int insn_const_size(unsigned int ot) return 4; } +#ifdef CONFIG_EXEC_PROFILE +/* generation of TB execution profiling */ +static inline void gen_prof_tbexec(DisasContext *s, int tb_num) +{ + tcg_gen_movi_tl(cpu_T[0], (target_ulong)((void *)s->tb + + offsetof(TranslationBlock, tbexec_count) + + sizeof (uint32_t) * (tb_num & 0x1))); + tcg_gen_ld32s_tl(cpu_T[1], cpu_T[0], 0); + tcg_gen_addi_i32(cpu_T[1], cpu_T[1], 1); + tcg_gen_st32_tl(cpu_T[1], cpu_T[0], 0); +} +#else +# define gen_prof_tbexec(s, tb_num) +#endif /* CONFIG_EXEC_PROFILE */ + static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip) { TranslationBlock *tb; @@ -2299,6 +2314,8 @@ static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong eip) /* NOTE: we handle the case where the TB spans two pages here */ if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) || (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK)) { + /* profile TB execution, yklee 20111112 */ + gen_prof_tbexec(s, tb_num); /* jump to same page: we can use a direct jump */ tcg_gen_goto_tb(tb_num); gen_jmp_im(eip); @@ -2320,7 +2337,7 @@ static inline void gen_jcc(DisasContext *s, int b, if (s->jmp_opt) { l1 = gen_new_label(); gen_jcc1(s, cc_op, b, l1); - + gen_goto_tb(s, 0, next_eip); gen_set_label(l1); @@ -2373,17 +2390,17 @@ static void gen_setcc(DisasContext *s, int b) static inline void gen_op_movl_T0_seg(int seg_reg) { - tcg_gen_ld32u_tl(cpu_T[0], cpu_env, + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,segs[seg_reg].selector)); } static inline void gen_op_movl_seg_T0_vm(int seg_reg) { tcg_gen_andi_tl(cpu_T[0], cpu_T[0], 0xffff); - tcg_gen_st32_tl(cpu_T[0], cpu_env, + tcg_gen_st32_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,segs[seg_reg].selector)); tcg_gen_shli_tl(cpu_T[0], cpu_T[0], 4); - tcg_gen_st_tl(cpu_T[0], cpu_env, + tcg_gen_st_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,segs[seg_reg].base)); } @@ -2703,7 +2720,7 @@ static void gen_interrupt(DisasContext *s, int intno, gen_op_set_cc_op(s->cc_op); gen_jmp_im(cur_eip); //gen_heler_test_interrupt(); - 
gen_helper_raise_interrupt(tcg_const_i32(intno), + gen_helper_raise_interrupt(tcg_const_i32(intno), tcg_const_i32(next_eip - cur_eip)); s->is_jmp = DISAS_TB_JUMP; } @@ -2737,6 +2754,7 @@ static void gen_eob(DisasContext *s) } else if (s->tf) { gen_helper_single_step(); } else { + gen_prof_tbexec(s, 0); tcg_gen_exit_tb(0); } s->is_jmp = DISAS_TB_JUMP; @@ -3218,7 +3236,7 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) #endif { gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0); - tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,fpregs[reg].mmx)); tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); gen_helper_movl_mm_T0_mmx(cpu_ptr0, cpu_tmp2_i32); @@ -3228,14 +3246,14 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) #ifdef TARGET_X86_64 if (s->dflag == 2) { gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 0); - tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[reg])); gen_helper_movq_mm_T0_xmm(cpu_ptr0, cpu_T[0]); } else #endif { gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 0); - tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[reg])); tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); gen_helper_movl_mm_T0_xmm(cpu_ptr0, cpu_tmp2_i32); @@ -3386,13 +3404,13 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) case 0x7e: /* movd ea, mm */ #ifdef TARGET_X86_64 if (s->dflag == 2) { - tcg_gen_ld_i64(cpu_T[0], cpu_env, + tcg_gen_ld_i64(cpu_T[0], cpu_env, offsetof(CPUX86State,fpregs[reg].mmx)); gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 1); } else #endif { - tcg_gen_ld32u_tl(cpu_T[0], cpu_env, + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,fpregs[reg].mmx.MMX_L(0))); gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1); } @@ -3400,13 +3418,13 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) case 0x17e: /* movd ea, xmm */ #ifdef TARGET_X86_64 if (s->dflag == 2) { - tcg_gen_ld_i64(cpu_T[0], cpu_env, + tcg_gen_ld_i64(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_Q(0))); gen_ldst_modrm(s, modrm, OT_QUAD, OR_TMP0, 1); } else #endif { - tcg_gen_ld32u_tl(cpu_T[0], cpu_env, + tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,xmm_regs[reg].XMM_L(0))); gen_ldst_modrm(s, modrm, OT_LONG, OR_TMP0, 1); } @@ -3525,7 +3543,7 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) break; case 0x050: /* movmskps */ rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[rm])); gen_helper_movmskps(cpu_tmp2_i32, cpu_ptr0); tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); @@ -3533,7 +3551,7 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r) break; case 0x150: /* movmskpd */ rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(cpu_ptr0, cpu_env, + tcg_gen_addi_ptr(cpu_ptr0, cpu_env, offsetof(CPUX86State,xmm_regs[rm])); gen_helper_movmskpd(cpu_tmp2_i32, cpu_ptr0); tcg_gen_extu_i32_tl(cpu_T[0], cpu_tmp2_i32); @@ -4674,12 +4692,12 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) gen_jmp_im(pc_start - s->cs_base); tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); gen_helper_lcall_protected(cpu_tmp2_i32, cpu_T[1], - tcg_const_i32(dflag), + tcg_const_i32(dflag), tcg_const_i32(s->pc - pc_start)); } else { tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_T[0]); gen_helper_lcall_real(cpu_tmp2_i32, cpu_T[1], - 
tcg_const_i32(dflag), + tcg_const_i32(dflag), tcg_const_i32(s->pc - s->cs_base)); } gen_eob(s); @@ -4943,7 +4961,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) gen_lea_modrm(s, modrm, ®_addr, &offset_addr); gen_helper_cmpxchg16b(cpu_A0); } else -#endif +#endif { if (!(s->cpuid_features & CPUID_CX8)) goto illegal_op; @@ -5519,7 +5537,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) gen_helper_fildl_FT0(cpu_tmp2_i32); break; case 2: - tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, (s->mem_index >> 2) - 1); gen_helper_fldl_FT0(cpu_tmp1_i64); break; @@ -5558,7 +5576,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) gen_helper_fildl_ST0(cpu_tmp2_i32); break; case 2: - tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, (s->mem_index >> 2) - 1); gen_helper_fldl_ST0(cpu_tmp1_i64); break; @@ -5580,7 +5598,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) break; case 2: gen_helper_fisttll_ST0(cpu_tmp1_i64); - tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, (s->mem_index >> 2) - 1); break; case 3: @@ -5606,7 +5624,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) break; case 2: gen_helper_fstl_ST0(cpu_tmp1_i64); - tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, (s->mem_index >> 2) - 1); break; case 3: @@ -5688,13 +5706,13 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) gen_helper_fpop(); break; case 0x3d: /* fildll */ - tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, + tcg_gen_qemu_ld64(cpu_tmp1_i64, cpu_A0, (s->mem_index >> 2) - 1); gen_helper_fildll_ST0(cpu_tmp1_i64); break; case 0x3f: /* fistpll */ gen_helper_fistll_ST0(cpu_tmp1_i64); - tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, + tcg_gen_qemu_st64(cpu_tmp1_i64, cpu_A0, (s->mem_index >> 2) - 1); gen_helper_fpop(); break; @@ -6082,7 +6100,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) ot = dflag ? 
OT_LONG : OT_WORD; gen_op_mov_TN_reg(OT_WORD, 0, R_EDX); gen_op_andl_T0_ffff(); - gen_check_io(s, ot, pc_start - s->cs_base, + gen_check_io(s, ot, pc_start - s->cs_base, SVM_IOIO_TYPE_MASK | svm_is_rep(prefixes) | 4); if (prefixes & (PREFIX_REPZ | PREFIX_REPNZ)) { gen_repz_ins(s, ot, pc_start - s->cs_base, s->pc - s->cs_base); @@ -6273,7 +6291,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) if (s->cc_op != CC_OP_DYNAMIC) gen_op_set_cc_op(s->cc_op); gen_jmp_im(pc_start - s->cs_base); - gen_helper_iret_protected(tcg_const_i32(s->dflag), + gen_helper_iret_protected(tcg_const_i32(s->dflag), tcg_const_i32(s->pc - s->cs_base)); s->cc_op = CC_OP_EFLAGS; } @@ -7160,7 +7178,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) break; case 4: /* STGI */ if ((!(s->flags & HF_SVME_MASK) && - !(s->cpuid_ext3_features & CPUID_EXT3_SKINIT)) || + !(s->cpuid_ext3_features & CPUID_EXT3_SKINIT)) || !s->pe) goto illegal_op; if (s->cpl != 0) { @@ -7181,8 +7199,8 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start) } break; case 6: /* SKINIT */ - if ((!(s->flags & HF_SVME_MASK) && - !(s->cpuid_ext3_features & CPUID_EXT3_SKINIT)) || + if ((!(s->flags & HF_SVME_MASK) && + !(s->cpuid_ext3_features & CPUID_EXT3_SKINIT)) || !s->pe) goto illegal_op; gen_helper_skinit(); diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index bb19a95..cc750b4 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -1132,6 +1132,9 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi, } } +#if !defined(CONFIG_TCG_TARGET_X86_OPT) +/* to suppress warnings */ + /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and EAX. It will be useful once fixed registers globals are less common. 
*/ @@ -1249,6 +1252,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, } #endif } +#endif /* !CONFIG_TCG_TARGET_X86_OPT */ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi, int base, tcg_target_long ofs, int sizeop) @@ -1309,6 +1313,9 @@ static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi, } } +#if !defined(CONFIG_TCG_TARGET_X86_OPT) +/* to suppress warnings */ + static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int opc) { @@ -1426,6 +1433,378 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, } #endif } +#endif /* !CONFIG_TCG_TARGET_X86_OPT */ + +#if defined(CONFIG_TCG_TARGET_X86_OPT) +/* optimization to reduce jump overheads */ + +/* extened versions of MMU helpers */ +static void *qemu_ldext_helpers[4] = { + __ldextb_mmu, + __ldextw_mmu, + __ldextl_mmu, + __ldextq_mmu, +}; +static void *qemu_stext_helpers[4] = { + __stextb_mmu, + __stextw_mmu, + __stextl_mmu, + __stextq_mmu, +}; + +/* + * qemu_ld/st code generator call add_helper_label, + * so that slow case(TLB miss or I/O rw) is handled at the end of TB + */ +static void add_helper_label(TCGContext *s, + int opc_ext, + int data_reg, + int data_reg2, + int addrlo_reg, + int addrhi_reg, + int mem_index, + uint8_t *raddr, + uint32_t **label_ptr) +{ + int idx; + HelperLabel *label; + + if (s->nb_helper_labels >= TCG_MAX_HELPER_LABELS) + tcg_abort(); + + idx = s->nb_helper_labels++; + label = (HelperLabel *)&s->helper_labels[idx]; + label->opc_ext = opc_ext; + label->datalo_reg = data_reg; + label->datahi_reg = data_reg2; + label->addrlo_reg = addrlo_reg; + label->addrhi_reg = addrhi_reg; + label->mem_index = mem_index; + label->raddr = raddr; + if (!label_ptr) { + tcg_abort(); + } + label->label_ptr[0] = label_ptr[0]; + label->label_ptr[1] = label_ptr[1]; +} + +/* generates slow case of qemu_ld at the end of TB */ +static void tcg_out_qemu_ld_helper_call(TCGContext *s, HelperLabel *label) +{ + int s_bits, arg_idx; + int opc = label->opc_ext & HL_OPC_MASK; + int mem_index = label->mem_index; + int data_reg = label->datalo_reg; + int data_reg2 = label->datahi_reg; + int addrhi_reg = label->addrhi_reg; + uint8_t *raddr = label->raddr; + uint32_t **label_ptr = &label->label_ptr[0]; + + s_bits = opc & 3; + + /* resolove label address */ + *label_ptr[0] = (uint32_t)(s->code_ptr - (uint8_t *)label_ptr[0] - 4); + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + *label_ptr[1] = (uint32_t)(s->code_ptr - (uint8_t *)label_ptr[1] - 4); + } + + /* 1st parameter(vaddr) has been alreay set in %eax */ + arg_idx = 1; + if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) { + tcg_out_mov(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++], + addrhi_reg); + } + tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++], + mem_index); + /* return address should indicate qemu_ld IR codes */ + if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) { + /* 4 word parameters */ + tcg_out_pushi(s, (int)(raddr - 1)); + } else { + /* 3 word parameters */ + tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, (int)(raddr - 1)); + } + tcg_out_calli(s, (tcg_target_long)qemu_ldext_helpers[s_bits]); + if (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) { + /* Pop and discard. This is 2 bytes smaller than the add. 
*/ + tcg_out_pop(s, TCG_REG_ECX); + } + + switch(opc) { + case 0 | 4: + tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW); + break; + case 1 | 4: + tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW); + break; + case 0: + tcg_out_ext8u(s, data_reg, TCG_REG_EAX); + break; + case 1: + tcg_out_ext16u(s, data_reg, TCG_REG_EAX); + break; + case 2: + tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); + break; +#if TCG_TARGET_REG_BITS == 64 + case 2 | 4: + tcg_out_ext32s(s, data_reg, TCG_REG_EAX); + break; +#endif + case 3: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); + } else if (data_reg == TCG_REG_EDX) { + /* xchg %edx, %eax */ + tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); + tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX); + } else { + tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); + tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX); + } + break; + default: + tcg_abort(); + } + + /* jump back to original code */ + tcg_out_jmp(s, (tcg_target_long) raddr); +} + +/* generates slow case of qemu_st at the end of TB */ +static void tcg_out_qemu_st_helper_call(TCGContext *s, HelperLabel *label) +{ + int s_bits; + int stack_adjust; + int opc = label->opc_ext & HL_OPC_MASK; + int mem_index = label->mem_index; + int data_reg = label->datalo_reg; + int data_reg2 = label->datahi_reg; + int addrhi_reg = label->addrhi_reg; + uint8_t *raddr = label->raddr; + uint32_t **label_ptr = &label->label_ptr[0]; + + s_bits = opc & 3; + + /* resolove label address */ + *label_ptr[0] = (uint32_t)(s->code_ptr - (uint8_t *)label_ptr[0] - 4); + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + *label_ptr[1] = (uint32_t)(s->code_ptr - (uint8_t *)label_ptr[1] - 4); + } + + /* 1st parameter(vaddr) has been already set */ + /* return address should indicate qemu_st IR codes */ + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32), + TCG_REG_RSI, data_reg); + tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_RDX, mem_index); + /* return address should indicate qemu_st IR codes */ + /* stack growth: 1word * 64bit */ + tcg_out_pushi(s, (int)(raddr - 1)); + stack_adjust = 8; + } else if (TARGET_LONG_BITS == 32) { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, data_reg); + if (opc == 3) { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_ECX, data_reg2); + tcg_out_pushi(s, (int)(raddr - 1)); + tcg_out_pushi(s, mem_index); + stack_adjust = 8; + } else { + tcg_out_movi(s, TCG_TYPE_I32, TCG_REG_ECX, mem_index); + tcg_out_pushi(s, (int)(raddr - 1)); + stack_adjust = 4; + } + } else { + if (opc == 3) { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, addrhi_reg); + tcg_out_pushi(s, (int)(raddr - 1)); + tcg_out_pushi(s, mem_index); + tcg_out_push(s, data_reg2); + tcg_out_push(s, data_reg); + stack_adjust = 16; + } else { + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_EDX, addrhi_reg); + switch(opc) { + case 0: + tcg_out_ext8u(s, TCG_REG_ECX, data_reg); + break; + case 1: + tcg_out_ext16u(s, TCG_REG_ECX, data_reg); + break; + case 2: + tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_ECX, data_reg); + break; + } + tcg_out_pushi(s, (int)(raddr - 1)); + tcg_out_pushi(s, mem_index); + stack_adjust = 8; + } + } + + tcg_out_calli(s, (tcg_target_long)qemu_stext_helpers[s_bits]); + + if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) { + /* Pop and discard. This is 2 bytes smaller than the add. 
*/ + tcg_out_pop(s, TCG_REG_ECX); + } else if (stack_adjust != 0) { + tcg_out_addi(s, TCG_REG_ESP, stack_adjust); + } + + /* jump back to original code */ + tcg_out_jmp(s, (tcg_target_long) raddr); +} + +/* generates all of the slow cases of qemu_ld/st at the end of TB */ +void tcg_out_qemu_ldst_helper_calls(TCGContext *s) +{ + int i; + HelperLabel *label; + + for (i = 0; i < s->nb_helper_labels; i++) { + label = (HelperLabel *)&s->helper_labels[i]; + if (IS_QEMU_LD_LABEL(label)) { + tcg_out_qemu_ld_helper_call(s, label); + } else { + tcg_out_qemu_st_helper_call(s, label); + } + } +} + +/* + * almost same with tcg_out_tlb_load except that forward jump target is different + * + */ + +static inline void tcg_out_tlb_load_opt(TCGContext *s, int addrlo_idx, + int mem_index, int s_bits, + const TCGArg *args, + uint32_t **label_ptr, int which) +{ + const int addrlo = args[addrlo_idx]; + const int r0 = tcg_target_call_iarg_regs[0]; + const int r1 = tcg_target_call_iarg_regs[1]; + TCGType type = TCG_TYPE_I32; + int rexw = 0; + + if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64) { + type = TCG_TYPE_I64; + rexw = P_REXW; + } + + tcg_out_mov(s, type, r1, addrlo); + tcg_out_mov(s, type, r0, addrlo); + + tcg_out_shifti(s, SHIFT_SHR + rexw, r1, + TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + + tgen_arithi(s, ARITH_AND + rexw, r0, + TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0); + tgen_arithi(s, ARITH_AND + rexw, r1, + (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0); + + tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, r1, TCG_AREG0, r1, 0, + offsetof(CPUState, tlb_table[mem_index][0]) + + which); + + /* cmp 0(r1), r0 */ + tcg_out_modrm_offset(s, OPC_CMP_GvEv + rexw, r0, r1, 0); + + tcg_out_mov(s, type, r0, addrlo); + + /* jne label1; short jump is not enough in case of big TB */ + tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); + if (!label_ptr) { + tcg_abort(); + } + label_ptr[0] = (uint32_t *)s->code_ptr; + s->code_ptr += 4; + + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + /* cmp 4(r1), addrhi */ + tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4); + + /* jne label1 */ + tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); + label_ptr[1] = (uint32_t *)s->code_ptr; + s->code_ptr += 4; + } + + /* TLB Hit. */ + + /* add addend(r1), r0 */ + tcg_out_modrm_offset(s, OPC_ADD_GvEv + P_REXW, r0, r1, + offsetof(CPUTLBEntry, addend) - which); +} + +/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and + EAX. It will be useful once fixed registers globals are less + common. */ +static void tcg_out_qemu_ld_opt(TCGContext *s, const TCGArg *args, + int opc) +{ + int data_reg, data_reg2 = 0; + int addrlo_idx; + int mem_index, s_bits; + uint32_t *label_ptr[2]; + + data_reg = args[0]; + addrlo_idx = 1; + if (TCG_TARGET_REG_BITS == 32 && opc == 3) { + data_reg2 = args[1]; + addrlo_idx = 2; + } + + mem_index = args[addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS)]; + s_bits = opc & 3; + + tcg_out_tlb_load_opt(s, addrlo_idx, mem_index, s_bits, args, + &label_ptr[0], offsetof(CPUTLBEntry, addr_read)); + + /* TLB Hit. */ + tcg_out_qemu_ld_direct(s, data_reg, data_reg2, + tcg_target_call_iarg_regs[0], 0, opc); + + /* helper stub will be jumped back here */ + add_helper_label(s, opc, data_reg, data_reg2, + args[addrlo_idx], + (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) ? 
args[addrlo_idx + 1] : 0, + mem_index, s->code_ptr, label_ptr); + +} + +static void tcg_out_qemu_st_opt(TCGContext *s, const TCGArg *args, + int opc) +{ + int data_reg, data_reg2 = 0; + int addrlo_idx; + int mem_index, s_bits; + uint32_t *label_ptr[2]; + + data_reg = args[0]; + addrlo_idx = 1; + if (TCG_TARGET_REG_BITS == 32 && opc == 3) { + data_reg2 = args[1]; + addrlo_idx = 2; + } + + mem_index = args[addrlo_idx + 1 + (TARGET_LONG_BITS > TCG_TARGET_REG_BITS)]; + s_bits = opc; + + tcg_out_tlb_load_opt(s, addrlo_idx, mem_index, s_bits, args, + &label_ptr[0], offsetof(CPUTLBEntry, addr_write)); + + /* TLB Hit. */ + tcg_out_qemu_st_direct(s, data_reg, data_reg2, + tcg_target_call_iarg_regs[0], 0, opc); + + /* helper stub will be jumped back here */ + add_helper_label(s, opc | HL_ST_MASK, data_reg, data_reg2, + args[addrlo_idx], + (TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 64) ? args[addrlo_idx + 1] : 0, + mem_index, s->code_ptr, label_ptr); +} + +#endif /* CONFIG_TCG_TARGET_X86_OPT */ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) @@ -1639,6 +2018,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_ext16u(s, args[0], args[1]); break; +#if defined(CONFIG_TCG_TARGET_X86_OPT) && defined(CONFIG_SOFTMMU) +#define tcg_out_qemu_ld(S, ARGS, OPC) tcg_out_qemu_ld_opt(S, ARGS, OPC) +#endif /* CONFIG_TCG_TARGET_X86_OPT */ case INDEX_op_qemu_ld8u: tcg_out_qemu_ld(s, args, 0); break; @@ -1661,6 +2043,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, tcg_out_qemu_ld(s, args, 3); break; +#if defined(CONFIG_TCG_TARGET_X86_OPT) && defined(CONFIG_SOFTMMU) +#define tcg_out_qemu_st(S, ARGS, OPC) tcg_out_qemu_st_opt(S, ARGS, OPC) +#endif /* CONFIG_TCG_TARGET_X86_OPT */ case INDEX_op_qemu_st8: tcg_out_qemu_st(s, args, 0); break; @@ -1892,6 +2277,9 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_qemu_st32, { "L", "L", "L" } }, { INDEX_op_qemu_st64, { "L", "L", "L", "L" } }, #endif +#ifdef CONFIG_EXEC_PROFILE + { INDEX_op_prof_tbexec, { } }, +#endif { -1 }, }; diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index 207a89f..2b93b7d 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -1456,7 +1456,7 @@ static inline void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg) tcg_gen_op2_i32(INDEX_op_bswap16_i32, ret, arg); #else TCGv_i32 t0 = tcg_temp_new_i32(); - + tcg_gen_ext8u_i32(t0, arg); tcg_gen_shli_i32(t0, t0, 8); tcg_gen_shri_i32(ret, arg, 8); @@ -1473,17 +1473,17 @@ static inline void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg) TCGv_i32 t0, t1; t0 = tcg_temp_new_i32(); t1 = tcg_temp_new_i32(); - + tcg_gen_shli_i32(t0, arg, 24); - + tcg_gen_andi_i32(t1, arg, 0x0000ff00); tcg_gen_shli_i32(t1, t1, 8); tcg_gen_or_i32(t0, t0, t1); - + tcg_gen_shri_i32(t1, arg, 8); tcg_gen_andi_i32(t1, t1, 0x0000ff00); tcg_gen_or_i32(t0, t0, t1); - + tcg_gen_shri_i32(t1, arg, 24); tcg_gen_or_i32(ret, t0, t1); tcg_temp_free_i32(t0); @@ -1703,13 +1703,13 @@ static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg) #else TCGv_i64 t0 = tcg_temp_new_i64(); TCGv_i64 t1 = tcg_temp_new_i64(); - + tcg_gen_shli_i64(t0, arg, 56); - + tcg_gen_andi_i64(t1, arg, 0x0000ff00); tcg_gen_shli_i64(t1, t1, 40); tcg_gen_or_i64(t0, t0, t1); - + tcg_gen_andi_i64(t1, arg, 0x00ff0000); tcg_gen_shli_i64(t1, t1, 24); tcg_gen_or_i64(t0, t0, t1); @@ -1721,7 +1721,7 @@ static inline void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg) tcg_gen_shri_i64(t1, arg, 8); tcg_gen_andi_i64(t1, t1, 0xff000000); tcg_gen_or_i64(t0, t0, t1); - + tcg_gen_shri_i64(t1, 
arg, 24); tcg_gen_andi_i64(t1, t1, 0x00ff0000); tcg_gen_or_i64(t0, t0, t1); @@ -2169,13 +2169,22 @@ static inline void tcg_gen_debug_insn_start(uint64_t pc) { /* XXX: must really use a 32 bit size for TCGArg in all cases */ #if TARGET_LONG_BITS > TCG_TARGET_REG_BITS - tcg_gen_op2ii(INDEX_op_debug_insn_start, + tcg_gen_op2ii(INDEX_op_debug_insn_start, (uint32_t)(pc), (uint32_t)(pc >> 32)); #else tcg_gen_op1i(INDEX_op_debug_insn_start, pc); #endif } +#ifdef CONFIG_EXEC_PROFILE +static inline void tcg_gen_prof_tbexec(int idx) +{ + tcg_gen_op1i(INDEX_op_prof_tbexec, idx); +} +#else +# define tcg_gen_prof_tbexec(idx) +#endif /* CONFIG_EXEC_PROFILE */ + static inline void tcg_gen_exit_tb(tcg_target_long val) { tcg_gen_op1i(INDEX_op_exit_tb, val); diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h index 2c7ca1a..bca40bb 100644 --- a/tcg/tcg-opc.h +++ b/tcg/tcg-opc.h @@ -307,4 +307,8 @@ DEF(qemu_st64, 0, 2, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS) #endif /* TCG_TARGET_REG_BITS != 32 */ +#ifdef CONFIG_EXEC_PROFILE +DEF(prof_tbexec, 0, 1, 0, 0) +#endif /* CONFIG_EXEC_PROFILE */ + #undef DEF diff --git a/tcg/tcg.c b/tcg/tcg.c index 5f7a34e..777a423 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -24,7 +24,7 @@ /* define it to use liveness analysis (better code) */ #define USE_LIVENESS_ANALYSIS -#define USE_TCG_OPTIMIZATIONS +#define USE_TCG_OPTIMIZATIONS #include "config.h" @@ -67,7 +67,7 @@ static void tcg_target_init(TCGContext *s); static void tcg_target_qemu_prologue(TCGContext *s); -static void patch_reloc(uint8_t *code_ptr, int type, +static void patch_reloc(uint8_t *code_ptr, int type, tcg_target_long value, tcg_target_long addend); static TCGOpDef tcg_op_defs[] = { @@ -111,7 +111,7 @@ static void tcg_out_reloc(TCGContext *s, uint8_t *code_ptr, int type, l = &s->labels[label_index]; if (l->has_value) { /* FIXME: This may break relocations on RISC targets that - modify instruction fields in place. The caller may not have + modify instruction fields in place. The caller may not have written the initial value. 
*/ patch_reloc(code_ptr, type, l->u.value, addend); } else { @@ -125,7 +125,7 @@ static void tcg_out_reloc(TCGContext *s, uint8_t *code_ptr, int type, } } -static void tcg_out_label(TCGContext *s, int label_index, +static void tcg_out_label(TCGContext *s, int label_index, tcg_target_long value) { TCGLabel *l; @@ -165,7 +165,7 @@ void *tcg_malloc_internal(TCGContext *s, int size) { TCGPool *p; int pool_size; - + if (size > TCG_POOL_CHUNK_SIZE) { /* big malloc: insert a new pool (XXX: could optimize) */ p = qemu_malloc(sizeof(TCGPool) + size); @@ -188,7 +188,7 @@ void *tcg_malloc_internal(TCGContext *s, int size) p = qemu_malloc(sizeof(TCGPool) + pool_size); p->size = pool_size; p->next = NULL; - if (s->pool_current) + if (s->pool_current) s->pool_current->next = p; else s->pool_first = p; @@ -219,7 +219,7 @@ void tcg_context_init(TCGContext *s) memset(s, 0, sizeof(*s)); s->temps = s->static_temps; s->nb_globals = 0; - + /* Count total number of arguments and allocate the corresponding space */ total_args = 0; @@ -240,7 +240,7 @@ void tcg_context_init(TCGContext *s) sorted_args += n; args_ct += n; } - + tcg_target_init(s); } @@ -250,7 +250,7 @@ void tcg_prologue_init(TCGContext *s) s->code_buf = code_gen_prologue; s->code_ptr = s->code_buf; tcg_target_qemu_prologue(s); - flush_icache_range((unsigned long)s->code_buf, + flush_icache_range((unsigned long)s->code_buf, (unsigned long)s->code_ptr); } @@ -275,6 +275,13 @@ void tcg_func_start(TCGContext *s) gen_opc_ptr = gen_opc_buf; gen_opparam_ptr = gen_opparam_buf; +#if defined(CONFIG_TCG_TARGET_X86_OPT) && defined(CONFIG_SOFTMMU) + s->helper_labels = tcg_malloc(sizeof(HelperLabel) * TCG_MAX_HELPER_LABELS); + if (!s->helper_labels) { + tcg_abort(); + } + s->nb_helper_labels = 0; +#endif } static inline void tcg_temp_alloc(TCGContext *s, int n) @@ -752,7 +759,7 @@ static char *tcg_get_arg_str_idx(TCGContext *s, char *buf, int buf_size, if (idx < s->nb_globals) { pstrcpy(buf, buf_size, ts->name); } else { - if (ts->temp_local) + if (ts->temp_local) snprintf(buf, buf_size, "loc%d", idx - s->nb_globals); else snprintf(buf, buf_size, "tmp%d", idx - s->nb_globals); @@ -790,7 +797,7 @@ static TCGHelperInfo *tcg_find_helper(TCGContext *s, tcg_target_ulong val) tcg_target_ulong v; if (unlikely(!s->helpers_sorted)) { - qsort(s->helpers, s->nb_helpers, sizeof(TCGHelperInfo), + qsort(s->helpers, s->nb_helpers, sizeof(TCGHelperInfo), helper_cmp); s->helpers_sorted = 1; } @@ -850,7 +857,7 @@ void tcg_dump_ops(TCGContext *s, FILE *outfile) #else pc = args[0]; #endif - if (!first_insn) + if (!first_insn) fprintf(outfile, "\n"); fprintf(outfile, " ---- 0x%" PRIx64, pc); first_insn = 0; @@ -890,7 +897,7 @@ void tcg_dump_ops(TCGContext *s, FILE *outfile) tcg_get_arg_str_idx(s, buf, sizeof(buf), args[nb_oargs + i])); } } - } else if (c == INDEX_op_movi_i32 + } else if (c == INDEX_op_movi_i32 #if TCG_TARGET_REG_BITS == 64 || c == INDEX_op_movi_i64 #endif @@ -901,7 +908,7 @@ void tcg_dump_ops(TCGContext *s, FILE *outfile) nb_oargs = def->nb_oargs; nb_iargs = def->nb_iargs; nb_cargs = def->nb_cargs; - fprintf(outfile, " %s %s,$", def->name, + fprintf(outfile, " %s %s,$", def->name, tcg_get_arg_str_idx(s, buf, sizeof(buf), args[0])); val = args[1]; th = tcg_find_helper(s, val); @@ -925,7 +932,7 @@ void tcg_dump_ops(TCGContext *s, FILE *outfile) nb_iargs = def->nb_iargs; nb_cargs = def->nb_cargs; } - + k = 0; for(i = 0; i < nb_oargs; i++) { if (k != 0) @@ -1123,7 +1130,7 @@ void tcg_add_target_add_op_defs(const TCGTargetOpDef *tdefs) #ifdef USE_LIVENESS_ANALYSIS /* set 
a nop for an operation using 'nb_args' */ -static inline void tcg_set_nop(TCGContext *s, uint16_t *opc_ptr, +static inline void tcg_set_nop(TCGContext *s, uint16_t *opc_ptr, TCGArg *args, int nb_args) { if (nb_args == 0) { @@ -1174,13 +1181,13 @@ static void tcg_liveness_analysis(TCGContext *s) const TCGOpDef *def; uint8_t *dead_temps; unsigned int dead_iargs; - + gen_opc_ptr++; /* skip end */ nb_ops = gen_opc_ptr - gen_opc_buf; s->op_dead_iargs = tcg_malloc(nb_ops * sizeof(uint16_t)); - + dead_temps = tcg_malloc(s->nb_temps); memset(dead_temps, 1, s->nb_temps); @@ -1209,7 +1216,7 @@ static void tcg_liveness_analysis(TCGContext *s) if (!dead_temps[arg]) goto do_not_remove_call; } - tcg_set_nop(s, gen_opc_buf + op_index, + tcg_set_nop(s, gen_opc_buf + op_index, args - 1, nb_args); } else { do_not_remove_call: @@ -1219,7 +1226,7 @@ static void tcg_liveness_analysis(TCGContext *s) arg = args[i]; dead_temps[arg] = 1; } - + if (!(call_flags & TCG_CALL_CONST)) { /* globals are live (they may be used by the call) */ memset(dead_temps, 0, s->nb_globals); @@ -1359,8 +1366,8 @@ static void dump_regs(TCGContext *s) for(i = 0; i < TCG_TARGET_NB_REGS; i++) { if (s->reg_to_temp[i] >= 0) { - printf("%s: %s\n", - tcg_target_reg_names[i], + printf("%s: %s\n", + tcg_target_reg_names[i], tcg_get_arg_str_idx(s, buf, sizeof(buf), s->reg_to_temp[i])); } } @@ -1378,7 +1385,7 @@ static void check_regs(TCGContext *s) ts = &s->temps[k]; if (ts->val_type != TEMP_VAL_REG || ts->reg != reg) { - printf("Inconsistency for register %s:\n", + printf("Inconsistency for register %s:\n", tcg_target_reg_names[reg]); goto fail; } @@ -1389,7 +1396,7 @@ static void check_regs(TCGContext *s) if (ts->val_type == TEMP_VAL_REG && !ts->fixed_reg && s->reg_to_temp[ts->reg] != k) { - printf("Inconsistency for temp %s:\n", + printf("Inconsistency for temp %s:\n", tcg_get_arg_str_idx(s, buf, sizeof(buf), k)); fail: printf("reg state:\n"); @@ -1424,7 +1431,7 @@ static void tcg_reg_free(TCGContext *s, int reg) ts = &s->temps[temp]; assert(ts->val_type == TEMP_VAL_REG); if (!ts->mem_coherent) { - if (!ts->mem_allocated) + if (!ts->mem_allocated) temp_allocate_frame(s, temp); tcg_out_st(s, ts->type, reg, ts->mem_reg, ts->mem_offset); } @@ -1477,9 +1484,9 @@ static void temp_save(TCGContext *s, int temp, TCGRegSet allocated_regs) ts->val_type = TEMP_VAL_MEM; break; case TEMP_VAL_CONST: - reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], + reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], allocated_regs); - if (!ts->mem_allocated) + if (!ts->mem_allocated) temp_allocate_frame(s, temp); tcg_out_movi(s, ts->type, reg, ts->val); tcg_out_st(s, ts->type, reg, ts->mem_reg, ts->mem_offset); @@ -1609,7 +1616,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, ots->mem_coherent = 0; } -static void tcg_reg_alloc_op(TCGContext *s, +static void tcg_reg_alloc_op(TCGContext *s, const TCGOpDef *def, TCGOpcode opc, const TCGArg *args, unsigned int dead_iargs) @@ -1626,11 +1633,11 @@ static void tcg_reg_alloc_op(TCGContext *s, nb_iargs = def->nb_iargs; /* copy constants */ - memcpy(new_args + nb_oargs + nb_iargs, - args + nb_oargs + nb_iargs, + memcpy(new_args + nb_oargs + nb_iargs, + args + nb_oargs + nb_iargs, sizeof(TCGArg) * def->nb_cargs); - /* satisfy input constraints */ + /* satisfy input constraints */ tcg_regset_set(allocated_regs, s->reserved_regs); for(k = 0; k < nb_iargs; k++) { i = def->sorted_args[nb_oargs + k]; @@ -1671,7 +1678,7 @@ static void tcg_reg_alloc_op(TCGContext *s, /* if the input is aliased to 
an output and if it is not dead after the instruction, we must allocate a new register and move it */ - if (!IS_DEAD_IARG(i - nb_oargs)) + if (!IS_DEAD_IARG(i - nb_oargs)) goto allocate_in_reg; } } @@ -1680,7 +1687,7 @@ static void tcg_reg_alloc_op(TCGContext *s, /* nothing to do : the constraint is satisfied */ } else { allocate_in_reg: - /* allocate a new register matching the constraint + /* allocate a new register matching the constraint and move the temporary register into it */ reg = tcg_reg_alloc(s, arg_ct->u.regs, allocated_regs); tcg_out_mov(s, ts->type, reg, ts->reg); @@ -1690,7 +1697,7 @@ static void tcg_reg_alloc_op(TCGContext *s, tcg_regset_set_reg(allocated_regs, reg); iarg_end: ; } - + if (def->flags & TCG_OPF_BB_END) { tcg_reg_alloc_bb_end(s, allocated_regs); } else { @@ -1706,9 +1713,9 @@ static void tcg_reg_alloc_op(TCGContext *s, } } } - + if (def->flags & TCG_OPF_CALL_CLOBBER) { - /* XXX: permit generic clobber register list ? */ + /* XXX: permit generic clobber register list ? */ for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) { if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) { tcg_reg_free(s, reg); @@ -1716,12 +1723,12 @@ static void tcg_reg_alloc_op(TCGContext *s, } /* XXX: for load/store we could do that only for the slow path (i.e. when a memory callback is called) */ - + /* store globals and free associated registers (we assume the insn can modify any global. */ save_globals(s, allocated_regs); } - + /* satisfy the output constraints */ tcg_regset_set(allocated_regs, s->reserved_regs); for(k = 0; k < nb_oargs; k++) { @@ -1749,7 +1756,7 @@ static void tcg_reg_alloc_op(TCGContext *s, ts->reg = reg; /* temp value is modified, so the value kept in memory is potentially not the same */ - ts->mem_coherent = 0; + ts->mem_coherent = 0; s->reg_to_temp[reg] = arg; } oarg_end: @@ -1759,7 +1766,7 @@ static void tcg_reg_alloc_op(TCGContext *s, /* emit instruction */ tcg_out_op(s, opc, new_args, const_args); - + /* move the outputs in the correct register if needed */ for(i = 0; i < nb_oargs; i++) { ts = &s->temps[args[i]]; @@ -1803,7 +1810,7 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, /* assign stack slots first */ /* XXX: preallocate call stack */ call_stack_size = (nb_params - nb_regs) * sizeof(tcg_target_long); - call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) & + call_stack_size = (call_stack_size + TCG_TARGET_STACK_ALIGN - 1) & ~(TCG_TARGET_STACK_ALIGN - 1); allocate_args = (call_stack_size > TCG_STATIC_CALL_ARGS_SIZE); if (allocate_args) { @@ -1821,13 +1828,13 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, if (ts->val_type == TEMP_VAL_REG) { tcg_out_st(s, ts->type, ts->reg, TCG_REG_CALL_STACK, stack_offset); } else if (ts->val_type == TEMP_VAL_MEM) { - reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], + reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], s->reserved_regs); /* XXX: not correct if reading values from the stack */ tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset); tcg_out_st(s, ts->type, reg, TCG_REG_CALL_STACK, stack_offset); } else if (ts->val_type == TEMP_VAL_CONST) { - reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], + reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], s->reserved_regs); /* XXX: sign extend may be needed on some targets */ tcg_out_movi(s, ts->type, reg, ts->val); @@ -1840,7 +1847,7 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, stack_offset += sizeof(tcg_target_long); #endif } - + /* assign input 
registers */ tcg_regset_set(allocated_regs, s->reserved_regs); for(i = 0; i < nb_regs; i++) { @@ -1864,7 +1871,7 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, tcg_regset_set_reg(allocated_regs, reg); } } - + /* assign function address */ func_arg = args[nb_oargs + nb_iargs - 1]; arg_ct = &def->args_ct[0]; @@ -1897,8 +1904,8 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, } else { tcg_abort(); } - - + + /* mark dead temporaries and free the associated registers */ for(i = 0; i < nb_iargs; i++) { arg = args[nb_oargs + i]; @@ -1911,14 +1918,14 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, } } } - + /* clobber call registers */ for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) { if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) { tcg_reg_free(s, reg); } } - + /* store globals and free associated registers (we assume the call can modify any global. */ if (!(flags & TCG_CALL_CONST)) { @@ -1926,7 +1933,7 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, } tcg_out_op(s, opc, &func_arg, &const_func_arg); - + if (allocate_args) { tcg_out_addi(s, TCG_REG_CALL_STACK, STACK_DIR(call_stack_size)); } @@ -1946,11 +1953,11 @@ static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def, s->reg_to_temp[ts->reg] = -1; ts->val_type = TEMP_VAL_REG; ts->reg = reg; - ts->mem_coherent = 0; + ts->mem_coherent = 0; s->reg_to_temp[reg] = arg; } } - + return nb_iargs + nb_oargs + def->nb_cargs + 1; } @@ -2104,6 +2111,22 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf, goto next; case INDEX_op_end: goto the_end; +#ifdef CONFIG_PROFILER_EX + case INDEX_op_qemu_ld8u: + case INDEX_op_qemu_ld8s: + case INDEX_op_qemu_ld16u: + case INDEX_op_qemu_ld16s: + case INDEX_op_qemu_ld32: + case INDEX_op_qemu_ld64: + s->qemu_ld_count++; + goto gen; + case INDEX_op_qemu_st8: + case INDEX_op_qemu_st16: + case INDEX_op_qemu_st32: + case INDEX_op_qemu_st64: + s->qemu_st_count++; + gen: +#endif default: /* Note: in order to speed up the code, it would be much faster to have specialized register allocator functions for @@ -2123,6 +2146,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf, #endif } the_end: +#if defined(CONFIG_TCG_TARGET_X86_OPT) && defined(CONFIG_SOFTMMU) + /* Generate MMU call helpers at the end of block (currently only for qemu_ld/st) */ + tcg_out_qemu_ldst_helper_calls(s); +#endif return -1; } @@ -2145,7 +2172,7 @@ int tcg_gen_code(TCGContext *s, uint8_t *gen_code_buf) tcg_gen_code_common(s, gen_code_buf, -1); /* flush instruction cache */ - flush_icache_range((unsigned long)gen_code_buf, + flush_icache_range((unsigned long)gen_code_buf, (unsigned long)s->code_ptr); return s->code_ptr - gen_code_buf; } @@ -2168,33 +2195,33 @@ void tcg_dump_info(FILE *f, fprintf_function cpu_fprintf) tot = s->interm_time + s->code_time; cpu_fprintf(f, "JIT cycles %" PRId64 " (%0.3f s at 2.4 GHz)\n", tot, tot / 2.4e9); - cpu_fprintf(f, "translated TBs %" PRId64 " (aborted=%" PRId64 " %0.1f%%)\n", - s->tb_count, + cpu_fprintf(f, "translated TBs %" PRId64 " (aborted=%" PRId64 " %0.1f%%)\n", + s->tb_count, s->tb_count1 - s->tb_count, s->tb_count1 ? (double)(s->tb_count1 - s->tb_count) / s->tb_count1 * 100.0 : 0); - cpu_fprintf(f, "avg ops/TB %0.1f max=%d\n", + cpu_fprintf(f, "avg ops/TB %0.1f max=%d\n", s->tb_count ? (double)s->op_count / s->tb_count : 0, s->op_count_max); cpu_fprintf(f, "deleted ops/TB %0.2f\n", - s->tb_count ? + s->tb_count ? 
(double)s->del_op_count / s->tb_count : 0); cpu_fprintf(f, "avg temps/TB %0.2f max=%d\n", - s->tb_count ? + s->tb_count ? (double)s->temp_count / s->tb_count : 0, s->temp_count_max); - - cpu_fprintf(f, "cycles/op %0.1f\n", + + cpu_fprintf(f, "cycles/op %0.1f\n", s->op_count ? (double)tot / s->op_count : 0); - cpu_fprintf(f, "cycles/in byte %0.1f\n", + cpu_fprintf(f, "cycles/in byte %0.1f\n", s->code_in_len ? (double)tot / s->code_in_len : 0); - cpu_fprintf(f, "cycles/out byte %0.1f\n", + cpu_fprintf(f, "cycles/out byte %0.1f\n", s->code_out_len ? (double)tot / s->code_out_len : 0); if (tot == 0) tot = 1; - cpu_fprintf(f, " gen_interm time %0.1f%%\n", + cpu_fprintf(f, " gen_interm time %0.1f%%\n", (double)s->interm_time / tot * 100.0); - cpu_fprintf(f, " gen_code time %0.1f%%\n", + cpu_fprintf(f, " gen_code time %0.1f%%\n", (double)s->code_time / tot * 100.0); - cpu_fprintf(f, "liveness/code time %0.1f%%\n", + cpu_fprintf(f, "liveness/code time %0.1f%%\n", (double)s->la_time / (s->code_time ? s->code_time : 1) * 100.0); cpu_fprintf(f, "cpu_restore count %" PRId64 "\n", s->restore_count); diff --git a/tcg/tcg.h b/tcg/tcg.h index b2febc1..a3061c1 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -70,7 +70,7 @@ typedef struct TCGRelocation { int type; uint8_t *ptr; tcg_target_long addend; -} TCGRelocation; +} TCGRelocation; typedef struct TCGLabel { int has_value; @@ -134,6 +134,27 @@ typedef tcg_target_ulong TCGArg; are aliases for target_ulong and host pointer sized values respectively. */ +#if defined(CONFIG_TCG_TARGET_X86_OPT) && defined(CONFIG_SOFTMMU) +#define TCG_MAX_HELPER_LABELS 200 +#define HL_LDST_SHIFT 4 +#define HL_LDST_MASK (1 << HL_LDST_SHIFT) +#define HL_ST_MASK HL_LDST_MASK +#define HL_OPC_MASK (HL_LDST_MASK - 1) +#define IS_QEMU_LD_LABEL(L) (!((L)->opc_ext & HL_LDST_MASK)) +#define IS_QEMU_ST_LABEL(L) ((L)->opc_ext & HL_LDST_MASK) + +typedef struct HelperLabel { + int opc_ext; + int datalo_reg; + int datahi_reg; + int addrlo_reg; + int addrhi_reg; + int mem_index; + uint8_t *raddr; /* return address */ + uint32_t *label_ptr[2]; /* label pointer to be updated */ +} HelperLabel; +#endif /* CONFIG_TCG_TARGET_X86_OPT */ + #ifdef CONFIG_DEBUG_TCG #define DEBUG_TCGV 1 #endif @@ -193,7 +214,7 @@ typedef int TCGv_i64; /* A pure function only reads its arguments and TCG global variables and cannot raise exceptions. Hence a call to a pure function can be safely suppressed if the return value is not used. */ -#define TCG_CALL_PURE 0x0010 +#define TCG_CALL_PURE 0x0010 /* A const function only reads its arguments and does not use TCG global variables. Hence a call to such a function does not save TCG global variables back to their canonical location. */ @@ -277,7 +298,7 @@ struct TCGContext { int nb_globals; int nb_temps; /* index of free temps, -1 if none */ - int first_free_temp[TCG_TYPE_COUNT * 2]; + int first_free_temp[TCG_TYPE_COUNT * 2]; /* goto_tb support */ uint8_t *code_buf; @@ -288,7 +309,7 @@ struct TCGContext { /* liveness analysis */ uint16_t *op_dead_iargs; /* for each operation, each bit tells if the corresponding input argument is dead */ - + /* tells in which temporary a given register is. 
It does not take into account fixed registers */ int reg_to_temp[TCG_TARGET_NB_REGS]; @@ -322,6 +343,14 @@ struct TCGContext { int64_t la_time; int64_t restore_count; int64_t restore_time; +#ifdef CONFIG_PROFILER_EX + int64_t qemu_ld_count; + int64_t qemu_st_count; +#endif +#endif +#if defined(CONFIG_TCG_TARGET_X86_OPT) && defined(CONFIG_SOFTMMU) + HelperLabel *helper_labels; + int nb_helper_labels; #endif }; @@ -411,7 +440,7 @@ typedef struct TCGArgConstraint { #define TCG_OPF_BB_END 0x01 /* instruction defines the end of a basic block */ -#define TCG_OPF_CALL_CLOBBER 0x02 /* instruction clobbers call registers +#define TCG_OPF_CALL_CLOBBER 0x02 /* instruction clobbers call registers and potentially update globals. */ #define TCG_OPF_SIDE_EFFECTS 0x04 /* instruction has side effects : it cannot be removed if its output @@ -427,7 +456,7 @@ typedef struct TCGOpDef { int used; #endif } TCGOpDef; - + typedef struct TCGTargetOpDef { TCGOpcode op; const char *args_ct_str[TCG_MAX_OP_ARGS]; @@ -491,3 +520,7 @@ extern uint8_t code_gen_prologue[]; #else #define tcg_qemu_tb_exec(tb_ptr) ((long REGPARM (*)(void *))code_gen_prologue)(tb_ptr) #endif + +#if defined(CONFIG_TCG_TARGET_X86_OPT) +void tcg_out_qemu_ldst_helper_calls(TCGContext *s); +#endif
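
Note (illustrative only, not part of the patch): the optimization enabled by --enable-tcg-x86-opt keeps the TLB compare and the guest memory access on the straight-line fast path of each qemu_ld/st, and moves the miss/IO handling into helper stubs that tcg_out_qemu_ldst_helper_calls() emits once at the end of the TB, so the common case no longer jumps over slow-path code. A minimal C sketch of that hot-path/cold-path split is below; all names (ld32, ld32_slow, TLB_BITS, guest_ram, ...) are made up for illustration and do not appear in the patch.

    /* Sketch of the fast-path/slow-path split, assuming a tiny
     * direct-mapped software TLB with identity-mapped guest RAM. */
    #include <stdint.h>
    #include <stdio.h>

    #define TLB_BITS   8
    #define TLB_SIZE   (1 << TLB_BITS)
    #define PAGE_BITS  12
    #define PAGE_MASK  (~(((uintptr_t)1 << PAGE_BITS) - 1))

    typedef struct {
        uintptr_t tag;    /* guest page address, (uintptr_t)-1 if invalid */
        uint8_t  *host;   /* host pointer covering that guest page */
    } TLBEntry;

    static TLBEntry tlb[TLB_SIZE];
    static uint8_t  guest_ram[1 << 20];

    /* Cold path, kept out of line (the "helper stub" emitted at the end
     * of the TB in the real code): fill the entry, then redo the access. */
    static uint32_t __attribute__((noinline)) ld32_slow(uintptr_t addr)
    {
        TLBEntry *e = &tlb[(addr >> PAGE_BITS) & (TLB_SIZE - 1)];
        e->tag  = addr & PAGE_MASK;
        e->host = guest_ram + (addr & PAGE_MASK);   /* identity mapping */
        return *(uint32_t *)(e->host + (addr & ~PAGE_MASK));
    }

    /* Hot path: one compare, then the load; a miss branches away and the
     * stub returns here, so the hit case runs as straight-line code. */
    static inline uint32_t ld32(uintptr_t addr)
    {
        TLBEntry *e = &tlb[(addr >> PAGE_BITS) & (TLB_SIZE - 1)];
        if (__builtin_expect(e->tag == (addr & PAGE_MASK), 1)) {
            return *(uint32_t *)(e->host + (addr & ~PAGE_MASK));
        }
        return ld32_slow(addr);                     /* TLB miss */
    }

    int main(void)
    {
        for (int i = 0; i < TLB_SIZE; i++) {
            tlb[i].tag = (uintptr_t)-1;
        }
        *(uint32_t *)(guest_ram + 0x1234) = 0xdeadbeefu;
        printf("first access  (miss): 0x%x\n", ld32(0x1234));
        printf("second access (hit):  0x%x\n", ld32(0x1234));
        return 0;
    }

The generated qemu_ld/st code achieves the same effect without a call on the hit path: the inline TLB compare falls through directly into the load/store, and only a mispredicted-once forward branch reaches the per-access stub, which calls the __ldext/__stext MMU helpers and then jumps back to the original code.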