From: Ulrich Drepper
Date: Thu, 16 Jul 2009 14:15:15 +0000 (-0700)
Subject: Optimize restoring of ymm registers on x86-64.
X-Git-Tag: upstream/2.30~13468
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c8027cced1d3e7803c440cb13d4294754d8791e2;p=external%2Fglibc.git

Optimize restoring of ymm registers on x86-64.

The patch mainly reduces the code size but also avoids some jumps.
---

diff --git a/ChangeLog b/ChangeLog
index 87db19e..1bfdd7b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2009-07-16  Ulrich Drepper
+
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Optimize
+	restoring of ymm registers a bit.
+
 2009-07-15  H.J. Lu
 
 	* sysdeps/x86_64/memcmp.S: New file.
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 7f20491..49d239f 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -185,81 +185,73 @@ L(no_avx1):
 	movq LR_R8_OFFSET(%rsp), %r8
 	movq LR_R9_OFFSET(%rsp), %r9
 
+	movaps (LR_XMM_OFFSET)(%rsp), %xmm0
+	movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
+	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
+	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
+	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
+	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
+	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
+	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
+
 # ifdef HAVE_AVX_SUPPORT
 	cmpl	$0, L(have_avx)(%rip)
 	js	L(no_avx2)
 
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
-	vmovdqa	(LR_XMM_OFFSET)(%rsp), %xmm0
-	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm1
-	vpmovmskb %xmm1, %esi
+	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET)(%rsp), %ymm0
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
-	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
-	vpmovmskb %xmm2, %esi
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
-	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm3
-	vpmovmskb %xmm3, %esi
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
-	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm4
-	vpmovmskb %xmm4, %esi
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
-	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm5
-	vpmovmskb %xmm5, %esi
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
-	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm6
-	vpmovmskb %xmm6, %esi
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
-	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm7
-	vpmovmskb %xmm7, %esi
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
+	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
 
-1:	vmovdqa	(LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
+1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
 	vpmovmskb %xmm8, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu	(LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
-	jmp	1f
 
 L(no_avx2):
+1:
 # endif
-	movaps (LR_XMM_OFFSET)(%rsp), %xmm0
-	movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
-	movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
-	movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
-	movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
-	movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
-	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
-	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
-
-1:	movq 16(%rbx), %r10	# Anything in framesize?
+	movq 16(%rbx), %r10	# Anything in framesize?
 	testq	%r10, %r10
 	jns	3f
@@ -358,32 +350,31 @@ L(no_avx3):
 	movq LRV_RAX_OFFSET(%rsp), %rax
 	movq LRV_RDX_OFFSET(%rsp), %rdx
 
+	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
+	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
+
 # ifdef HAVE_AVX_SUPPORT
 	cmpl	$0, L(have_avx)(%rip)
 	js	L(no_avx4)
 
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
-	vmovdqa LRV_XMM0_OFFSET(%rsp), %xmm0
-	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm1
-	vpmovmskb %xmm1, %esi
+	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
+	vpmovmskb %xmm2, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
 
-1:	vmovdqa LRV_XMM1_OFFSET(%rsp), %xmm1
-	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
+1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl	$0xffff, %esi
 	je	1f
 	vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
-	jmp	1f
 
 L(no_avx4):
+1:
 # endif
-	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
-	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
-1:	fldt LRV_ST1_OFFSET(%rsp)
+	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
 
 	movq %rbx, %rsp
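
For readers who want the shape of the change without following the assembly, the restore logic after this patch can be sketched in C roughly as below.  This is only an illustration, not glibc code: the struct, its field names, and restore_args_sketch are invented stand-ins for the save area addressed through the LR_* offsets in dl-trampoline.S (compile with -mavx).

#include <immintrin.h>

/* Hypothetical stand-in for the save area reached through the LR_*
   offsets; the field names are invented for this sketch.  */
struct lr_save_sketch
{
  __m128i xmm[8];     /* xmm values the audit module may have modified  */
  __m128i shadow[8];  /* copies used to detect such a modification      */
  __m256i ymm[8];     /* full 256-bit values saved before the call      */
};

/* After the patch: restore the low 128 bits of every argument register
   unconditionally (the hoisted movaps block), then, only when AVX is
   available, compare each xmm value against its saved copy with one
   scratch register (%xmm8 in the patch) and reload the full 256-bit
   value only for registers the audit module changed.  The removed
   "jmp 1f" corresponds to falling through to the code after the
   L(no_avx2)/"1:" labels.  */
void
restore_args_sketch (const struct lr_save_sketch *s, __m256i out[8],
                     int have_avx)
{
  for (int i = 0; i < 8; ++i)
    /* Only the low 128 bits are meaningful here, mirroring movaps.  */
    out[i] = _mm256_castsi128_si256 (s->xmm[i]);

  if (have_avx)
    for (int i = 0; i < 8; ++i)
      {
        /* vpcmpeqq + vpmovmskb: 0xffff means all 16 bytes matched.  */
        __m128i eq = _mm_cmpeq_epi64 (s->xmm[i], s->shadow[i]);
        if (_mm_movemask_epi8 (eq) != 0xffff)
          out[i] = s->ymm[i];                      /* vmovdqu reload  */
      }
}

The point of the reordering is that the 128-bit restores are needed on both the AVX and the non-AVX path, so doing them up front lets the AVX-only block use a single scratch register for the comparisons (%xmm8 in the argument path, %xmm2 in the return-value path) and fall through to the common code instead of jumping over the plain movaps sequence.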