From: Craig Topper
Date: Sun, 9 Aug 2020 00:12:20 +0000 (-0700)
Subject: [X86] Autogenerate complete checks. NFC
X-Git-Tag: llvmorg-13-init~15283
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=95e61ed85c1abb7dab20ab665d6204ea66f5bf1f;p=platform%2Fupstream%2Fllvm.git

[X86] Autogenerate complete checks. NFC
---

diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll
index 57e632d..e480ba131 100644
--- a/llvm/test/CodeGen/X86/break-false-dep.ll
+++ b/llvm/test/CodeGen/X86/break-false-dep.ll
@@ -1,13 +1,22 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefixes=SSE,SSE-LINUX
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefixes=SSE,SSE-WIN
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefixes=AVX,AVX512VL
define double @t1(float* nocapture %x) nounwind readonly ssp {
-entry:
; SSE-LABEL: t1:
-; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0
-; SSE: cvtss2sd %xmm0, %xmm0
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: cvtss2sd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: t1:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
%0 = load float, float* %x, align 4
%1 = fpext float %0 to double
@@ -15,47 +24,99 @@ entry:
}
define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
+; SSE-LINUX-LABEL: t2:
+; SSE-LINUX: # %bb.0: # %entry
+; SSE-LINUX-NEXT: cvtsd2ss (%rdi), %xmm0
+; SSE-LINUX-NEXT: retq
+;
+; SSE-WIN-LABEL: t2:
+; SSE-WIN: # %bb.0: # %entry
+; SSE-WIN-NEXT: cvtsd2ss (%rcx), %xmm0
+; SSE-WIN-NEXT: retq
+;
+; AVX-LABEL: t2:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsd2ss (%rcx), %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
-; SSE-LABEL: t2:
-; SSE: cvtsd2ss ([[A0]]), %xmm0
%0 = load double, double* %x, align 8
%1 = fptrunc double %0 to float
ret float %1
}
define float @squirtf(float* %x) nounwind {
-entry:
; SSE-LABEL: squirtf:
-; SSE: movss ([[A0]]), %xmm0
-; SSE: sqrtss %xmm0, %xmm0
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: squirtf:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
%z = load float, float* %x
%t = call float @llvm.sqrt.f32(float %z)
ret float %t
}
define double @squirt(double* %x) nounwind {
-entry:
; SSE-LABEL: squirt:
-; SSE: movsd ([[A0]]), %xmm0
-; SSE: sqrtsd %xmm0, %xmm0
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: sqrtsd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: squirt:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry: %z = load double, double* %x %t = call double @llvm.sqrt.f64(double %z) ret double %t } define float @squirtf_size(float* %x) nounwind optsize { +; SSE-LINUX-LABEL: squirtf_size: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: sqrtss (%rdi), %xmm0 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: squirtf_size: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: sqrtss (%rcx), %xmm0 +; SSE-WIN-NEXT: retq +; +; AVX-LABEL: squirtf_size: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vsqrtss (%rcx), %xmm0, %xmm0 +; AVX-NEXT: retq entry: -; SSE-LABEL: squirtf_size: -; SSE: sqrtss ([[A0]]), %xmm0 %z = load float, float* %x %t = call float @llvm.sqrt.f32(float %z) ret float %t } define double @squirt_size(double* %x) nounwind optsize { +; SSE-LINUX-LABEL: squirt_size: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: sqrtsd (%rdi), %xmm0 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: squirt_size: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: sqrtsd (%rcx), %xmm0 +; SSE-WIN-NEXT: retq +; +; AVX-LABEL: squirt_size: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vsqrtsd (%rcx), %xmm0, %xmm0 +; AVX-NEXT: retq entry: -; SSE-LABEL: squirt_size: -; SSE: sqrtsd ([[A0]]), %xmm0 %z = load double, double* %x %t = call double @llvm.sqrt.f64(double %z) ret double %t @@ -64,23 +125,122 @@ entry: declare float @llvm.sqrt.f32(float) declare double @llvm.sqrt.f64(double) -; SSE-LABEL: loopdep1 -; SSE: for.body{{$}} -; ; This loop contains two cvtsi2ss instructions that update the same xmm ; register. Verify that the break false dependency fix pass breaks those ; dependencies by inserting xorps instructions. ; -; If the register allocator chooses different registers for the two cvtsi2ss -; instructions, they are still dependent on themselves. -; SSE: xorps [[XMM1:%xmm[0-9]+]] -; SSE: , [[XMM1]] -; SSE: cvtsi2ss %{{.*}}, [[XMM1]] -; SSE: xorps [[XMM2:%xmm[0-9]+]] -; SSE: , [[XMM2]] -; SSE: cvtsi2ss %{{.*}}, [[XMM2]] - define float @loopdep1(i32 %m) nounwind uwtable readnone ssp { +; SSE-LINUX-LABEL: loopdep1: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: testl %edi, %edi +; SSE-LINUX-NEXT: je .LBB6_1 +; SSE-LINUX-NEXT: # %bb.2: # %for.body.preheader +; SSE-LINUX-NEXT: movl $1, %eax +; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 +; SSE-LINUX-NEXT: xorps %xmm1, %xmm1 +; SSE-LINUX-NEXT: .p2align 4, 0x90 +; SSE-LINUX-NEXT: .LBB6_3: # %for.body +; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-LINUX-NEXT: xorps %xmm2, %xmm2 +; SSE-LINUX-NEXT: cvtsi2ss %eax, %xmm2 +; SSE-LINUX-NEXT: xorps %xmm3, %xmm3 +; SSE-LINUX-NEXT: cvtsi2ss %edi, %xmm3 +; SSE-LINUX-NEXT: addss %xmm2, %xmm0 +; SSE-LINUX-NEXT: addss %xmm3, %xmm1 +; SSE-LINUX-NEXT: incl %eax +; SSE-LINUX-NEXT: decl %edi +; SSE-LINUX-NEXT: jne .LBB6_3 +; SSE-LINUX-NEXT: # %bb.4: # %for.end +; SSE-LINUX-NEXT: subss %xmm1, %xmm0 +; SSE-LINUX-NEXT: retq +; SSE-LINUX-NEXT: .LBB6_1: +; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 +; SSE-LINUX-NEXT: xorps %xmm1, %xmm1 +; SSE-LINUX-NEXT: subss %xmm1, %xmm0 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: loopdep1: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: testl %ecx, %ecx +; SSE-WIN-NEXT: je .LBB6_1 +; SSE-WIN-NEXT: # %bb.2: # %for.body.preheader +; SSE-WIN-NEXT: movl $1, %eax +; SSE-WIN-NEXT: xorps %xmm0, %xmm0 +; SSE-WIN-NEXT: xorps %xmm1, %xmm1 +; SSE-WIN-NEXT: .p2align 4, 0x90 +; SSE-WIN-NEXT: .LBB6_3: # %for.body +; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-WIN-NEXT: xorps %xmm2, %xmm2 +; SSE-WIN-NEXT: cvtsi2ss %eax, %xmm2 +; SSE-WIN-NEXT: xorps %xmm3, %xmm3 +; SSE-WIN-NEXT: cvtsi2ss %ecx, %xmm3 +; SSE-WIN-NEXT: addss 
%xmm2, %xmm0 +; SSE-WIN-NEXT: addss %xmm3, %xmm1 +; SSE-WIN-NEXT: incl %eax +; SSE-WIN-NEXT: decl %ecx +; SSE-WIN-NEXT: jne .LBB6_3 +; SSE-WIN-NEXT: # %bb.4: # %for.end +; SSE-WIN-NEXT: subss %xmm1, %xmm0 +; SSE-WIN-NEXT: retq +; SSE-WIN-NEXT: .LBB6_1: +; SSE-WIN-NEXT: xorps %xmm0, %xmm0 +; SSE-WIN-NEXT: xorps %xmm1, %xmm1 +; SSE-WIN-NEXT: subss %xmm1, %xmm0 +; SSE-WIN-NEXT: retq +; +; AVX1-LABEL: loopdep1: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: testl %ecx, %ecx +; AVX1-NEXT: je .LBB6_1 +; AVX1-NEXT: # %bb.2: # %for.body.preheader +; AVX1-NEXT: movl $1, %eax +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB6_3: # %for.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vcvtsi2ss %eax, %xmm4, %xmm2 +; AVX1-NEXT: vcvtsi2ss %ecx, %xmm4, %xmm3 +; AVX1-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vaddss %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: incl %eax +; AVX1-NEXT: decl %ecx +; AVX1-NEXT: jne .LBB6_3 +; AVX1-NEXT: # %bb.4: # %for.end +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB6_1: +; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512VL-LABEL: loopdep1: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: testl %ecx, %ecx +; AVX512VL-NEXT: je .LBB6_1 +; AVX512VL-NEXT: # %bb.2: # %for.body.preheader +; AVX512VL-NEXT: movl $1, %eax +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB6_3: # %for.body +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: vcvtsi2ss %eax, %xmm3, %xmm2 +; AVX512VL-NEXT: vaddss %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %ecx, %xmm3, %xmm2 +; AVX512VL-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: incl %eax +; AVX512VL-NEXT: decl %ecx +; AVX512VL-NEXT: jne .LBB6_3 +; AVX512VL-NEXT: # %bb.4: # %for.end +; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; AVX512VL-NEXT: .LBB6_1: +; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq entry: %tobool3 = icmp eq i32 %m, 0 br i1 %tobool3, label %for.end, label %for.body @@ -111,15 +271,119 @@ for.end: ; preds = %for.body, %entry ; to avoid cyclic dependence on a write to the same register in a ; previous iteration. 
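;
; Editorial sketch, not part of this test or commit: the false dependency
; exists because cvtsi2ss/cvtsi2sd write only the low element of their xmm
; destination and leave the upper bits unchanged, so each convert implicitly
; reads whatever was last written to that register. The dependency-breaking
; idiom the pass inserts looks like this (register names hypothetical):
;
;   xorps %xmm0, %xmm0   # zero idiom, recognized as dependency-breaking
;   cvtsi2sd %rax, %xmm0 # no longer stalls on the previous writer of %xmm0
;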
-; AVX-LABEL: loopdep2: -; AVX-LABEL: %loop -; AVX: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}} -; AVX: vcvtsi2sd %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}} -; SSE-LABEL: loopdep2: -; SSE-LABEL: %loop -; SSE: xorps %[[REG:xmm.]], %[[REG]] -; SSE: cvtsi2sd %{{r[0-9a-x]+}}, %[[REG]] define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind { +; SSE-LINUX-LABEL: loopdep2: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: movq (%rdi), %rax +; SSE-LINUX-NEXT: movl $1, %ecx +; SSE-LINUX-NEXT: .p2align 4, 0x90 +; SSE-LINUX-NEXT: .LBB7_1: # %loop +; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 +; SSE-LINUX-NEXT: cvtsi2sd %rcx, %xmm0 +; SSE-LINUX-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; SSE-LINUX-NEXT: # xmm0 = mem[0],zero +; SSE-LINUX-NEXT: addsd (%rsi), %xmm0 +; SSE-LINUX-NEXT: cvttsd2si %xmm0, %rdx +; SSE-LINUX-NEXT: addq %rdx, %rax +; SSE-LINUX-NEXT: incq %rcx +; SSE-LINUX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 +; SSE-LINUX-NEXT: jne .LBB7_1 +; SSE-LINUX-NEXT: # %bb.2: # %ret +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: loopdep2: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: subq $184, %rsp +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movq (%rcx), %rax +; SSE-WIN-NEXT: movl $1, %r8d +; SSE-WIN-NEXT: .p2align 4, 0x90 +; SSE-WIN-NEXT: .LBB7_1: # %loop +; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-WIN-NEXT: xorps %xmm0, %xmm0 +; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm0 +; SSE-WIN-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; SSE-WIN-NEXT: # xmm0 = mem[0],zero +; SSE-WIN-NEXT: addsd (%rdx), %xmm0 +; SSE-WIN-NEXT: cvttsd2si %xmm0, %rcx +; SSE-WIN-NEXT: addq %rcx, %rax +; SSE-WIN-NEXT: incq %r8 +; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; SSE-WIN-NEXT: jne .LBB7_1 +; SSE-WIN-NEXT: # %bb.2: # %ret +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $184, %rsp +; 
SSE-WIN-NEXT: retq +; +; AVX-LABEL: loopdep2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $184, %rsp +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: movq (%rcx), %rax +; AVX-NEXT: movl $1, %r8d +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB7_1: # %loop +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vcvtsi2sd %r8, %xmm1, %xmm0 +; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero +; AVX-NEXT: vaddsd (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: incq %r8 +; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX-NEXT: jne .LBB7_1 +; AVX-NEXT: # %bb.2: # %ret +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $184, %rsp +; AVX-NEXT: retq entry: %vx = load i64, i64* %x br label %loop @@ -151,6 +415,191 @@ ret: @v = common global [1024 x i32] zeroinitializer, align 16 define void @loopdep3() { +; SSE-LINUX-LABEL: loopdep3: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: xorl %eax, %eax +; SSE-LINUX-NEXT: .p2align 4, 0x90 +; SSE-LINUX-NEXT: .LBB8_1: # %for.cond1.preheader +; SSE-LINUX-NEXT: # =>This Loop Header: Depth=1 +; SSE-LINUX-NEXT: # Child Loop BB8_2 Depth 2 +; SSE-LINUX-NEXT: movq $-4096, %rcx # imm = 0xF000 +; SSE-LINUX-NEXT: .p2align 4, 0x90 +; SSE-LINUX-NEXT: .LBB8_2: # %for.body3 +; SSE-LINUX-NEXT: # Parent Loop BB8_1 Depth=1 +; SSE-LINUX-NEXT: # => This Inner Loop Header: Depth=2 +; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 +; SSE-LINUX-NEXT: cvtsi2sdl v+4096(%rcx), %xmm0 +; SSE-LINUX-NEXT: mulsd x+8192(%rcx,%rcx), %xmm0 +; SSE-LINUX-NEXT: mulsd y+8192(%rcx,%rcx), %xmm0 +; SSE-LINUX-NEXT: mulsd z+8192(%rcx,%rcx), %xmm0 +; SSE-LINUX-NEXT: movsd %xmm0, w+8192(%rcx,%rcx) +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: addq $4, %rcx +; SSE-LINUX-NEXT: jne .LBB8_2 +; SSE-LINUX-NEXT: # %bb.3: # %for.inc14 +; SSE-LINUX-NEXT: # in Loop: Header=BB8_1 Depth=1 +; SSE-LINUX-NEXT: incl %eax +; SSE-LINUX-NEXT: cmpl $100000, %eax # imm = 0x186A0 +; SSE-LINUX-NEXT: jne .LBB8_1 +; SSE-LINUX-NEXT: # %bb.4: # 
%for.end16 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: loopdep3: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: pushq %rsi +; SSE-WIN-NEXT: .seh_pushreg %rsi +; SSE-WIN-NEXT: subq $160, %rsp +; SSE-WIN-NEXT: .seh_stackalloc 160 +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144 +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128 +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112 +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96 +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80 +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64 +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48 +; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32 +; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16 +; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 +; SSE-WIN-NEXT: .seh_endprologue +; SSE-WIN-NEXT: xorl %r9d, %r9d +; SSE-WIN-NEXT: leaq {{.*}}(%rip), %r8 +; SSE-WIN-NEXT: leaq {{.*}}(%rip), %r10 +; SSE-WIN-NEXT: leaq {{.*}}(%rip), %r11 +; SSE-WIN-NEXT: leaq {{.*}}(%rip), %rax +; SSE-WIN-NEXT: leaq {{.*}}(%rip), %rdx +; SSE-WIN-NEXT: .p2align 4, 0x90 +; SSE-WIN-NEXT: .LBB8_1: # %for.cond1.preheader +; SSE-WIN-NEXT: # =>This Loop Header: Depth=1 +; SSE-WIN-NEXT: # Child Loop BB8_2 Depth 2 +; SSE-WIN-NEXT: movq %r8, %rcx +; SSE-WIN-NEXT: xorl %esi, %esi +; SSE-WIN-NEXT: .p2align 4, 0x90 +; SSE-WIN-NEXT: .LBB8_2: # %for.body3 +; SSE-WIN-NEXT: # Parent Loop BB8_1 Depth=1 +; SSE-WIN-NEXT: # => This Inner Loop Header: Depth=2 +; SSE-WIN-NEXT: xorps %xmm0, %xmm0 +; SSE-WIN-NEXT: cvtsi2sdl (%rcx), %xmm0 +; SSE-WIN-NEXT: mulsd (%rsi,%r10), %xmm0 +; SSE-WIN-NEXT: mulsd (%rsi,%r11), %xmm0 +; SSE-WIN-NEXT: mulsd (%rsi,%rax), %xmm0 +; SSE-WIN-NEXT: movsd %xmm0, (%rsi,%rdx) +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: addq $8, %rsi +; SSE-WIN-NEXT: addq $4, %rcx +; SSE-WIN-NEXT: cmpq $8192, %rsi # imm = 0x2000 +; SSE-WIN-NEXT: jne .LBB8_2 +; SSE-WIN-NEXT: # %bb.3: # %for.inc14 +; SSE-WIN-NEXT: # in Loop: Header=BB8_1 Depth=1 +; SSE-WIN-NEXT: incl %r9d +; SSE-WIN-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; SSE-WIN-NEXT: jne .LBB8_1 +; SSE-WIN-NEXT: # %bb.4: # %for.end16 +; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $160, %rsp +; SSE-WIN-NEXT: popq %rsi +; SSE-WIN-NEXT: retq +; SSE-WIN-NEXT: .seh_handlerdata +; SSE-WIN-NEXT: .text +; 
SSE-WIN-NEXT: .seh_endproc +; +; AVX-LABEL: loopdep3: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rsi +; AVX-NEXT: .seh_pushreg %rsi +; AVX-NEXT: subq $160, %rsp +; AVX-NEXT: .seh_stackalloc 160 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm15, 144 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm14, 128 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm13, 112 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm12, 96 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm11, 80 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm10, 64 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm9, 48 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm8, 32 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm7, 16 +; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm6, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: xorl %r9d, %r9d +; AVX-NEXT: leaq {{.*}}(%rip), %r8 +; AVX-NEXT: leaq {{.*}}(%rip), %r10 +; AVX-NEXT: leaq {{.*}}(%rip), %r11 +; AVX-NEXT: leaq {{.*}}(%rip), %rax +; AVX-NEXT: leaq {{.*}}(%rip), %rdx +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB8_1: # %for.cond1.preheader +; AVX-NEXT: # =>This Loop Header: Depth=1 +; AVX-NEXT: # Child Loop BB8_2 Depth 2 +; AVX-NEXT: movq %r8, %rcx +; AVX-NEXT: xorl %esi, %esi +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB8_2: # %for.body3 +; AVX-NEXT: # Parent Loop BB8_1 Depth=1 +; AVX-NEXT: # => This Inner Loop Header: Depth=2 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdl (%rcx), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%r10), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%r11), %xmm0, %xmm0 +; AVX-NEXT: vmulsd (%rsi,%rax), %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rsi,%rdx) +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: addq $8, %rsi +; AVX-NEXT: addq $4, %rcx +; AVX-NEXT: cmpq $8192, %rsi # imm = 0x2000 +; AVX-NEXT: jne .LBB8_2 +; AVX-NEXT: # %bb.3: # %for.inc14 +; AVX-NEXT: # in Loop: Header=BB8_1 Depth=1 +; AVX-NEXT: incl %r9d +; AVX-NEXT: cmpl $100000, %r9d # imm = 0x186A0 +; AVX-NEXT: jne .LBB8_1 +; AVX-NEXT: # %bb.4: # %for.end16 +; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $160, %rsp +; AVX-NEXT: popq %rsi +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc entry: br label %for.cond1.preheader @@ -187,23 +636,148 @@ for.inc14: ; preds = %for.body3 for.end16: ; preds = %for.inc14 ret void -;SSE-LABEL:@loopdep3 -;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]] -;SSE-NEXT: cvtsi2sdl {{.*}}, 
[[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] -;SSE-NEXT: mulsd {{.*}}, [[XMM0]] -;SSE-NEXT: movsd [[XMM0]], -;AVX-LABEL:@loopdep3 -;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]] -;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}} -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] -;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] -;AVX-NEXT: vmovsd [[XMM0]], } define double @inlineasmdep(i64 %arg) { +; SSE-LINUX-LABEL: inlineasmdep: +; SSE-LINUX: # %bb.0: # %top +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 +; SSE-LINUX-NEXT: cvtsi2sd %rdi, %xmm0 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: inlineasmdep: +; SSE-WIN: # %bb.0: # %top +; SSE-WIN-NEXT: subq $168, %rsp +; SSE-WIN-NEXT: .seh_stackalloc 168 +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144 +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128 +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112 +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96 +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80 +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64 +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48 +; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32 +; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16 +; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 +; SSE-WIN-NEXT: .seh_endprologue +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: xorps %xmm0, %xmm0 +; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0 +; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $168, %rsp +; SSE-WIN-NEXT: retq 
+; SSE-WIN-NEXT: .seh_handlerdata +; SSE-WIN-NEXT: .text +; SSE-WIN-NEXT: .seh_endproc +; +; AVX-LABEL: inlineasmdep: +; AVX: # %bb.0: # %top +; AVX-NEXT: subq $168, %rsp +; AVX-NEXT: .seh_stackalloc 168 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm15, 144 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm14, 128 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm13, 112 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm12, 96 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm11, 80 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm10, 64 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm9, 48 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm8, 32 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm7, 16 +; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm6, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $168, %rsp +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"() @@ -215,14 +789,162 @@ top: tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@inlineasmdep -;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]] -;AVX-NEXT: vcvtsi2sd {{.*}}, [[XMM0]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers and ; hiding the false dependency behind a true dependency define double @truedeps(float %arg) { +; SSE-LINUX-LABEL: truedeps: +; SSE-LINUX: # %bb.0: # %top +; SSE-LINUX-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP 
+; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-LINUX-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-LINUX-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: truedeps: +; SSE-WIN: # %bb.0: # %top +; SSE-WIN-NEXT: subq $184, %rsp +; SSE-WIN-NEXT: .seh_stackalloc 184 +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm15, 160 +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm14, 144 +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm13, 128 +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm12, 112 +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm11, 96 +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm10, 80 +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm9, 64 +; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm8, 48 +; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm7, 32 +; SSE-WIN-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm6, 16 +; SSE-WIN-NEXT: .seh_endprologue +; SSE-WIN-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE-WIN-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE-WIN-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $184, %rsp +; SSE-WIN-NEXT: retq +; SSE-WIN-NEXT: .seh_handlerdata +; SSE-WIN-NEXT: .text +; SSE-WIN-NEXT: .seh_endproc +; +; AVX-LABEL: truedeps: +; AVX: # %bb.0: # %top +; AVX-NEXT: subq $184, %rsp +; AVX-NEXT: .seh_stackalloc 184 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm15, 160 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm14, 144 +; 
AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm13, 128 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm12, 112 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm11, 96 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm10, 80 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm9, 64 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm8, 48 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm7, 32 +; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm6, 16 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $184, %rsp +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() @@ -235,14 +957,156 @@ top: tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = fpext float %arg to double ret double %tmp1 -;AVX-LABEL:@truedeps -;AVX-NOT: vxorps -;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers and ; choosing the register with the highest clearence define double @clearence(i64 %arg) { +; SSE-LINUX-LABEL: clearence: +; SSE-LINUX: # %bb.0: # %top +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 +; SSE-LINUX-NEXT: cvtsi2sd %rdi, %xmm0 +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: clearence: 
+; SSE-WIN: # %bb.0: # %top +; SSE-WIN-NEXT: subq $168, %rsp +; SSE-WIN-NEXT: .seh_stackalloc 168 +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm15, 144 +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm14, 128 +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm13, 112 +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm12, 96 +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm11, 80 +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm10, 64 +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm9, 48 +; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm8, 32 +; SSE-WIN-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm7, 16 +; SSE-WIN-NEXT: movaps %xmm6, (%rsp) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm6, 0 +; SSE-WIN-NEXT: .seh_endprologue +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: xorps %xmm0, %xmm0 +; SSE-WIN-NEXT: cvtsi2sd %rcx, %xmm0 +; SSE-WIN-NEXT: movaps (%rsp), %xmm6 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $168, %rsp +; SSE-WIN-NEXT: retq +; SSE-WIN-NEXT: .seh_handlerdata +; SSE-WIN-NEXT: .text +; SSE-WIN-NEXT: .seh_endproc +; +; AVX-LABEL: clearence: +; AVX: # %bb.0: # %top +; AVX-NEXT: subq $168, %rsp +; AVX-NEXT: .seh_stackalloc 168 +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm15, 144 +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm14, 128 +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm13, 112 +; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm12, 96 +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm11, 80 +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm10, 64 +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm9, 48 +; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: 
.seh_savexmm %xmm8, 32 +; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm7, 16 +; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill +; AVX-NEXT: .seh_savexmm %xmm6, 0 +; AVX-NEXT: .seh_endprologue +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vxorps %xmm6, %xmm6, %xmm6 +; AVX-NEXT: vcvtsi2sd %rcx, %xmm6, %xmm0 +; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $168, %rsp +; AVX-NEXT: retq +; AVX-NEXT: .seh_handlerdata +; AVX-NEXT: .text +; AVX-NEXT: .seh_endproc top: tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"() @@ -255,9 +1119,6 @@ top: tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"() %tmp1 = sitofp i64 %arg to double ret double %tmp1 -;AVX-LABEL:@clearence -;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]] -;AVX-NEXT: vcvtsi2sd {{.*}}, [[XMM6]], {{%xmm[0-9]+}} } ; Make sure we are making a smart choice regarding undef registers in order to @@ -265,6 +1126,136 @@ top: ; iteration, especially when we cannot zero out the undef register because it ; is alive. 
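;
; Editorial sketch, not part of this test or commit: with AVX the convert has
; an explicit xmm source operand that only supplies the upper bits of the
; result, e.g. (register names hypothetical):
;
;   vcvtsi2sd %r8, %xmm5, %xmm4 # upper bits of %xmm4 are merged in from %xmm5
;
; so when the destination cannot be zeroed, the pass can still break the
; loop-carried chain by choosing as that source an undef register that
; nothing inside the loop writes.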
define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind { +; SSE-LINUX-LABEL: loopclearence: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: movq (%rdi), %rax +; SSE-LINUX-NEXT: movl $1, %ecx +; SSE-LINUX-NEXT: .p2align 4, 0x90 +; SSE-LINUX-NEXT: .LBB12_1: # %loop +; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 +; SSE-LINUX-NEXT: cvtsi2sd %rcx, %xmm4 +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: addsd (%rsi), %xmm4 +; SSE-LINUX-NEXT: cvttsd2si %xmm4, %rdx +; SSE-LINUX-NEXT: addq %rdx, %rax +; SSE-LINUX-NEXT: incq %rcx +; SSE-LINUX-NEXT: cmpq $156250000, %rcx # imm = 0x9502F90 +; SSE-LINUX-NEXT: jne .LBB12_1 +; SSE-LINUX-NEXT: # %bb.2: # %ret +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: loopclearence: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: subq $136, %rsp +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: movaps %xmm8, (%rsp) # 16-byte Spill +; SSE-WIN-NEXT: movq (%rcx), %rax +; SSE-WIN-NEXT: movl $1, %r8d +; SSE-WIN-NEXT: .p2align 4, 0x90 +; SSE-WIN-NEXT: .LBB12_1: # %loop +; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-WIN-NEXT: xorps %xmm4, %xmm4 +; SSE-WIN-NEXT: cvtsi2sd %r8, %xmm4 +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: addsd (%rdx), %xmm4 +; SSE-WIN-NEXT: cvttsd2si %xmm4, %rcx +; SSE-WIN-NEXT: addq %rcx, %rax +; SSE-WIN-NEXT: incq %r8 +; SSE-WIN-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; SSE-WIN-NEXT: jne .LBB12_1 +; SSE-WIN-NEXT: # %bb.2: # %ret +; SSE-WIN-NEXT: movaps (%rsp), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $136, %rsp +; SSE-WIN-NEXT: retq +; +; AVX-LABEL: loopclearence: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $136, %rsp +; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill +; AVX-NEXT: movq (%rcx), %rax +; AVX-NEXT: movl $1, %r8d +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB12_1: # %loop +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vcvtsi2sd %r8, %xmm5, %xmm4 +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: #APP +; AVX-NEXT: #NO_APP +; AVX-NEXT: vaddsd (%rdx), %xmm4, %xmm0 +; AVX-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: incq %r8 +; AVX-NEXT: cmpq $156250000, %r8 # imm = 0x9502F90 +; AVX-NEXT: jne .LBB12_1 +; AVX-NEXT: # %bb.2: # %ret +; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: addq $136, %rsp +; AVX-NEXT: retq entry: %vx = load i64, i64* %x br label %loop @@ -288,11 +1279,6 @@ loop: br i1 %exitcond, label %ret, label %loop ret: ret i64 %s2 -;AVX-LABEL:@loopclearence -;Registers 4-7 are not used and therefore one of them should be chosen -;AVX-NOT: {{%xmm[4-7]}} -;AVX: vcvtsi2sd {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} -;AVX-NOT: [[XMM4_7]] } ; Make sure we are making a smart choice regarding undef registers even for more @@ -300,6 +1286,305 @@ ret: ; julia> a = falses(10000); a[1:4:end] = true ; julia> linspace(1.0,2.0,10000)[a] define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { +; SSE-LINUX-LABEL: loopclearance2: +; SSE-LINUX: # %bb.0: # %entry +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: #APP +; SSE-LINUX-NEXT: #NO_APP +; SSE-LINUX-NEXT: movl $1, %r8d +; SSE-LINUX-NEXT: xorl %ecx, %ecx +; SSE-LINUX-NEXT: .p2align 4, 0x90 +; SSE-LINUX-NEXT: .LBB13_1: # %inner_loop +; SSE-LINUX-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-LINUX-NEXT: movq %rcx, %rax +; SSE-LINUX-NEXT: shrq $6, %rcx +; SSE-LINUX-NEXT: movq (%rsi,%rcx,8), %rcx +; SSE-LINUX-NEXT: btq %rax, %rcx +; SSE-LINUX-NEXT: leaq 1(%rax), %rcx +; SSE-LINUX-NEXT: jae .LBB13_1 +; SSE-LINUX-NEXT: # %bb.2: # %loop_end +; SSE-LINUX-NEXT: # in Loop: Header=BB13_1 Depth=1 +; SSE-LINUX-NEXT: leaq 1(%r8), %r9 +; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 +; SSE-LINUX-NEXT: cvtsi2sd %r9, %xmm4 +; SSE-LINUX-NEXT: movapd %xmm0, %xmm5 +; SSE-LINUX-NEXT: subsd %xmm4, %xmm5 +; SSE-LINUX-NEXT: mulsd %xmm1, %xmm5 +; SSE-LINUX-NEXT: leaq -1(%rcx), %rax +; SSE-LINUX-NEXT: xorps %xmm4, %xmm4 +; SSE-LINUX-NEXT: cvtsi2sd %rax, %xmm4 +; 
SSE-LINUX-NEXT: mulsd %xmm2, %xmm4 +; SSE-LINUX-NEXT: addsd %xmm5, %xmm4 +; SSE-LINUX-NEXT: divsd %xmm3, %xmm4 +; SSE-LINUX-NEXT: movsd %xmm4, -8(%rdi,%r8,8) +; SSE-LINUX-NEXT: movq %r9, %r8 +; SSE-LINUX-NEXT: cmpq %r9, %rdx +; SSE-LINUX-NEXT: jge .LBB13_1 +; SSE-LINUX-NEXT: # %bb.3: # %loopdone +; SSE-LINUX-NEXT: retq +; +; SSE-WIN-LABEL: loopclearance2: +; SSE-WIN: # %bb.0: # %entry +; SSE-WIN-NEXT: subq $152, %rsp +; SSE-WIN-NEXT: .seh_stackalloc 152 +; SSE-WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm15, 128 +; SSE-WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm14, 112 +; SSE-WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm13, 96 +; SSE-WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm12, 80 +; SSE-WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm11, 64 +; SSE-WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm10, 48 +; SSE-WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm9, 32 +; SSE-WIN-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm8, 16 +; SSE-WIN-NEXT: movaps %xmm7, (%rsp) # 16-byte Spill +; SSE-WIN-NEXT: .seh_savexmm %xmm7, 0 +; SSE-WIN-NEXT: .seh_endprologue +; SSE-WIN-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: #APP +; SSE-WIN-NEXT: #NO_APP +; SSE-WIN-NEXT: movl $1, %r9d +; SSE-WIN-NEXT: xorl %r11d, %r11d +; SSE-WIN-NEXT: .p2align 4, 0x90 +; SSE-WIN-NEXT: .LBB13_1: # %inner_loop +; SSE-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-WIN-NEXT: movq %r11, %r10 +; SSE-WIN-NEXT: movq %r11, %rax +; SSE-WIN-NEXT: shrq $6, %rax +; SSE-WIN-NEXT: movq (%rdx,%rax,8), %rax +; SSE-WIN-NEXT: btq %r11, %rax +; SSE-WIN-NEXT: leaq 1(%r11), %r11 +; SSE-WIN-NEXT: jae .LBB13_1 +; SSE-WIN-NEXT: # %bb.2: # %loop_end +; SSE-WIN-NEXT: # in Loop: Header=BB13_1 Depth=1 +; SSE-WIN-NEXT: leaq 1(%r9), %r10 +; SSE-WIN-NEXT: xorps %xmm4, %xmm4 +; SSE-WIN-NEXT: cvtsi2sd %r10, %xmm4 +; SSE-WIN-NEXT: movapd %xmm2, %xmm5 +; SSE-WIN-NEXT: subsd %xmm4, %xmm5 +; SSE-WIN-NEXT: mulsd %xmm3, %xmm5 +; SSE-WIN-NEXT: leaq -1(%r11), %rax +; SSE-WIN-NEXT: xorps %xmm4, %xmm4 +; SSE-WIN-NEXT: cvtsi2sd %rax, %xmm4 +; SSE-WIN-NEXT: mulsd %xmm1, %xmm4 +; SSE-WIN-NEXT: addsd %xmm5, %xmm4 +; SSE-WIN-NEXT: divsd %xmm0, %xmm4 +; SSE-WIN-NEXT: movsd %xmm4, -8(%rcx,%r9,8) +; SSE-WIN-NEXT: movq %r10, %r9 +; SSE-WIN-NEXT: cmpq %r10, %r8 +; SSE-WIN-NEXT: jge .LBB13_1 +; SSE-WIN-NEXT: # %bb.3: # %loopdone +; SSE-WIN-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-WIN-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-WIN-NEXT: addq $152, %rsp +; SSE-WIN-NEXT: retq +; SSE-WIN-NEXT: .seh_handlerdata +; SSE-WIN-NEXT: .text +; SSE-WIN-NEXT: .seh_endproc +; +; AVX1-LABEL: loopclearance2: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $152, %rsp +; AVX1-NEXT: .seh_stackalloc 152 +; AVX1-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm15, 128 +; AVX1-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm14, 112 +; AVX1-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm13, 96 +; AVX1-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm12, 80 +; AVX1-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm11, 64 +; AVX1-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm10, 48 +; AVX1-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm9, 32 +; AVX1-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm8, 16 +; AVX1-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX1-NEXT: .seh_savexmm %xmm7, 0 +; AVX1-NEXT: .seh_endprologue +; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: #APP +; AVX1-NEXT: #NO_APP +; AVX1-NEXT: movl $1, %r9d +; AVX1-NEXT: xorl %r11d, %r11d +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB13_1: # %inner_loop +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: movq %r11, %r10 +; AVX1-NEXT: movq %r11, %rax +; AVX1-NEXT: shrq $6, %rax +; AVX1-NEXT: movq (%rdx,%rax,8), %rax +; AVX1-NEXT: btq %r11, %rax +; AVX1-NEXT: leaq 1(%r11), %r11 +; AVX1-NEXT: jae .LBB13_1 +; AVX1-NEXT: # %bb.2: # %loop_end +; AVX1-NEXT: # in Loop: Header=BB13_1 Depth=1 +; AVX1-NEXT: leaq 1(%r9), %r10 +; AVX1-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4 +; AVX1-NEXT: vsubsd %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vmulsd %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: leaq -1(%r11), %rax +; AVX1-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5 +; AVX1-NEXT: vmulsd %xmm1, %xmm5, %xmm5 +; AVX1-NEXT: vaddsd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vdivsd %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) +; AVX1-NEXT: movq %r10, %r9 +; AVX1-NEXT: cmpq %r10, %r8 +; AVX1-NEXT: jge .LBB13_1 +; AVX1-NEXT: # %bb.3: # %loopdone +; AVX1-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX1-NEXT: addq $152, %rsp +; AVX1-NEXT: retq 
+; AVX1-NEXT: .seh_handlerdata +; AVX1-NEXT: .text +; AVX1-NEXT: .seh_endproc +; +; AVX512VL-LABEL: loopclearance2: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: subq $152, %rsp +; AVX512VL-NEXT: .seh_stackalloc 152 +; AVX512VL-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm15, 128 +; AVX512VL-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm14, 112 +; AVX512VL-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm13, 96 +; AVX512VL-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm12, 80 +; AVX512VL-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm11, 64 +; AVX512VL-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm10, 48 +; AVX512VL-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm9, 32 +; AVX512VL-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm8, 16 +; AVX512VL-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: .seh_savexmm %xmm7, 0 +; AVX512VL-NEXT: .seh_endprologue +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: #APP +; AVX512VL-NEXT: #NO_APP +; AVX512VL-NEXT: movl $1, %r9d +; AVX512VL-NEXT: xorl %r11d, %r11d +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB13_1: # %inner_loop +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: movq %r11, %r10 +; AVX512VL-NEXT: movq %r11, %rax +; AVX512VL-NEXT: shrq $6, %rax +; AVX512VL-NEXT: movq (%rdx,%rax,8), %rax +; AVX512VL-NEXT: btq %r11, %rax +; AVX512VL-NEXT: leaq 1(%r11), %r11 +; AVX512VL-NEXT: jae .LBB13_1 +; AVX512VL-NEXT: # %bb.2: # %loop_end +; AVX512VL-NEXT: # in Loop: Header=BB13_1 Depth=1 +; AVX512VL-NEXT: leaq 1(%r9), %r10 +; AVX512VL-NEXT: vcvtsi2sd %r10, %xmm6, %xmm4 +; AVX512VL-NEXT: vsubsd %xmm4, %xmm2, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm3, %xmm4, %xmm4 +; AVX512VL-NEXT: leaq -1(%r11), %rax +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm6, %xmm5 +; AVX512VL-NEXT: vmulsd %xmm1, %xmm5, %xmm5 +; AVX512VL-NEXT: vaddsd %xmm5, %xmm4, %xmm4 +; AVX512VL-NEXT: vdivsd %xmm0, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovsd %xmm4, -8(%rcx,%r9,8) +; AVX512VL-NEXT: movq %r10, %r9 +; AVX512VL-NEXT: cmpq %r10, %r8 +; AVX512VL-NEXT: jge .LBB13_1 +; AVX512VL-NEXT: # %bb.3: # %loopdone +; AVX512VL-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512VL-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; 
AVX512VL-NEXT: addq $152, %rsp +; AVX512VL-NEXT: retq +; AVX512VL-NEXT: .seh_handlerdata +; AVX512VL-NEXT: .text +; AVX512VL-NEXT: .seh_endproc entry: tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() @@ -334,14 +1619,9 @@ loop_end: ; Register use, plus us clobbering 7-15 above, basically forces xmm6 here as ; the only reasonable choice. The primary thing we care about is that it's ; not one of the registers used in the loop (e.g. not the output reg here) -;AVX-NOT: %xmm6 -;AVX: vcvtsi2sd {{.*}}, %xmm6, {{%xmm[0-9]+}} -;AVX-NOT: %xmm6 %nexti_f = sitofp i64 %nexti to double %sub = fsub double %c1, %nexti_f %mul = fmul double %sub, %c2 -;AVX: vcvtsi2sd {{.*}}, %xmm6, {{%xmm[0-9]+}} -;AVX-NOT: %xmm6 %phi_f = sitofp i64 %phi to double %mul2 = fmul double %phi_f, %c3 %add2 = fadd double %mul, %mul2