From bdb1729c8347c2b7d932976f8588e524b4f7b8d5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 22 Feb 2020 18:50:41 -0800 Subject: [PATCH] [X86] Teach EltsFromConsecutiveLoads that it's ok to form a v4f32 VZEXT_LOAD with a 64 bit memory size on SSE1 targets. We can use MOVLPS which will load 64 bits, but we need a v4f32 result type. We already have isel patterns for this. The code here is a little hacky. We can probably improve it with more isel patterns. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++ llvm/test/CodeGen/X86/atomic-fp.ll | 55 +++++++++---------- llvm/test/CodeGen/X86/atomic-non-integer.ll | 10 ++-- .../X86/merge-consecutive-loads-128.ll | 29 ++++------ 4 files changed, 43 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6d4c067ed0f6..d66474f641cd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8284,6 +8284,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); + // Allow v4f32 on SSE1 only targets. + // FIXME: Add more isel patterns so we can just use VT directly. + if (!Subtarget.hasSSE2() && VT == MVT::v4f32) + VecVT = MVT::v4f32; if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll index 24950b53fb41..1f8f64399831 100644 --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -122,13 +122,12 @@ define void @fadd_64r(double* %loc, double %val) nounwind { ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %ecx, (%esp) +; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, (%eax) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp @@ -295,13 +294,12 @@ define void @fadd_64g() nounwind { ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: fld1 -; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: faddl (%esp) +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, glob64 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp @@ -466,13 +464,12 @@ define void @fadd_64imm() nounwind { ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: fld1 -; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: faddl (%esp) +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, -559038737 ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp @@ -643,13 +640,12 @@ define void @fadd_64stack() nounwind { ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %eax, (%esp) ; X86-SSE1-NEXT: fld1 -; X86-SSE1-NEXT: faddl {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: faddl (%esp) +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE1-NEXT: movl %ebp, %esp ; X86-SSE1-NEXT: popl %ebp @@ -760,13 +756,12 @@ define void @fadd_array(i64* %arg, double %arg1, i64 %arg2) nounwind { ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE1-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-SSE1-NEXT: fldl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: movl %edx, (%esp) +; X86-SSE1-NEXT: fldl (%esp) ; X86-SSE1-NEXT: faddl 12(%ebp) -; X86-SSE1-NEXT: fstpl (%esp) -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, (%ecx,%eax,8) ; X86-SSE1-NEXT: leal -4(%ebp), %esp ; X86-SSE1-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll index ef31b2758dfe..e635a59cfdae 100644 --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -56,9 +56,8 @@ define void @store_double(double* %fptr, double %v) { ; X86-SSE1-LABEL: store_double: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, (%eax) ; X86-SSE1-NEXT: retl ; @@ -553,9 +552,8 @@ define void @store_double_seq_cst(double* %fptr, double %v) { ; X86-SSE1-LABEL: store_double_seq_cst: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: xorps %xmm0, %xmm0 +; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X86-SSE1-NEXT: movlps %xmm0, (%eax) ; X86-SSE1-NEXT: lock orl $0, (%esp) ; X86-SSE1-NEXT: retl diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index c6cd75b21c7f..0548886c3d16 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -159,9 +159,8 @@ define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-LABEL: merge_4f32_f32_34uu: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_34uu: @@ -242,11 +241,8 @@ define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-LABEL: merge_4f32_f32_45zz: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: xorps %xmm1, %xmm1 -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_45zz: @@ -286,10 +282,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-LABEL: merge_4f32_f32_012u: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; @@ -335,10 +330,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-LABEL: merge_4f32_f32_019u: ; X32-SSE1: # %bb.0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE1-NEXT: xorps %xmm0, %xmm0 +; X32-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; @@ -1197,11 +1191,8 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile: -- 2.34.1