From e5015d8abad0e4980c3a11cd4d68f65907504ffd Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Sat, 21 Jun 2014 01:31:15 +0000 Subject: [PATCH] [X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 --- llvm/lib/Target/X86/X86InstrSSE.td | 46 ++++++++++ llvm/test/CodeGen/X86/sse3-avx-addsub.ll | 143 +++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) create mode 100644 llvm/test/CodeGen/X86/sse3-avx-addsub.ll diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 11c3f11..a33d2cc 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5355,6 +5355,52 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { f128mem, SSE_ALU_F64P>, PD; } +// Patterns used to select 'addsub' instructions. +let Predicates = [HasAVX] in { + // Constant 170 corresponds to the binary mask '10101010'. + // When used as a blend mask, it allows selecting eight elements from two + // input vectors as follows: + // - Even-numbered values in the destination are copied from + // the corresponding elements in the first input vector; + // - Odd-numbered values in the destination are copied from + // the corresponding elements in the second input vector. 
+ + def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), + (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + + // Constant 10 corresponds to the binary mask '1010'. + // In the two patterns below, constant 10 is used as a blend mask to select + // - the 1st and 3rd element from the first input vector (the 'fsub' node); + // - the 2nd and 4th element from the second input vector (the 'fadd' node). + + def : Pat<(v4f64 (X86Shufp (v4f64 (fsub VR256:$lhs, VR256:$rhs)), + (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i8 10))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; +} + +let Predicates = [UseSSE3] in { + // Constant 10 corresponds to the binary mask '1010'. + // In the pattern below, it is used as a blend mask to select: + // - the 1st and 3rd element from the first input vector (the fsub node); + // - the 2nd and 4th element from the second input vector (the fadd node). 
+ + def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), + (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; + + def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), + (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; +} + //===---------------------------------------------------------------------===// // SSE3 Instructions //===---------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub.ll new file mode 100644 index 0000000..d7874e4 --- /dev/null +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub.ll @@ -0,0 +1,143 @@ +; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK + +; Test ADDSUB ISel patterns. + +; All the functions below are obtained from the following source: +; +; typedef double double2 __attribute__((ext_vector_type(2))); +; typedef double double4 __attribute__((ext_vector_type(4))); +; typedef float float4 __attribute__((ext_vector_type(4))); +; typedef float float8 __attribute__((ext_vector_type(8))); +; +; float4 test1(float4 A, float4 B) { +; float4 X = A - B; +; float4 Y = A + B; +; return (float4){X[0], Y[1], X[2], Y[3]}; +; } +; +; float8 test2(float8 A, float8 B) { +; float8 X = A - B; +; float8 Y = A + B; +; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]}; +; } +; +; double4 test3(double4 A, double4 B) { +; double4 X = A - B; +; double4 Y = A + B; +; return (double4){X[0], Y[1], X[2], Y[3]}; +; } +; +; double2 test4(double2 A, double2 B) { +; double2 X = A - B; +; double2 Y = A + B; +; return (double2){X[0], Y[1]}; +; } + +define <4 x float> @test1(<4 x float> %A, <4 x float> %B) { + %sub = fsub <4 x float> %A, %B + %add = fadd <4 x float> %A, %B + %vecinit6 = shufflevector <4 x float> %sub, 
<4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %vecinit6 +} +; CHECK-LABEL: test1 +; SSE: addsubps +; AVX: vaddsubps +; CHECK-NEXT: ret + + +define <8 x float> @test2(<8 x float> %A, <8 x float> %B) { + %sub = fsub <8 x float> %A, %B + %add = fadd <8 x float> %A, %B + %vecinit14 = shufflevector <8 x float> %sub, <8 x float> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x float> %vecinit14 +} +; CHECK-LABEL: test2 +; SSE: addsubps +; SSE-NEXT: addsubps +; AVX: vaddsubps +; AVX-NOT: vaddsubps +; CHECK: ret + + +define <4 x double> @test3(<4 x double> %A, <4 x double> %B) { + %sub = fsub <4 x double> %A, %B + %add = fadd <4 x double> %A, %B + %vecinit6 = shufflevector <4 x double> %sub, <4 x double> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x double> %vecinit6 +} +; CHECK-LABEL: test3 +; SSE: addsubpd +; SSE: addsubpd +; AVX: vaddsubpd +; AVX-NOT: vaddsubpd +; CHECK: ret + + +define <2 x double> @test4(<2 x double> %A, <2 x double> %B) #0 { + %add = fadd <2 x double> %A, %B + %sub = fsub <2 x double> %A, %B + %vecinit2 = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3> + ret <2 x double> %vecinit2 +} +; CHECK-LABEL: test4 +; SSE: addsubpd +; AVX: vaddsubpd +; CHECK-NEXT: ret + + +define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) { + %1 = load <4 x float>* %B + %add = fadd <4 x float> %A, %1 + %sub = fsub <4 x float> %A, %1 + %vecinit6 = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %vecinit6 +} +; CHECK-LABEL: test1b +; SSE: addsubps +; AVX: vaddsubps +; CHECK-NEXT: ret + + +define <8 x float> @test2b(<8 x float> %A, <8 x float>* %B) { + %1 = load <8 x float>* %B + %add = fadd <8 x float> %A, %1 + %sub = fsub <8 x float> %A, %1 + %vecinit14 = shufflevector <8 x float> %sub, <8 x float> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> + ret <8 x float> %vecinit14 +} +; CHECK-LABEL: test2b +; SSE: addsubps +; SSE-NEXT: addsubps +; AVX: vaddsubps +; AVX-NOT: vaddsubps +; CHECK: ret + + +define <4 x double> @test3b(<4 x double> %A, <4 x double>* %B) { + %1 = load <4 x double>* %B + %add = 
fadd <4 x double> %A, %1 + %sub = fsub <4 x double> %A, %1 + %vecinit6 = shufflevector <4 x double> %sub, <4 x double> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x double> %vecinit6 +} +; CHECK-LABEL: test3b +; SSE: addsubpd +; SSE: addsubpd +; AVX: vaddsubpd +; AVX-NOT: vaddsubpd +; CHECK: ret + + +define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) { + %1 = load <2 x double>* %B + %sub = fsub <2 x double> %A, %1 + %add = fadd <2 x double> %A, %1 + %vecinit2 = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3> + ret <2 x double> %vecinit2 +} +; CHECK-LABEL: test4b +; SSE: addsubpd +; AVX: vaddsubpd +; CHECK-NEXT: ret + -- 2.7.4