From 6eb66890e473d17c73f5167ed1d182598ba501c9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 12 Feb 2023 18:13:52 +0000
Subject: [PATCH] [X86] Add some basic matrix multiplication test coverage

Based off the IR generated from matrix_type / -fenable-matrix - including 2x2, 3x3, 4x4 and 8x8 matrices
---
 llvm/test/CodeGen/X86/matrix-multiply.ll | 5084 ++++++++++++++++++++++++++++++
 1 file changed, 5084 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/matrix-multiply.ll

diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
new file mode 100644
index 0000000..076f333
--- /dev/null
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -0,0 +1,5084 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
+
+;
+; Basic matrix multiply tests based on the pattern:
+;
+; using matrix_ty = float __attribute__((matrix_type(2,2)));
+; matrix_ty test_mul2x2(matrix_ty a0, matrix_ty a1) nounwind {
+;   return a0 * a1;
+; }
+;
+
+define <4 x float> @test_mul2x2_f32(<4 x float> %a0, <4 x float> %a1) nounwind {
+; SSE-LABEL: test_mul2x2_f32:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: mulps %xmm0, %xmm2
+; SSE-NEXT: movaps %xmm1, %xmm3
+; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE-NEXT: mulps %xmm0, %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE-NEXT: mulps %xmm4, %xmm0
+; SSE-NEXT: addps %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE-NEXT: mulps %xmm4, %xmm1
+; SSE-NEXT: addps %xmm3, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: test_mul2x2_f32:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT: vmovsldup {{.*#+}} xmm3 = xmm1[0,0,2,2]
+; AVX1-NEXT: vmulps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-NEXT: vmulps %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX1-NEXT: vmulps %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mul2x2_f32:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm3
+; AVX2-NEXT: vmulps %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX2-NEXT: vmulps %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX2-NEXT: vmulps %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpermilps {{.*#+}}
xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_mul2x2_f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX512-NEXT: vmulps %xmm3, %xmm0, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX512-NEXT: vmulps %xmm4, %xmm2, %xmm4 +; AVX512-NEXT: vaddps %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX512-NEXT: vmulps %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512-NEXT: retq +entry: + %split = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> + %split1 = shufflevector <4 x float> %a0, <4 x float> poison, <2 x i32> + %splat.splat = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> zeroinitializer + %0 = fmul <2 x float> %split, %splat.splat + %splat.splat6 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> + %1 = fmul <2 x float> %split1, %splat.splat6 + %2 = fadd <2 x float> %0, %1 + %splat.splat9 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> + %3 = fmul <2 x float> %split, %splat.splat9 + %splat.splat12 = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> + %4 = fmul <2 x float> %split1, %splat.splat12 + %5 = fadd <2 x float> %3, %4 + %6 = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> + ret <4 x float> %6 +} + +define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwind { +; SSE-LABEL: test_mul2x2_f64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movapd %xmm2, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm2[0] +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: movapd %xmm3, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul2x2_f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm1[1,1,3,3] +; AVX1-NEXT: vmulpd %ymm3, %ymm2, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul2x2_f64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpermilpd {{.*#+}} ymm2 = ymm1[1,1,3,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_mul2x2_f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpermilpd {{.*#+}} ymm2 = ymm1[1,1,3,3] +; AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX512-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vaddpd %ymm2, %ymm0, 
%ymm0 +; AVX512-NEXT: retq +entry: + %split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> + %split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> + %splat.splat = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> zeroinitializer + %0 = fmul <2 x double> %split, %splat.splat + %splat.splat6 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> + %1 = fmul <2 x double> %split1, %splat.splat6 + %2 = fadd <2 x double> %0, %1 + %splat.splat9 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> + %3 = fmul <2 x double> %split, %splat.splat9 + %splat.splat12 = shufflevector <4 x double> %a1, <4 x double> undef, <2 x i32> + %4 = fmul <2 x double> %split1, %splat.splat12 + %5 = fadd <2 x double> %3, %4 + %6 = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> + ret <4 x double> %6 +} + +define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { +; SSE-LABEL: test_mul3x3_f32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm9 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm12 = mem[0],zero,zero,zero +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movaps %xmm2, %xmm13 +; SSE-NEXT: mulss %xmm12, %xmm13 +; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0,0,1,1] +; SSE-NEXT: mulps %xmm0, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: mulss %xmm11, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] +; SSE-NEXT: mulps %xmm3, %xmm11 +; SSE-NEXT: addps %xmm12, %xmm11 +; SSE-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE-NEXT: movaps %xmm9, %xmm7 +; SSE-NEXT: mulss %xmm4, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-NEXT: mulps %xmm6, %xmm4 +; SSE-NEXT: addps %xmm11, %xmm4 +; SSE-NEXT: movss {{.*#+}} xmm11 = mem[0],zero,zero,zero +; SSE-NEXT: addss %xmm13, %xmm1 +; SSE-NEXT: addss %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: mulss %xmm11, %xmm7 +; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0,0,1,1] +; SSE-NEXT: mulps %xmm0, %xmm11 +; SSE-NEXT: movaps %xmm5, %xmm12 +; SSE-NEXT: mulss %xmm10, %xmm12 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] +; SSE-NEXT: mulps %xmm3, %xmm10 +; SSE-NEXT: addps %xmm11, %xmm10 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: mulss %xmm8, %xmm11 +; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0,0,1,1] +; SSE-NEXT: mulps %xmm6, %xmm8 +; SSE-NEXT: addps %xmm10, %xmm8 +; SSE-NEXT: addss %xmm7, %xmm12 +; SSE-NEXT: addss %xmm11, %xmm12 +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm12[0] +; SSE-NEXT: movss {{.*#+}} xmm10 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm10, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0,0,1,1] +; SSE-NEXT: mulps %xmm0, %xmm10 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm0, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: addps %xmm10, %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: mulss %xmm3, %xmm9 +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0,0,1,1] +; SSE-NEXT: mulps %xmm6, %xmm3 +; SSE-NEXT: addps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm7 = 
xmm7[1,2],xmm3[0,1] +; SSE-NEXT: addss %xmm2, %xmm5 +; SSE-NEXT: addss %xmm9, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm1[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0] +; SSE-NEXT: movss %xmm5, 32(%rdi) +; SSE-NEXT: movaps %xmm7, 16(%rdi) +; SSE-NEXT: movaps %xmm4, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul3x3_f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm1 +; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm9 +; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 +; AVX1-NEXT: vmulps %xmm4, %xmm3, %xmm10 +; AVX1-NEXT: vaddps %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 +; AVX1-NEXT: vmulps %xmm7, %xmm6, %xmm10 +; AVX1-NEXT: vaddps %xmm10, %xmm9, %xmm9 +; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmulss %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vmulss %xmm7, %xmm8, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3] +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 +; AVX1-NEXT: vmulps %xmm4, %xmm0, %xmm7 +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 +; AVX1-NEXT: vmulps %xmm3, %xmm9, %xmm10 +; AVX1-NEXT: vaddps %xmm7, %xmm10, %xmm7 +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 +; AVX1-NEXT: vmulps %xmm6, %xmm10, %xmm11 +; AVX1-NEXT: vaddps %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vmulss %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vmulss %xmm5, %xmm9, %xmm9 +; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vmulss %xmm10, %xmm8, %xmm9 +; AVX1-NEXT: vaddss %xmm4, %xmm9, %xmm4 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 +; AVX1-NEXT: vmulps %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 +; AVX1-NEXT: vmulps %xmm3, %xmm10, %xmm3 +; AVX1-NEXT: vaddps %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 +; AVX1-NEXT: vmulps %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vaddps %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vmulss %xmm2, %xmm9, %xmm2 +; AVX1-NEXT: vmulss %xmm5, %xmm10, %xmm5 +; AVX1-NEXT: vaddss %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[1,2,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX1-NEXT: vmovss %xmm2, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm0, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul3x3_f32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm1 +; AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm9 +; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 +; AVX2-NEXT: vmulps %xmm4, %xmm3, %xmm10 +; AVX2-NEXT: vaddps %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3] +; AVX2-NEXT: vbroadcastss 
{{[0-9]+}}(%rsp), %xmm7 +; AVX2-NEXT: vmulps %xmm7, %xmm6, %xmm10 +; AVX2-NEXT: vaddps %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vmulss %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vmulss %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vmulss %xmm7, %xmm8, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm9[0,1],xmm1[0],xmm9[3] +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm4 +; AVX2-NEXT: vmulps %xmm4, %xmm0, %xmm7 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 +; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm10 +; AVX2-NEXT: vaddps %xmm7, %xmm10, %xmm7 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm10 +; AVX2-NEXT: vmulps %xmm6, %xmm10, %xmm11 +; AVX2-NEXT: vaddps %xmm7, %xmm11, %xmm7 +; AVX2-NEXT: vmulss %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm9 +; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vmulss %xmm10, %xmm8, %xmm9 +; AVX2-NEXT: vaddss %xmm4, %xmm9, %xmm4 +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm7[0,1],xmm4[0],xmm7[3] +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm7 +; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm9 +; AVX2-NEXT: vmulps %xmm3, %xmm9, %xmm3 +; AVX2-NEXT: vaddps %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm3 +; AVX2-NEXT: vmulps %xmm3, %xmm6, %xmm6 +; AVX2-NEXT: vaddps %xmm6, %xmm0, %xmm0 +; AVX2-NEXT: vmulss %xmm7, %xmm2, %xmm2 +; AVX2-NEXT: vmulss %xmm5, %xmm9, %xmm5 +; AVX2-NEXT: vaddss %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vmulss %xmm3, %xmm8, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,1,2,4,5,6,u,u> +; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovss %xmm2, 32(%rdi) +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mul3x3_f32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] +; AVX512F-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX512F-NEXT: vmulps %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3] +; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 +; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[3,3,3,3] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[2,2,2,2] +; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9 +; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9 +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3] +; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6 +; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10 +; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8 +; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3] +; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8 +; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2] +; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11 +; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3] +; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12 +; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8 +; AVX512F-NEXT: vmulss 
%xmm7, %xmm4, %xmm7 +; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12 +; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7 +; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 +; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm9[3,3,3,3] +; AVX512F-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] +; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 +; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 +; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1 +; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2 +; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2 +; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 +; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_mul3x3_f32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm3 +; AVX512VL-NEXT: vmulps %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 +; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[2,2,2,2] +; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 +; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm0[1,0] +; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3] +; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5 +; AVX512VL-NEXT: vaddss %xmm5, %xmm10, %xmm5 +; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm10 +; AVX512VL-NEXT: vmulss %xmm8, %xmm10, %xmm8 +; AVX512VL-NEXT: vaddss %xmm5, %xmm8, %xmm5 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] +; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm5 +; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX512VL-NEXT: vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2] +; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm11 +; AVX512VL-NEXT: vaddps %xmm5, %xmm11, %xmm5 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3] +; AVX512VL-NEXT: vmulps %xmm6, %xmm11, %xmm12 +; AVX512VL-NEXT: vaddps %xmm5, %xmm12, %xmm5 +; AVX512VL-NEXT: vmulss %xmm7, %xmm9, %xmm7 +; AVX512VL-NEXT: vmulss %xmm4, %xmm8, %xmm12 +; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7 +; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 +; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm8[1,0] +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[2,2,2,2] +; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 +; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 +; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1 +; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2 
+; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2 +; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2 +; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4 +; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2 +; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1 +; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 +; AVX512VL-NEXT: retq +entry: + %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> + %splat.splat = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> zeroinitializer + %0 = fmul <2 x float> %block, %splat.splat + %block6 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> + %splat.splat8 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %1 = fmul <2 x float> %block6, %splat.splat8 + %2 = fadd <2 x float> %0, %1 + %block9 = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> + %splat.splat11 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %3 = fmul <2 x float> %block9, %splat.splat11 + %4 = fadd <2 x float> %2, %3 + %5 = shufflevector <2 x float> %4, <2 x float> poison, <3 x i32> + %block12 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> + %splat.splatinsert13 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> zeroinitializer + %6 = fmul <1 x float> %block12, %splat.splatinsert13 + %block15 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> + %splat.splatinsert16 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %7 = fmul <1 x float> %block15, %splat.splatinsert16 + %8 = fadd <1 x float> %6, %7 + %block18 = shufflevector <9 x float> %a0, <9 x float> poison, <1 x i32> + %splat.splatinsert19 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %9 = fmul <1 x float> %block18, %splat.splatinsert19 + %10 = fadd <1 x float> %8, %9 + %11 = shufflevector <1 x float> %10, <1 x float> poison, <3 x i32> + %12 = shufflevector <3 x float> %5, <3 x float> %11, <3 x i32> + %splat.splat23 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %13 = fmul <2 x float> %block, %splat.splat23 + %splat.splat26 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %14 = fmul <2 x float> %block6, %splat.splat26 + %15 = fadd <2 x float> %13, %14 + %splat.splat29 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %16 = fmul <2 x float> %block9, %splat.splat29 + %17 = fadd <2 x float> %15, %16 + %18 = shufflevector <2 x float> %17, <2 x float> poison, <3 x i32> + %splat.splatinsert31 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %19 = fmul <1 x float> %block12, %splat.splatinsert31 + %splat.splatinsert34 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %20 = fmul <1 x float> %block15, %splat.splatinsert34 + %21 = fadd <1 x float> %19, %20 + %splat.splatinsert37 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %22 = fmul <1 x float> %block18, %splat.splatinsert37 + %23 = fadd <1 x float> %21, %22 + %24 = shufflevector <1 x float> %23, <1 x float> poison, <3 x i32> + %25 = shufflevector <3 x float> %18, <3 x float> %24, <3 x i32> + %splat.splat41 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %26 = fmul <2 x float> %block, %splat.splat41 + %splat.splat44 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %27 = 
fmul <2 x float> %block6, %splat.splat44 + %28 = fadd <2 x float> %26, %27 + %splat.splat47 = shufflevector <9 x float> %a1, <9 x float> undef, <2 x i32> + %29 = fmul <2 x float> %block9, %splat.splat47 + %30 = fadd <2 x float> %28, %29 + %31 = shufflevector <2 x float> %30, <2 x float> poison, <3 x i32> + %splat.splatinsert49 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %32 = fmul <1 x float> %block12, %splat.splatinsert49 + %splat.splatinsert52 = shufflevector <9 x float> %a1, <9 x float> undef, <1 x i32> + %33 = fmul <1 x float> %block15, %splat.splatinsert52 + %34 = fadd <1 x float> %32, %33 + %35 = fmul <9 x float> %a0, %a1 + %36 = shufflevector <9 x float> %35, <9 x float> poison, <1 x i32> + %37 = fadd <1 x float> %34, %36 + %38 = shufflevector <1 x float> %37, <1 x float> poison, <3 x i32> + %39 = shufflevector <3 x float> %31, <3 x float> %38, <3 x i32> + %40 = shufflevector <3 x float> %12, <3 x float> %25, <6 x i32> + %41 = shufflevector <3 x float> %39, <3 x float> poison, <6 x i32> + %42 = shufflevector <6 x float> %40, <6 x float> %41, <9 x i32> + ret <9 x float> %42 +} + +define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwind { +; SSE-LABEL: test_mul3x3_f64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movsd {{.*#+}} xmm8 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm10 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm9 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm11 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm12 = mem[0],zero +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd %xmm2, %xmm13 +; SSE-NEXT: mulsd %xmm12, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm12 = xmm12[0,0] +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE-NEXT: movapd %xmm5, %xmm1 +; SSE-NEXT: mulsd %xmm11, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0,0] +; SSE-NEXT: mulpd %xmm3, %xmm11 +; SSE-NEXT: addpd %xmm12, %xmm11 +; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; SSE-NEXT: movapd %xmm9, %xmm7 +; SSE-NEXT: mulsd %xmm4, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0,0] +; SSE-NEXT: mulpd %xmm6, %xmm4 +; SSE-NEXT: addpd %xmm11, %xmm4 +; SSE-NEXT: movsd {{.*#+}} xmm11 = mem[0],zero +; SSE-NEXT: addsd %xmm13, %xmm1 +; SSE-NEXT: addsd %xmm7, %xmm1 +; SSE-NEXT: movapd %xmm2, %xmm12 +; SSE-NEXT: mulsd %xmm11, %xmm12 +; SSE-NEXT: unpcklpd {{.*#+}} xmm11 = xmm11[0,0] +; SSE-NEXT: mulpd %xmm0, %xmm11 +; SSE-NEXT: movapd %xmm5, %xmm7 +; SSE-NEXT: mulsd %xmm10, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0,0] +; SSE-NEXT: mulpd %xmm3, %xmm10 +; SSE-NEXT: addpd %xmm11, %xmm10 +; SSE-NEXT: movapd %xmm9, %xmm11 +; SSE-NEXT: mulsd %xmm8, %xmm11 +; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0,0] +; SSE-NEXT: mulpd %xmm6, %xmm8 +; SSE-NEXT: addpd %xmm10, %xmm8 +; SSE-NEXT: addsd %xmm12, %xmm7 +; SSE-NEXT: addsd %xmm11, %xmm7 +; SSE-NEXT: movsd {{.*#+}} xmm10 = mem[0],zero +; SSE-NEXT: mulsd %xmm10, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm10 = xmm10[0,0] +; SSE-NEXT: mulpd %xmm0, %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: mulsd %xmm0, %xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE-NEXT: mulpd %xmm3, %xmm0 +; SSE-NEXT: addpd %xmm10, %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: mulsd %xmm3, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: addsd %xmm2, %xmm5 +; SSE-NEXT: addsd %xmm9, 
%xmm5 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm8[0] +; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm7[0] +; SSE-NEXT: movsd %xmm5, 64(%rdi) +; SSE-NEXT: movapd %xmm3, 48(%rdi) +; SSE-NEXT: movapd %xmm4, (%rdi) +; SSE-NEXT: movapd %xmm8, 32(%rdi) +; SSE-NEXT: movapd %xmm1, 16(%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul3x3_f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX1-NEXT: vmulpd %xmm8, %xmm9, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm3[0],xmm4[0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX1-NEXT: vmulpd %xmm12, %xmm10, %xmm3 +; AVX1-NEXT: vaddpd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm6[0],xmm7[0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX1-NEXT: vmulpd %xmm7, %xmm6, %xmm4 +; AVX1-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmulsd %xmm2, %xmm8, %xmm4 +; AVX1-NEXT: vmulsd %xmm5, %xmm12, %xmm6 +; AVX1-NEXT: vaddsd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vmulsd %xmm7, %xmm11, %xmm6 +; AVX1-NEXT: vaddsd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm0 +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-NEXT: vmulsd %xmm4, %xmm2, %xmm13 +; AVX1-NEXT: vmulsd %xmm6, %xmm5, %xmm14 +; AVX1-NEXT: vaddsd %xmm14, %xmm13, %xmm13 +; AVX1-NEXT: vmulsd %xmm3, %xmm11, %xmm14 +; AVX1-NEXT: vaddsd %xmm14, %xmm13, %xmm0 +; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] +; AVX1-NEXT: vmulpd %xmm13, %xmm9, %xmm14 +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX1-NEXT: vmulpd %xmm0, %xmm10, %xmm11 +; AVX1-NEXT: vaddpd %xmm11, %xmm14, %xmm11 +; AVX1-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] +; AVX1-NEXT: vmulpd %xmm1, %xmm14, %xmm15 +; AVX1-NEXT: vaddpd %xmm15, %xmm11, %xmm11 +; AVX1-NEXT: vmulsd %xmm2, %xmm13, %xmm2 +; AVX1-NEXT: vmulsd %xmm0, %xmm5, %xmm5 +; AVX1-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vmulsd {{[0-9]+}}(%rsp), %xmm14, %xmm5 +; AVX1-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm5 +; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm5 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX1-NEXT: vmulpd %ymm8, %ymm9, %ymm8 +; AVX1-NEXT: vaddpd %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm7, %ymm1, %ymm7 +; AVX1-NEXT: vaddpd %ymm7, %ymm5, %ymm5 +; AVX1-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-NEXT: vshufpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[2] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm9, %ymm4 +; AVX1-NEXT: vaddpd %ymm0, %ymm4, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[2],ymm7[3] +; AVX1-NEXT: vmovsd %xmm2, 64(%rdi) +; 
AVX1-NEXT: vmovapd %ymm0, 32(%rdi) +; AVX1-NEXT: vmovapd %ymm5, (%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul3x3_f64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm0[0],xmm1[0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm8, %xmm9, %xmm1 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm3[0],xmm4[0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm12 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm12, %xmm10, %xmm3 +; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm6[0],xmm7[0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm4 +; AVX2-NEXT: vaddpd %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vmulsd %xmm2, %xmm8, %xmm4 +; AVX2-NEXT: vmulsd %xmm5, %xmm12, %xmm6 +; AVX2-NEXT: vaddsd %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vmulsd %xmm7, %xmm11, %xmm6 +; AVX2-NEXT: vaddsd %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm0 +; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX2-NEXT: vmulsd %xmm4, %xmm2, %xmm13 +; AVX2-NEXT: vmulsd %xmm6, %xmm5, %xmm14 +; AVX2-NEXT: vaddsd %xmm14, %xmm13, %xmm13 +; AVX2-NEXT: vmulsd %xmm3, %xmm11, %xmm14 +; AVX2-NEXT: vaddsd %xmm14, %xmm13, %xmm0 +; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovddup {{.*#+}} xmm13 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm13, %xmm9, %xmm14 +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm0, %xmm10, %xmm11 +; AVX2-NEXT: vaddpd %xmm11, %xmm14, %xmm11 +; AVX2-NEXT: vmovddup {{.*#+}} xmm14 = mem[0,0] +; AVX2-NEXT: vmulpd %xmm1, %xmm14, %xmm15 +; AVX2-NEXT: vaddpd %xmm15, %xmm11, %xmm11 +; AVX2-NEXT: vmulsd %xmm2, %xmm13, %xmm2 +; AVX2-NEXT: vmulsd %xmm0, %xmm5, %xmm5 +; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vmulsd {{[0-9]+}}(%rsp), %xmm14, %xmm5 +; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm8 +; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX2-NEXT: vmulpd %ymm8, %ymm9, %ymm8 +; AVX2-NEXT: vaddpd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm7 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd %ymm7, %ymm1, %ymm7 +; AVX2-NEXT: vaddpd %ymm7, %ymm5, %ymm5 +; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vshufpd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[2] +; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7 +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm9, %ymm4 +; AVX2-NEXT: vaddpd %ymm0, %ymm4, %ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm7[0],ymm0[2],ymm7[3] +; AVX2-NEXT: vmovsd %xmm2, 64(%rdi) +; AVX2-NEXT: vmovapd %ymm0, 32(%rdi) +; AVX2-NEXT: vmovapd %ymm5, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mul3x3_f64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: 
movq %rdi, %rax +; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm0, %xmm9, %xmm10 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm3, %xmm1, %xmm4 +; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm7, %xmm6, %xmm10 +; AVX512F-NEXT: vaddpd %xmm4, %xmm10, %xmm4 +; AVX512F-NEXT: vmulsd %xmm2, %xmm9, %xmm9 +; AVX512F-NEXT: vmulsd %xmm3, %xmm5, %xmm3 +; AVX512F-NEXT: vaddsd %xmm3, %xmm9, %xmm3 +; AVX512F-NEXT: vmulsd %xmm7, %xmm8, %xmm7 +; AVX512F-NEXT: vaddsd %xmm7, %xmm3, %xmm3 +; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm4, %xmm0, %xmm7 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm10 +; AVX512F-NEXT: vaddpd %xmm7, %xmm10, %xmm7 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX512F-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX512F-NEXT: vmulsd %xmm4, %xmm2, %xmm4 +; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX512F-NEXT: vmulsd %xmm10, %xmm8, %xmm9 +; AVX512F-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm7, %xmm0, %xmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm1, %xmm9, %xmm1 +; AVX512F-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX512F-NEXT: vmulpd %xmm1, %xmm6, %xmm6 +; AVX512F-NEXT: vaddpd %xmm6, %xmm0, %xmm0 +; AVX512F-NEXT: vmulsd %xmm7, %xmm2, %xmm2 +; AVX512F-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512F-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX512F-NEXT: vmulsd %xmm1, %xmm8, %xmm1 +; AVX512F-NEXT: vaddsd %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm3, %zmm2 +; AVX512F-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] +; AVX512F-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3 +; AVX512F-NEXT: vmovsd %xmm1, 64(%rdi) +; AVX512F-NEXT: vmovapd %zmm3, (%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_mul3x3_f64: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: movq %rdi, %rax +; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm1, %xmm0, %xmm9 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm4, %xmm3, %xmm10 +; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0] +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm7, %xmm6, %xmm10 +; AVX512VL-NEXT: vaddpd %xmm10, %xmm9, %xmm9 +; AVX512VL-NEXT: vmulsd %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vmulsd %xmm4, %xmm5, %xmm4 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vmulsd %xmm7, %xmm8, %xmm4 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm1, %xmm1 +; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm4, %xmm0, %xmm7 +; AVX512VL-NEXT: vmovddup 
{{.*#+}} xmm9 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm10 +; AVX512VL-NEXT: vaddpd %xmm7, %xmm10, %xmm7 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm6, %xmm10, %xmm11 +; AVX512VL-NEXT: vaddpd %xmm7, %xmm11, %xmm7 +; AVX512VL-NEXT: vmulsd %xmm4, %xmm2, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm9 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX512VL-NEXT: vmulsd %xmm10, %xmm8, %xmm9 +; AVX512VL-NEXT: vaddsd %xmm4, %xmm9, %xmm4 +; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm7, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm3, %xmm9, %xmm3 +; AVX512VL-NEXT: vaddpd %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX512VL-NEXT: vmulpd %xmm3, %xmm6, %xmm6 +; AVX512VL-NEXT: vaddpd %xmm6, %xmm0, %xmm0 +; AVX512VL-NEXT: vmulsd %xmm7, %xmm2, %xmm2 +; AVX512VL-NEXT: vmulsd %xmm5, %xmm9, %xmm5 +; AVX512VL-NEXT: vaddsd %xmm5, %xmm2, %xmm2 +; AVX512VL-NEXT: vmulsd %xmm3, %xmm8, %xmm3 +; AVX512VL-NEXT: vaddsd %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1 +; AVX512VL-NEXT: vmovapd {{.*#+}} zmm3 = [0,1,2,4,5,6,8,9] +; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm1, %zmm3 +; AVX512VL-NEXT: vmovsd %xmm2, 64(%rdi) +; AVX512VL-NEXT: vmovapd %zmm3, (%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +entry: + %block = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> + %splat.splat = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> zeroinitializer + %0 = fmul <2 x double> %block, %splat.splat + %block6 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> + %splat.splat8 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %1 = fmul <2 x double> %block6, %splat.splat8 + %2 = fadd <2 x double> %0, %1 + %block9 = shufflevector <9 x double> %a0, <9 x double> poison, <2 x i32> + %splat.splat11 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %3 = fmul <2 x double> %block9, %splat.splat11 + %4 = fadd <2 x double> %2, %3 + %5 = shufflevector <2 x double> %4, <2 x double> poison, <3 x i32> + %block12 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> + %splat.splatinsert13 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> zeroinitializer + %6 = fmul <1 x double> %block12, %splat.splatinsert13 + %block15 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> + %splat.splatinsert16 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %7 = fmul <1 x double> %block15, %splat.splatinsert16 + %8 = fadd <1 x double> %6, %7 + %block18 = shufflevector <9 x double> %a0, <9 x double> poison, <1 x i32> + %splat.splatinsert19 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %9 = fmul <1 x double> %block18, %splat.splatinsert19 + %10 = fadd <1 x double> %8, %9 + %11 = shufflevector <1 x double> %10, <1 x double> poison, <3 x i32> + %12 = shufflevector <3 x double> %5, <3 x double> %11, <3 x i32> + %splat.splat23 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %13 = fmul <2 x double> %block, %splat.splat23 + %splat.splat26 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %14 = fmul <2 x double> %block6, %splat.splat26 + %15 = fadd <2 x double> %13, %14 + %splat.splat29 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %16 = fmul <2 x double> %block9, %splat.splat29 + %17 = fadd <2 x double> %15, %16 + %18 = 
shufflevector <2 x double> %17, <2 x double> poison, <3 x i32> + %splat.splatinsert31 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %19 = fmul <1 x double> %block12, %splat.splatinsert31 + %splat.splatinsert34 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %20 = fmul <1 x double> %block15, %splat.splatinsert34 + %21 = fadd <1 x double> %19, %20 + %splat.splatinsert37 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %22 = fmul <1 x double> %block18, %splat.splatinsert37 + %23 = fadd <1 x double> %21, %22 + %24 = shufflevector <1 x double> %23, <1 x double> poison, <3 x i32> + %25 = shufflevector <3 x double> %18, <3 x double> %24, <3 x i32> + %splat.splat41 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %26 = fmul <2 x double> %block, %splat.splat41 + %splat.splat44 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %27 = fmul <2 x double> %block6, %splat.splat44 + %28 = fadd <2 x double> %26, %27 + %splat.splat47 = shufflevector <9 x double> %a1, <9 x double> undef, <2 x i32> + %29 = fmul <2 x double> %block9, %splat.splat47 + %30 = fadd <2 x double> %28, %29 + %31 = shufflevector <2 x double> %30, <2 x double> poison, <3 x i32> + %splat.splatinsert49 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %32 = fmul <1 x double> %block12, %splat.splatinsert49 + %splat.splatinsert52 = shufflevector <9 x double> %a1, <9 x double> undef, <1 x i32> + %33 = fmul <1 x double> %block15, %splat.splatinsert52 + %34 = fadd <1 x double> %32, %33 + %35 = fmul <9 x double> %a0, %a1 + %36 = shufflevector <9 x double> %35, <9 x double> poison, <1 x i32> + %37 = fadd <1 x double> %34, %36 + %38 = shufflevector <1 x double> %37, <1 x double> poison, <3 x i32> + %39 = shufflevector <3 x double> %31, <3 x double> %38, <3 x i32> + %40 = shufflevector <3 x double> %12, <3 x double> %25, <6 x i32> + %41 = shufflevector <3 x double> %39, <3 x double> poison, <6 x i32> + %42 = shufflevector <6 x double> %40, <6 x double> %41, <9 x i32> + ret <9 x double> %42 +} + +define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwind { +; SSE-LABEL: test_mul4x4_f32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm4[0,0] +; SSE-NEXT: mulps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm4[1,1] +; SSE-NEXT: mulps %xmm1, %xmm8 +; SSE-NEXT: addps %xmm0, %xmm8 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2] +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: addps %xmm8, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: mulps %xmm3, %xmm4 +; SSE-NEXT: addps %xmm4, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0] +; SSE-NEXT: mulps %xmm9, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm5[1,1] +; SSE-NEXT: mulps %xmm1, %xmm10 +; SSE-NEXT: addps %xmm4, %xmm10 +; SSE-NEXT: movaps %xmm5, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,2],xmm5[2,2] +; SSE-NEXT: mulps %xmm2, %xmm8 +; SSE-NEXT: addps %xmm10, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: mulps %xmm3, %xmm5 +; SSE-NEXT: addps %xmm5, %xmm8 +; SSE-NEXT: movaps %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm6[0,0] +; SSE-NEXT: mulps %xmm9, %xmm4 +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,1],xmm6[1,1] +; 
SSE-NEXT: mulps %xmm1, %xmm10 +; SSE-NEXT: addps %xmm4, %xmm10 +; SSE-NEXT: movaps %xmm6, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm6[2,2] +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: addps %xmm10, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; SSE-NEXT: mulps %xmm3, %xmm6 +; SSE-NEXT: addps %xmm6, %xmm5 +; SSE-NEXT: movaps %xmm7, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm7[0,0] +; SSE-NEXT: mulps %xmm9, %xmm4 +; SSE-NEXT: movaps %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm7[1,1] +; SSE-NEXT: mulps %xmm1, %xmm6 +; SSE-NEXT: addps %xmm4, %xmm6 +; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[2,2] +; SSE-NEXT: mulps %xmm2, %xmm1 +; SSE-NEXT: addps %xmm6, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,3,3,3] +; SSE-NEXT: mulps %xmm7, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm8, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul4x4_f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3] +; AVX1-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-NEXT: vmulps %ymm6, %ymm7, %ymm0 +; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm6 +; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm2 +; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vmulps %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vmulps %ymm4, %ymm7, %ymm4 +; AVX1-NEXT: vaddps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vaddps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul4x4_f32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3] +; AVX2-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1] +; AVX2-NEXT: vmulps %ymm6, %ymm7, %ymm0 +; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1] +; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmulps %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmulps %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = 
ymm3[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_mul4x4_f32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6 +; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 +; AVX512-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vinsertf64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3,2,3,2,3] +; AVX512-NEXT: vmulps %zmm5, %zmm6, %zmm5 +; AVX512-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vinsertf64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm6 = zmm0[0,1,0,1,0,1,0,1] +; AVX512-NEXT: vmulps %zmm4, %zmm6, %zmm4 +; AVX512-NEXT: vaddps %zmm5, %zmm4, %zmm4 +; AVX512-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vinsertf64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm5 = zmm0[4,5,4,5,4,5,4,5] +; AVX512-NEXT: vmulps %zmm3, %zmm5, %zmm3 +; AVX512-NEXT: vaddps %zmm3, %zmm4, %zmm3 +; AVX512-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7] +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vaddps %zmm0, %zmm3, %zmm0 +; AVX512-NEXT: retq +entry: + %split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> + %split1 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> + %split2 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> + %split3 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> + %splat.splat = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> zeroinitializer + %0 = fmul <4 x float> %split, %splat.splat + %splat.splat10 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %1 = fmul <4 x float> %split1, %splat.splat10 + %2 = fadd <4 x float> %0, %1 + %splat.splat13 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %3 = fmul <4 x float> %split2, %splat.splat13 + %4 = fadd <4 x float> %2, %3 + %splat.splat16 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %5 = fmul <4 x float> %split3, %splat.splat16 + %6 = fadd <4 x float> %4, %5 + %splat.splat19 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %7 = fmul <4 x float> %split, %splat.splat19 + %splat.splat22 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %8 = fmul <4 x float> %split1, %splat.splat22 + %9 = fadd <4 x float> %7, %8 + %splat.splat25 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %10 = fmul <4 x float> %split2, %splat.splat25 + %11 = fadd <4 x float> %9, %10 + %splat.splat28 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %12 = fmul <4 x float> %split3, %splat.splat28 + %13 = fadd <4 x float> %11, %12 + %splat.splat31 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %14 = fmul <4 x float> %split, %splat.splat31 + %splat.splat34 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %15 = fmul <4 x float> %split1, %splat.splat34 + %16 = fadd <4 x float> %14, %15 + %splat.splat37 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x 
i32> + %17 = fmul <4 x float> %split2, %splat.splat37 + %18 = fadd <4 x float> %16, %17 + %splat.splat40 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %19 = fmul <4 x float> %split3, %splat.splat40 + %20 = fadd <4 x float> %18, %19 + %splat.splat43 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %21 = fmul <4 x float> %split, %splat.splat43 + %splat.splat46 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %22 = fmul <4 x float> %split1, %splat.splat46 + %23 = fadd <4 x float> %21, %22 + %splat.splat49 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %24 = fmul <4 x float> %split2, %splat.splat49 + %25 = fadd <4 x float> %23, %24 + %splat.splat52 = shufflevector <16 x float> %a1, <16 x float> undef, <4 x i32> + %26 = fmul <4 x float> %split3, %splat.splat52 + %27 = fadd <4 x float> %25, %26 + %28 = shufflevector <4 x float> %6, <4 x float> %13, <8 x i32> + %29 = shufflevector <4 x float> %20, <4 x float> %27, <8 x i32> + %30 = shufflevector <8 x float> %28, <8 x float> %29, <16 x i32> + ret <16 x float> %30 +} + +define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) nounwind { +; SSE-LABEL: test_mul4x4_f64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm5, %xmm6 +; SSE-NEXT: movapd %xmm4, %xmm5 +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: movapd %xmm10, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm10[0] +; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: mulpd %xmm13, %xmm14 +; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1,1] +; SSE-NEXT: movapd %xmm3, %xmm15 +; SSE-NEXT: mulpd %xmm10, %xmm15 +; SSE-NEXT: addpd %xmm14, %xmm15 +; SSE-NEXT: mulpd %xmm2, %xmm10 +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: addpd %xmm13, %xmm10 +; SSE-NEXT: movapd %xmm8, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm8[0] +; SSE-NEXT: movapd %xmm4, %xmm14 +; SSE-NEXT: mulpd %xmm13, %xmm14 +; SSE-NEXT: addpd %xmm10, %xmm14 +; SSE-NEXT: movapd %xmm6, %xmm4 +; SSE-NEXT: mulpd %xmm6, %xmm13 +; SSE-NEXT: addpd %xmm15, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] +; SSE-NEXT: movapd %xmm7, %xmm10 +; SSE-NEXT: mulpd %xmm8, %xmm10 +; SSE-NEXT: addpd %xmm13, %xmm10 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulpd %xmm6, %xmm8 +; SSE-NEXT: addpd %xmm14, %xmm8 +; SSE-NEXT: movapd %xmm12, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0] +; SSE-NEXT: movapd %xmm1, %xmm14 +; SSE-NEXT: mulpd %xmm13, %xmm14 +; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] +; SSE-NEXT: movapd %xmm3, %xmm15 +; SSE-NEXT: mulpd %xmm12, %xmm15 +; SSE-NEXT: addpd %xmm14, %xmm15 +; SSE-NEXT: mulpd %xmm2, %xmm12 +; SSE-NEXT: addpd %xmm13, %xmm12 +; SSE-NEXT: movapd %xmm9, %xmm13 +; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm9[0] +; SSE-NEXT: movapd %xmm5, %xmm14 +; SSE-NEXT: mulpd %xmm13, %xmm14 +; SSE-NEXT: addpd %xmm12, %xmm14 +; SSE-NEXT: mulpd %xmm4, %xmm13 +; SSE-NEXT: movapd %xmm4, %xmm2 +; SSE-NEXT: addpd %xmm15, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] +; SSE-NEXT: movapd %xmm7, %xmm12 +; SSE-NEXT: mulpd %xmm9, %xmm12 +; SSE-NEXT: addpd %xmm13, %xmm12 +; SSE-NEXT: 
mulpd %xmm6, %xmm9 +; SSE-NEXT: addpd %xmm14, %xmm9 +; SSE-NEXT: movapd %xmm11, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm11[0] +; SSE-NEXT: movapd %xmm1, %xmm13 +; SSE-NEXT: mulpd %xmm14, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] +; SSE-NEXT: movapd %xmm3, %xmm15 +; SSE-NEXT: mulpd %xmm11, %xmm15 +; SSE-NEXT: addpd %xmm13, %xmm15 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm11 +; SSE-NEXT: addpd %xmm14, %xmm11 +; SSE-NEXT: movapd %xmm13, %xmm14 +; SSE-NEXT: unpcklpd {{.*#+}} xmm14 = xmm14[0],xmm13[0] +; SSE-NEXT: movapd %xmm5, %xmm4 +; SSE-NEXT: mulpd %xmm14, %xmm4 +; SSE-NEXT: addpd %xmm11, %xmm4 +; SSE-NEXT: mulpd %xmm2, %xmm14 +; SSE-NEXT: addpd %xmm15, %xmm14 +; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] +; SSE-NEXT: movapd %xmm7, %xmm11 +; SSE-NEXT: mulpd %xmm13, %xmm11 +; SSE-NEXT: addpd %xmm14, %xmm11 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: mulpd %xmm15, %xmm13 +; SSE-NEXT: addpd %xmm4, %xmm13 +; SSE-NEXT: movapd %xmm14, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm14[0] +; SSE-NEXT: mulpd %xmm4, %xmm1 +; SSE-NEXT: mulpd %xmm6, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1,1] +; SSE-NEXT: mulpd %xmm14, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm4, %xmm14 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm14, %xmm5 +; SSE-NEXT: mulpd %xmm2, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: mulpd %xmm0, %xmm7 +; SSE-NEXT: addpd %xmm1, %xmm7 +; SSE-NEXT: mulpd %xmm15, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: movapd %xmm7, 112(%rdi) +; SSE-NEXT: movapd %xmm0, 96(%rdi) +; SSE-NEXT: movapd %xmm11, 80(%rdi) +; SSE-NEXT: movapd %xmm13, 64(%rdi) +; SSE-NEXT: movapd %xmm12, 48(%rdi) +; SSE-NEXT: movapd %xmm9, 32(%rdi) +; SSE-NEXT: movapd %xmm10, 16(%rdi) +; SSE-NEXT: movapd %xmm8, (%rdi) +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul4x4_f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm4[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm9 = xmm4[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 +; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm4[0,0,2,2] +; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 +; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm4[1,1,3,3] +; AVX1-NEXT: vmulpd %ymm4, %ymm3, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm8, %ymm4 +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm5[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm9 = xmm5[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 +; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm5[0,0,2,2] +; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 +; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = 
ymm5[1,1,3,3] +; AVX1-NEXT: vmulpd %ymm5, %ymm3, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm8, %ymm5 +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm6[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm9 = xmm6[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 +; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm9 +; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] +; AVX1-NEXT: vmovddup {{.*#+}} ymm9 = ymm6[0,0,2,2] +; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm9 +; AVX1-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm6 = ymm6[1,1,3,3] +; AVX1-NEXT: vmulpd %ymm6, %ymm3, %ymm6 +; AVX1-NEXT: vaddpd %ymm6, %ymm8, %ymm6 +; AVX1-NEXT: vmovddup {{.*#+}} xmm8 = xmm7[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-NEXT: vmulpd %ymm0, %ymm8, %ymm0 +; AVX1-NEXT: vpermilpd {{.*#+}} xmm8 = xmm7[1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 +; AVX1-NEXT: vmulpd %ymm1, %ymm8, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3,2,3] +; AVX1-NEXT: vmovddup {{.*#+}} ymm7 = ymm1[0,0,2,2] +; AVX1-NEXT: vmulpd %ymm7, %ymm2, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,3,3] +; AVX1-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm3 +; AVX1-NEXT: vmovapd %ymm4, %ymm0 +; AVX1-NEXT: vmovapd %ymm5, %ymm1 +; AVX1-NEXT: vmovapd %ymm6, %ymm2 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul4x4_f64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vbroadcastsd %xmm4, %ymm8 +; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[1,1,1,1] +; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm9 +; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm4[2,2,2,2] +; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm9 +; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] +; AVX2-NEXT: vmulpd %ymm4, %ymm3, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vbroadcastsd %xmm5, %ymm8 +; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[1,1,1,1] +; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm9 +; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm5[2,2,2,2] +; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm9 +; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-NEXT: vmulpd %ymm5, %ymm3, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vbroadcastsd %xmm6, %ymm8 +; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,1,1] +; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm9 +; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[2,2,2,2] +; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm9 +; AVX2-NEXT: vaddpd %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-NEXT: vmulpd %ymm6, %ymm3, %ymm6 +; AVX2-NEXT: vaddpd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vbroadcastsd %xmm7, %ymm8 +; AVX2-NEXT: vmulpd %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm8 = ymm7[1,1,1,1] +; AVX2-NEXT: vmulpd %ymm1, %ymm8, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,2,2,2] +; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[3,3,3,3] +; AVX2-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vmovapd %ymm4, %ymm0 +; 
AVX2-NEXT: vmovapd %ymm5, %ymm1 +; AVX2-NEXT: vmovapd %ymm6, %ymm2 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_mul4x4_f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vbroadcastsd %xmm2, %ymm4 +; AVX512-NEXT: vextractf64x4 $1, %zmm2, %ymm5 +; AVX512-NEXT: vbroadcastsd %xmm5, %ymm5 +; AVX512-NEXT: vbroadcastsd %xmm3, %ymm6 +; AVX512-NEXT: vextractf64x4 $1, %zmm3, %ymm7 +; AVX512-NEXT: vbroadcastsd %xmm7, %ymm7 +; AVX512-NEXT: vinsertf64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm5 +; AVX512-NEXT: vmulpd %zmm4, %zmm5, %zmm4 +; AVX512-NEXT: vpermpd {{.*#+}} zmm8 = zmm2[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm9 = zmm0[4,5,6,7,4,5,6,7] +; AVX512-NEXT: vmulpd %zmm8, %zmm9, %zmm0 +; AVX512-NEXT: vaddpd %zmm0, %zmm4, %zmm0 +; AVX512-NEXT: vpermpd {{.*#+}} zmm4 = zmm2[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm8 +; AVX512-NEXT: vmulpd %zmm4, %zmm8, %zmm4 +; AVX512-NEXT: vaddpd %zmm4, %zmm0, %zmm0 +; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm2 +; AVX512-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vinsertf64x4 $1, %ymm7, %zmm6, %zmm2 +; AVX512-NEXT: vmulpd %zmm2, %zmm5, %zmm2 +; AVX512-NEXT: vpermpd {{.*#+}} zmm4 = zmm3[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vmulpd %zmm4, %zmm9, %zmm4 +; AVX512-NEXT: vaddpd %zmm4, %zmm2, %zmm2 +; AVX512-NEXT: vpermpd {{.*#+}} zmm4 = zmm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vmulpd %zmm4, %zmm8, %zmm4 +; AVX512-NEXT: vaddpd %zmm4, %zmm2, %zmm2 +; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm3[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vmulpd %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: retq +entry: + %split = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> + %split1 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> + %split2 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> + %split3 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> + %splat.splat = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> zeroinitializer + %0 = fmul <4 x double> %split, %splat.splat + %splat.splat10 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %1 = fmul <4 x double> %split1, %splat.splat10 + %2 = fadd <4 x double> %0, %1 + %splat.splat13 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %3 = fmul <4 x double> %split2, %splat.splat13 + %4 = fadd <4 x double> %2, %3 + %splat.splat16 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %5 = fmul <4 x double> %split3, %splat.splat16 + %6 = fadd <4 x double> %4, %5 + %splat.splat19 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %7 = fmul <4 x double> %split, %splat.splat19 + %splat.splat22 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %8 = fmul <4 x double> %split1, %splat.splat22 + %9 = fadd <4 x double> %7, %8 + %splat.splat25 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %10 = fmul <4 x double> %split2, %splat.splat25 + %11 = fadd <4 x double> %9, %10 + %splat.splat28 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %12 = fmul <4 x double> %split3, %splat.splat28 + %13 = fadd <4 x double> %11, %12 + %splat.splat31 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %14 = fmul <4 x double> %split, %splat.splat31 + %splat.splat34 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %15 = 
fmul <4 x double> %split1, %splat.splat34 + %16 = fadd <4 x double> %14, %15 + %splat.splat37 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %17 = fmul <4 x double> %split2, %splat.splat37 + %18 = fadd <4 x double> %16, %17 + %splat.splat40 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %19 = fmul <4 x double> %split3, %splat.splat40 + %20 = fadd <4 x double> %18, %19 + %splat.splat43 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %21 = fmul <4 x double> %split, %splat.splat43 + %splat.splat46 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %22 = fmul <4 x double> %split1, %splat.splat46 + %23 = fadd <4 x double> %21, %22 + %splat.splat49 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %24 = fmul <4 x double> %split2, %splat.splat49 + %25 = fadd <4 x double> %23, %24 + %splat.splat52 = shufflevector <16 x double> %a1, <16 x double> undef, <4 x i32> + %26 = fmul <4 x double> %split3, %splat.splat52 + %27 = fadd <4 x double> %25, %26 + %28 = shufflevector <4 x double> %6, <4 x double> %13, <8 x i32> + %29 = shufflevector <4 x double> %20, <4 x double> %27, <8 x i32> + %30 = shufflevector <8 x double> %28, <8 x double> %29, <16 x i32> + ret <16 x double> %30 +} + +define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwind { +; SSE-LABEL: test_mul8x8_f32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $88, %rsp +; SSE-NEXT: movaps %xmm7, %xmm10 +; SSE-NEXT: movaps %xmm6, %xmm9 +; SSE-NEXT: movaps %xmm5, %xmm11 +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movaps %xmm14, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: mulps %xmm15, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm15 +; SSE-NEXT: movaps %xmm14, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] +; SSE-NEXT: movaps %xmm3, %xmm8 +; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: mulps %xmm0, %xmm8 +; SSE-NEXT: addps %xmm5, %xmm8 +; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: addps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2] +; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulps %xmm11, %xmm1 +; SSE-NEXT: addps %xmm8, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3] +; SSE-NEXT: movaps %xmm10, %xmm8 +; SSE-NEXT: movaps %xmm10, %xmm3 +; SSE-NEXT: mulps %xmm14, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm6, %xmm14 +; SSE-NEXT: addps %xmm2, %xmm14 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm14, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm5, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm5, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,2] +; SSE-NEXT: movaps 
{{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm5, %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulps %xmm14, %xmm5 +; SSE-NEXT: addps %xmm2, %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: mulps %xmm3, %xmm4 +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: mulps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm13 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-NEXT: movaps %xmm15, %xmm5 +; SSE-NEXT: movaps %xmm15, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: mulps %xmm11, %xmm1 +; SSE-NEXT: addps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movaps %xmm8, %xmm7 +; SSE-NEXT: movaps %xmm8, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm9, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps %xmm14, %xmm4 +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: mulps %xmm3, %xmm4 +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm8 +; SSE-NEXT: mulps %xmm13, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm5, %xmm14 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: mulps %xmm11, %xmm1 +; SSE-NEXT: addps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movaps %xmm7, %xmm3 +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm9, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm11 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm4 +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm10, %xmm9 +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm4 +; SSE-NEXT: mulps %xmm3, %xmm4 +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: mulps %xmm8, %xmm1 +; SSE-NEXT: mulps %xmm13, %xmm3 +; SSE-NEXT: movaps %xmm13, %xmm10 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-NEXT: movaps %xmm14, %xmm13 +; SSE-NEXT: movaps %xmm14, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: mulps %xmm6, %xmm1 +; SSE-NEXT: movaps %xmm6, %xmm14 +; SSE-NEXT: addps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movaps %xmm5, %xmm4 +; SSE-NEXT: movaps %xmm5, %xmm3 +; SSE-NEXT: mulps %xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps %xmm11, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm6 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm11[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps %xmm7, %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm11, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[1,1] +; SSE-NEXT: movaps %xmm15, %xmm3 +; SSE-NEXT: mulps 
%xmm0, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm11, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm0, %xmm1 +; SSE-NEXT: addps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm11, %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: addps %xmm2, %xmm11 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm5 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm7 +; SSE-NEXT: mulps %xmm3, %xmm7 +; SSE-NEXT: addps %xmm2, %xmm7 +; SSE-NEXT: mulps %xmm8, %xmm1 +; SSE-NEXT: mulps %xmm10, %xmm3 +; SSE-NEXT: addps %xmm1, %xmm3 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-NEXT: movaps %xmm13, %xmm2 +; SSE-NEXT: movaps %xmm13, %xmm10 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm3, %xmm2 +; SSE-NEXT: movaps %xmm14, %xmm9 +; SSE-NEXT: mulps %xmm14, %xmm1 +; SSE-NEXT: addps %xmm7, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movaps %xmm4, %xmm7 +; SSE-NEXT: movaps %xmm4, %xmm8 +; SSE-NEXT: mulps %xmm0, %xmm7 +; SSE-NEXT: addps %xmm1, %xmm7 +; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm4[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm7, %xmm1 +; SSE-NEXT: movaps %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[1,1] +; SSE-NEXT: mulps %xmm0, %xmm15 +; SSE-NEXT: addps %xmm1, %xmm15 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulps %xmm1, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm4[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm0, %xmm2 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm4, %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: addps %xmm2, %xmm4 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: movaps %xmm5, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm15 +; SSE-NEXT: movaps %xmm12, %xmm13 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulps %xmm14, %xmm15 +; 
SSE-NEXT: addps %xmm2, %xmm15 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulps %xmm6, %xmm14 +; SSE-NEXT: addps %xmm1, %xmm14 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm14, %xmm2 +; SSE-NEXT: mulps %xmm9, %xmm1 +; SSE-NEXT: movaps %xmm9, %xmm12 +; SSE-NEXT: addps %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movaps %xmm8, %xmm14 +; SSE-NEXT: mulps %xmm0, %xmm14 +; SSE-NEXT: addps %xmm1, %xmm14 +; SSE-NEXT: mulps %xmm3, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulps %xmm1, %xmm15 +; SSE-NEXT: addps %xmm0, %xmm15 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm14, %xmm1 +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulps %xmm0, %xmm14 +; SSE-NEXT: addps %xmm1, %xmm14 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulps %xmm1, %xmm15 +; SSE-NEXT: addps %xmm0, %xmm15 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm14, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulps %xmm3, %xmm14 +; SSE-NEXT: addps %xmm1, %xmm14 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addps %xmm15, %xmm3 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: mulps %xmm1, %xmm7 +; SSE-NEXT: movaps %xmm0, %xmm15 +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[1,1] +; SSE-NEXT: mulps %xmm15, %xmm13 +; SSE-NEXT: addps %xmm7, %xmm13 +; SSE-NEXT: mulps %xmm5, %xmm1 +; SSE-NEXT: mulps %xmm6, %xmm15 +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: addps %xmm1, %xmm15 +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-NEXT: movaps %xmm10, %xmm2 +; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: addps %xmm15, %xmm2 +; SSE-NEXT: mulps %xmm9, %xmm1 +; SSE-NEXT: addps %xmm13, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movaps %xmm8, %xmm9 +; SSE-NEXT: mulps %xmm0, %xmm9 +; SSE-NEXT: addps %xmm1, %xmm9 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulps %xmm2, %xmm15 +; SSE-NEXT: addps %xmm0, %xmm15 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addps %xmm9, %xmm2 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulps %xmm0, %xmm9 +; SSE-NEXT: addps %xmm2, %xmm9 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addps %xmm15, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2] +; 
SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulps %xmm2, %xmm13 +; SSE-NEXT: addps %xmm0, %xmm13 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addps %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulps %xmm1, %xmm15 +; SSE-NEXT: addps %xmm2, %xmm15 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addps %xmm13, %xmm1 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: mulps %xmm2, %xmm13 +; SSE-NEXT: mulps %xmm5, %xmm2 +; SSE-NEXT: movaps %xmm0, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: mulps %xmm9, %xmm5 +; SSE-NEXT: addps %xmm13, %xmm5 +; SSE-NEXT: mulps %xmm7, %xmm9 +; SSE-NEXT: addps %xmm2, %xmm9 +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm0[2,2] +; SSE-NEXT: mulps %xmm2, %xmm10 +; SSE-NEXT: addps %xmm9, %xmm10 +; SSE-NEXT: mulps %xmm12, %xmm2 +; SSE-NEXT: addps %xmm5, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: mulps %xmm0, %xmm8 +; SSE-NEXT: addps %xmm2, %xmm8 +; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: addps %xmm10, %xmm0 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm9[0,0] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: mulps %xmm2, %xmm13 +; SSE-NEXT: addps %xmm0, %xmm13 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addps %xmm8, %xmm2 +; SSE-NEXT: movaps %xmm9, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: mulps %xmm0, %xmm12 +; SSE-NEXT: addps %xmm2, %xmm12 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addps %xmm13, %xmm0 +; SSE-NEXT: movaps %xmm9, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm9[2,2] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulps %xmm2, %xmm5 +; SSE-NEXT: addps %xmm0, %xmm5 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addps %xmm12, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulps %xmm9, %xmm0 +; SSE-NEXT: addps %xmm2, %xmm0 +; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addps %xmm5, %xmm9 +; SSE-NEXT: movaps %xmm0, 240(%rdi) +; SSE-NEXT: movaps %xmm9, 224(%rdi) +; SSE-NEXT: movaps %xmm15, 208(%rdi) +; SSE-NEXT: movaps %xmm1, 192(%rdi) +; SSE-NEXT: movaps %xmm14, 176(%rdi) +; SSE-NEXT: movaps %xmm3, 160(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rdi) +; SSE-NEXT: movaps %xmm4, 128(%rdi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rdi) +; SSE-NEXT: movaps %xmm11, 96(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: addq $88, %rsp +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul8x8_f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $32, %rsp +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: vbroadcastss 16(%rbp), %ymm8 +; AVX1-NEXT: vmulps %ymm0, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 20(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm1, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 24(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm2, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 28(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm3, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 32(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm4, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 36(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm5, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 40(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm6, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 44(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm7, %ymm9, %ymm9 +; AVX1-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX1-NEXT: vbroadcastss 48(%rbp), %ymm9 +; AVX1-NEXT: vmulps %ymm0, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 52(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 56(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 60(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm3, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 64(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 68(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm5, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 72(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 76(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm7, %ymm10, %ymm10 +; AVX1-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX1-NEXT: vbroadcastss 80(%rbp), %ymm10 +; AVX1-NEXT: vmulps %ymm0, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 84(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm1, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 88(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm2, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 92(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm3, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 96(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm4, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 100(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm5, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 104(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm6, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 108(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm7, %ymm11, %ymm11 +; AVX1-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX1-NEXT: vbroadcastss 112(%rbp), %ymm11 +; AVX1-NEXT: vmulps %ymm0, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 116(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm1, %ymm12, %ymm12 +; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 120(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm2, %ymm12, %ymm12 
+; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 124(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm3, %ymm12, %ymm12 +; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 128(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm4, %ymm12, %ymm12 +; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 132(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm5, %ymm12, %ymm12 +; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 136(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm6, %ymm12, %ymm12 +; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 140(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm7, %ymm12, %ymm12 +; AVX1-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX1-NEXT: vbroadcastss 144(%rbp), %ymm12 +; AVX1-NEXT: vmulps %ymm0, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 148(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm1, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 152(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm2, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 156(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm3, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 160(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm4, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 164(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm5, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 168(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm6, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 172(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm7, %ymm13, %ymm13 +; AVX1-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX1-NEXT: vbroadcastss 176(%rbp), %ymm13 +; AVX1-NEXT: vmulps %ymm0, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 180(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm1, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 184(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm2, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 188(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm3, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 192(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm4, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 196(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm5, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 200(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm6, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 204(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm7, %ymm14, %ymm14 +; AVX1-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX1-NEXT: vbroadcastss 208(%rbp), %ymm14 +; AVX1-NEXT: vmulps %ymm0, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 212(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm1, %ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 216(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm2, %ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 220(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm3, %ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 224(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm4, %ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 228(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm5, %ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 232(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm6, 
%ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 236(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm7, %ymm15, %ymm15 +; AVX1-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX1-NEXT: vbroadcastss 240(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm0, %ymm15, %ymm0 +; AVX1-NEXT: vbroadcastss 244(%rbp), %ymm15 +; AVX1-NEXT: vmulps %ymm1, %ymm15, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 248(%rbp), %ymm1 +; AVX1-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 252(%rbp), %ymm1 +; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 256(%rbp), %ymm1 +; AVX1-NEXT: vmulps %ymm1, %ymm4, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 260(%rbp), %ymm1 +; AVX1-NEXT: vmulps %ymm1, %ymm5, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 264(%rbp), %ymm1 +; AVX1-NEXT: vmulps %ymm1, %ymm6, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastss 268(%rbp), %ymm1 +; AVX1-NEXT: vmulps %ymm1, %ymm7, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovaps %ymm0, 224(%rdi) +; AVX1-NEXT: vmovaps %ymm14, 192(%rdi) +; AVX1-NEXT: vmovaps %ymm13, 160(%rdi) +; AVX1-NEXT: vmovaps %ymm12, 128(%rdi) +; AVX1-NEXT: vmovaps %ymm11, 96(%rdi) +; AVX1-NEXT: vmovaps %ymm10, 64(%rdi) +; AVX1-NEXT: vmovaps %ymm9, 32(%rdi) +; AVX1-NEXT: vmovaps %ymm8, (%rdi) +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul8x8_f32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $32, %rsp +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: vbroadcastss 16(%rbp), %ymm8 +; AVX2-NEXT: vmulps %ymm0, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 20(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm1, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 24(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm2, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 28(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm3, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 32(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm4, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 36(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm5, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 40(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm6, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 44(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm7, %ymm9, %ymm9 +; AVX2-NEXT: vaddps %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vbroadcastss 48(%rbp), %ymm9 +; AVX2-NEXT: vmulps %ymm0, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 52(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 56(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm2, %ymm10, %ymm10 +; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 60(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm3, %ymm10, %ymm10 +; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 64(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 68(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm5, %ymm10, %ymm10 +; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 72(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm6, %ymm10, %ymm10 +; 
AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 76(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm7, %ymm10, %ymm10 +; AVX2-NEXT: vaddps %ymm10, %ymm9, %ymm9 +; AVX2-NEXT: vbroadcastss 80(%rbp), %ymm10 +; AVX2-NEXT: vmulps %ymm0, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 84(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm1, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 88(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm2, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 92(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm3, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 96(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm4, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 100(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm5, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 104(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm6, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 108(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm7, %ymm11, %ymm11 +; AVX2-NEXT: vaddps %ymm11, %ymm10, %ymm10 +; AVX2-NEXT: vbroadcastss 112(%rbp), %ymm11 +; AVX2-NEXT: vmulps %ymm0, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 116(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm1, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 120(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm2, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 124(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm3, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 128(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm4, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 132(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm5, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 136(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm6, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 140(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm7, %ymm12, %ymm12 +; AVX2-NEXT: vaddps %ymm12, %ymm11, %ymm11 +; AVX2-NEXT: vbroadcastss 144(%rbp), %ymm12 +; AVX2-NEXT: vmulps %ymm0, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 148(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm1, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 152(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm2, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 156(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm3, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 160(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm4, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 164(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm5, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 168(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm6, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 172(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm7, %ymm13, %ymm13 +; AVX2-NEXT: vaddps %ymm13, %ymm12, %ymm12 +; AVX2-NEXT: vbroadcastss 176(%rbp), %ymm13 +; AVX2-NEXT: vmulps %ymm0, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 180(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm1, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 184(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm2, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; 
AVX2-NEXT: vbroadcastss 188(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm3, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 192(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm4, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 196(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm5, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 200(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm6, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 204(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm7, %ymm14, %ymm14 +; AVX2-NEXT: vaddps %ymm14, %ymm13, %ymm13 +; AVX2-NEXT: vbroadcastss 208(%rbp), %ymm14 +; AVX2-NEXT: vmulps %ymm0, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 212(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm1, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 216(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm2, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 220(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm3, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 224(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm4, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 228(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm5, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 232(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm6, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 236(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm7, %ymm15, %ymm15 +; AVX2-NEXT: vaddps %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vbroadcastss 240(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm0, %ymm15, %ymm0 +; AVX2-NEXT: vbroadcastss 244(%rbp), %ymm15 +; AVX2-NEXT: vmulps %ymm1, %ymm15, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss 248(%rbp), %ymm1 +; AVX2-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss 252(%rbp), %ymm1 +; AVX2-NEXT: vmulps %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss 256(%rbp), %ymm1 +; AVX2-NEXT: vmulps %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss 260(%rbp), %ymm1 +; AVX2-NEXT: vmulps %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss 264(%rbp), %ymm1 +; AVX2-NEXT: vmulps %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastss 268(%rbp), %ymm1 +; AVX2-NEXT: vmulps %ymm1, %ymm7, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovaps %ymm0, 224(%rdi) +; AVX2-NEXT: vmovaps %ymm14, 192(%rdi) +; AVX2-NEXT: vmovaps %ymm13, 160(%rdi) +; AVX2-NEXT: vmovaps %ymm12, 128(%rdi) +; AVX2-NEXT: vmovaps %ymm11, 96(%rdi) +; AVX2-NEXT: vmovaps %ymm10, 64(%rdi) +; AVX2-NEXT: vmovaps %ymm9, 32(%rdi) +; AVX2-NEXT: vmovaps %ymm8, (%rdi) +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_mul8x8_f32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512F-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcastss %xmm4, %ymm10 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm8 = 
xmm4[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm11 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm9 +; AVX512F-NEXT: vextractf64x4 $1, %zmm4, %ymm8 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm8[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm9, %zmm23 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm4[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm9, %ymm9 +; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm12 +; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm10, %zmm26 +; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm10 +; AVX512F-NEXT: vbroadcastss %xmm10, %ymm10 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm8[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm27 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm8[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm12 +; AVX512F-NEXT: vbroadcastss %xmm12, %ymm13 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm9, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vbroadcastss %xmm5, %ymm11 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm9, %ymm14 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm10, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm9 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm14, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm5[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm15, %zmm11, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill +; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm10, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm10 +; AVX512F-NEXT: vbroadcastss %xmm10, %ymm10 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm9[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm0 +; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm11 +; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm6[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm18 +; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm10 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm12, %zmm20 +; AVX512F-NEXT: vbroadcastss %xmm6, %ymm11 +; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm12 +; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm21 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm6[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm10[1,1,3,3] +; AVX512F-NEXT: 
vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm22 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm24 +; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm11 +; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11 +; AVX512F-NEXT: vmovshdup {{.*#+}} zmm17 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vmovaps {{.*#+}} zmm19 = [6,6,6,6,6,6,6,6,22,22,22,22,22,22,22,22] +; AVX512F-NEXT: vmovaps %zmm4, %zmm29 +; AVX512F-NEXT: vpermt2ps %zmm8, %zmm19, %zmm29 +; AVX512F-NEXT: vmovaps {{.*#+}} zmm30 = [7,7,7,7,7,7,7,7,23,23,23,23,23,23,23,23] +; AVX512F-NEXT: vpermt2ps %zmm8, %zmm30, %zmm4 +; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm8 +; AVX512F-NEXT: vbroadcastss %xmm8, %ymm8 +; AVX512F-NEXT: vmovshdup {{.*#+}} zmm31 = zmm5[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vmovaps %zmm5, %zmm2 +; AVX512F-NEXT: vpermt2ps %zmm9, %zmm19, %zmm2 +; AVX512F-NEXT: vpermt2ps %zmm9, %zmm30, %zmm5 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm11, %zmm16 +; AVX512F-NEXT: vmovshdup {{.*#+}} zmm15 = zmm6[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vmovaps %zmm6, %zmm25 +; AVX512F-NEXT: vpermt2ps %zmm10, %zmm19, %zmm25 +; AVX512F-NEXT: vpermt2ps %zmm10, %zmm30, %zmm6 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm0 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[2,2,2,2] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm13 +; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm10 +; AVX512F-NEXT: vbroadcastss %xmm10, %ymm10 +; AVX512F-NEXT: vbroadcastss %xmm7, %ymm11 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm11, %zmm12 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm7[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm0[1,1,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm9 +; AVX512F-NEXT: vpermi2ps %zmm0, %zmm7, %zmm19 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[3,3,3,3] +; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm10 +; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm11 +; AVX512F-NEXT: vmovshdup {{.*#+}} zmm8 = zmm7[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpermt2ps %zmm0, %zmm30, %zmm7 +; AVX512F-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm3 +; AVX512F-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm30 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] +; AVX512F-NEXT: vmulps %zmm26, %zmm30, %zmm1 +; AVX512F-NEXT: vmulps %zmm27, %zmm0, %zmm28 +; AVX512F-NEXT: vaddps %zmm28, %zmm1, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $1, %ymm14, %zmm14, %zmm28 +; AVX512F-NEXT: vmulps %zmm23, %zmm28, %zmm27 +; AVX512F-NEXT: vaddps %zmm27, %zmm1, %zmm1 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm27 = zmm14[4,5,6,7,4,5,6,7] +; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm26 # 64-byte Folded Reload +; AVX512F-NEXT: vaddps %zmm26, %zmm1, %zmm1 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 
64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $1, %ymm14, %zmm14, %zmm26 +; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm23 # 64-byte Folded Reload +; AVX512F-NEXT: vaddps %zmm23, %zmm1, %zmm1 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm23 = zmm14[4,5,6,7,4,5,6,7] +; AVX512F-NEXT: vpermpd {{.*#+}} zmm14 = zmm17[2,2,2,2,6,6,6,6] +; AVX512F-NEXT: vmulps %zmm14, %zmm23, %zmm14 +; AVX512F-NEXT: vaddps %zmm14, %zmm1, %zmm17 +; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm14 +; AVX512F-NEXT: vmulps %zmm29, %zmm14, %zmm29 +; AVX512F-NEXT: vaddps %zmm29, %zmm17, %zmm17 +; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm29 = zmm1[4,5,6,7,4,5,6,7] +; AVX512F-NEXT: vmulps %zmm4, %zmm29, %zmm4 +; AVX512F-NEXT: vaddps %zmm4, %zmm17, %zmm4 +; AVX512F-NEXT: vmulps (%rsp), %zmm30, %zmm1 # 64-byte Folded Reload +; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1 +; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1 +; AVX512F-NEXT: vmulps %zmm5, %zmm29, %zmm5 +; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm17 # 64-byte Folded Reload +; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1 +; AVX512F-NEXT: vmulps %zmm18, %zmm26, %zmm17 +; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm17 = zmm31[2,2,2,2,6,6,6,6] +; AVX512F-NEXT: vmulps %zmm17, %zmm23, %zmm17 +; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1 +; AVX512F-NEXT: vmulps %zmm2, %zmm14, %zmm2 +; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vaddps %zmm5, %zmm1, %zmm1 +; AVX512F-NEXT: vmulps %zmm21, %zmm30, %zmm2 +; AVX512F-NEXT: vmulps %zmm22, %zmm0, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmulps %zmm20, %zmm28, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmulps %zmm24, %zmm27, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmulps %zmm16, %zmm26, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm15[2,2,2,2,6,6,6,6] +; AVX512F-NEXT: vmulps %zmm5, %zmm23, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmulps %zmm25, %zmm14, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmulps %zmm6, %zmm29, %zmm5 +; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 +; AVX512F-NEXT: vmulps %zmm13, %zmm28, %zmm5 +; AVX512F-NEXT: vmulps %zmm12, %zmm30, %zmm6 +; AVX512F-NEXT: vmulps %zmm9, %zmm0, %zmm0 +; AVX512F-NEXT: vaddps %zmm0, %zmm6, %zmm0 +; AVX512F-NEXT: vaddps %zmm5, %zmm0, %zmm0 +; AVX512F-NEXT: vmulps %zmm3, %zmm27, %zmm3 +; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vbroadcastss %xmm10, %ymm3 +; AVX512F-NEXT: vbroadcastss %xmm11, %ymm5 +; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512F-NEXT: vmulps %zmm3, %zmm26, %zmm3 +; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpermpd {{.*#+}} zmm3 = zmm8[2,2,2,2,6,6,6,6] +; AVX512F-NEXT: vmulps %zmm3, %zmm23, %zmm3 +; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vmulps %zmm19, %zmm14, %zmm3 +; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vmulps %zmm7, %zmm29, %zmm3 +; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vmovaps %zmm4, %zmm0 +; AVX512F-NEXT: addq $520, %rsp # imm = 0x208 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_mul8x8_f32: +; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm12 +; 
AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm4[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm11 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm15 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm9 +; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm8 +; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm10 +; AVX512VL-NEXT: vextractf32x4 $2, %zmm4, %xmm8 +; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm17 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm4, %ymm8 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm8[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm14 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm8[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm18 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm16 +; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm13 +; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm18, %zmm15, %zmm18 +; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm19 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm17, %zmm12, %zmm12 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm5[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm20 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm14, %zmm11, %zmm11 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm15 +; AVX512VL-NEXT: vmulps %zmm12, %zmm15, %zmm12 +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm14 = zmm0[4,5,6,7,4,5,6,7] +; AVX512VL-NEXT: vmulps %zmm11, %zmm14, %zmm0 +; AVX512VL-NEXT: vaddps %zmm0, %zmm12, %zmm0 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm17 +; AVX512VL-NEXT: vmulps %zmm18, %zmm17, %zmm11 +; AVX512VL-NEXT: vaddps %zmm11, %zmm0, %zmm0 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm11, %ymm18 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm16, %zmm9, %zmm11 +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm9 = zmm1[4,5,6,7,4,5,6,7] +; AVX512VL-NEXT: vmulps %zmm11, %zmm9, %zmm1 +; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm13, %zmm10, %zmm1 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm10 +; AVX512VL-NEXT: vmulps %zmm1, %zmm10, %zmm1 +; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm1 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[2,2,2,2,6,6,6,6] +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm11 = zmm2[4,5,6,7,4,5,6,7] +; AVX512VL-NEXT: vmulps %zmm1, %zmm11, %zmm1 +; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vmovaps {{.*#+}} zmm12 = [6,6,6,6,6,6,6,6,22,22,22,22,22,22,22,22] +; AVX512VL-NEXT: vmovaps %zmm4, %zmm1 +; AVX512VL-NEXT: vpermt2ps %zmm8, %zmm12, %zmm1 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm13 +; AVX512VL-NEXT: vmulps %zmm1, %zmm13, %zmm1 +; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vmovaps {{.*#+}} zmm16 = [7,7,7,7,7,7,7,7,23,23,23,23,23,23,23,23] +; AVX512VL-NEXT: vpermt2ps %zmm8, %zmm16, %zmm4 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm5, %ymm1 +; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] +; AVX512VL-NEXT: vmulps %zmm4, %zmm3, %zmm2 +; AVX512VL-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm18, %zmm2 +; AVX512VL-NEXT: vextractf32x4 $2, %zmm5, %xmm4 +; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm19, %zmm4 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm1[1,1,3,3] +; 
AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm8 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm8, %zmm20, %zmm8 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm18 = xmm5[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm18, %ymm18 +; AVX512VL-NEXT: vmulps %zmm4, %zmm15, %zmm4 +; AVX512VL-NEXT: vmulps %zmm8, %zmm14, %zmm8 +; AVX512VL-NEXT: vaddps %zmm8, %zmm4, %zmm4 +; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm8 +; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8 +; AVX512VL-NEXT: vmulps %zmm2, %zmm17, %zmm2 +; AVX512VL-NEXT: vaddps %zmm2, %zmm4, %zmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm18, %zmm4 +; AVX512VL-NEXT: vextractf32x4 $3, %zmm5, %xmm18 +; AVX512VL-NEXT: vbroadcastss %xmm18, %ymm18 +; AVX512VL-NEXT: vmulps %zmm4, %zmm9, %zmm4 +; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm4 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm18, %zmm8, %zmm8 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm18 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm18, %ymm18 +; AVX512VL-NEXT: vmulps %zmm8, %zmm10, %zmm8 +; AVX512VL-NEXT: vaddps %zmm8, %zmm2, %zmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm8 = zmm5[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermpd {{.*#+}} zmm8 = zmm8[2,2,2,2,6,6,6,6] +; AVX512VL-NEXT: vmulps %zmm8, %zmm11, %zmm8 +; AVX512VL-NEXT: vaddps %zmm8, %zmm2, %zmm2 +; AVX512VL-NEXT: vmovaps %zmm5, %zmm8 +; AVX512VL-NEXT: vpermt2ps %zmm1, %zmm12, %zmm8 +; AVX512VL-NEXT: vmulps %zmm8, %zmm13, %zmm8 +; AVX512VL-NEXT: vaddps %zmm8, %zmm2, %zmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm8 +; AVX512VL-NEXT: vpermt2ps %zmm1, %zmm16, %zmm5 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm6, %ymm19 +; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm1 +; AVX512VL-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm19[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm8, %zmm2 +; AVX512VL-NEXT: vextractf32x4 $2, %zmm6, %xmm5 +; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm5 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm19[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm18, %zmm5 +; AVX512VL-NEXT: vmulps %zmm4, %zmm15, %zmm4 +; AVX512VL-NEXT: vmulps %zmm5, %zmm14, %zmm5 +; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5 +; AVX512VL-NEXT: vmulps %zmm2, %zmm17, %zmm2 +; AVX512VL-NEXT: vaddps %zmm2, %zmm4, %zmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm19[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm5 +; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm5 +; AVX512VL-NEXT: vmulps %zmm4, %zmm9, %zmm4 +; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vextractf32x4 $3, %zmm6, %xmm4 +; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512VL-NEXT: vmulps %zmm4, %zmm10, %zmm4 +; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm4 = zmm6[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6] +; AVX512VL-NEXT: vmulps %zmm4, %zmm11, %zmm4 +; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vmovaps %zmm6, %zmm4 +; AVX512VL-NEXT: 
vpermt2ps %zmm19, %zmm12, %zmm4 +; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4 +; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm7[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4 +; AVX512VL-NEXT: vpermt2ps %zmm19, %zmm16, %zmm6 +; AVX512VL-NEXT: vextractf64x4 $1, %zmm7, %ymm5 +; AVX512VL-NEXT: vmulps %zmm6, %zmm3, %zmm6 +; AVX512VL-NEXT: vaddps %zmm6, %zmm2, %zmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm6 +; AVX512VL-NEXT: vmulps %zmm4, %zmm17, %zmm4 +; AVX512VL-NEXT: vextractf32x4 $2, %zmm7, %xmm8 +; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm7[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm8 +; AVX512VL-NEXT: vmulps %zmm6, %zmm15, %zmm6 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm5[1,1,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm8, %zmm8 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm15 = xmm7[3,3,3,3] +; AVX512VL-NEXT: vmulps %zmm8, %zmm14, %zmm8 +; AVX512VL-NEXT: vextractf128 $1, %ymm7, %xmm14 +; AVX512VL-NEXT: vaddps %zmm8, %zmm6, %zmm6 +; AVX512VL-NEXT: vextractf32x4 $3, %zmm7, %xmm8 +; AVX512VL-NEXT: vaddps %zmm4, %zmm6, %zmm4 +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm6 = zmm7[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermi2ps %zmm5, %zmm7, %zmm12 +; AVX512VL-NEXT: vpermt2ps %zmm5, %zmm16, %zmm7 +; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm15, %zmm5 +; AVX512VL-NEXT: vmulps %zmm5, %zmm9, %zmm5 +; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 +; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm5 +; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8 +; AVX512VL-NEXT: vinsertf64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512VL-NEXT: vmulps %zmm5, %zmm10, %zmm5 +; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 +; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6] +; AVX512VL-NEXT: vmulps %zmm5, %zmm11, %zmm5 +; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 +; AVX512VL-NEXT: vmulps %zmm12, %zmm13, %zmm5 +; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 +; AVX512VL-NEXT: vmulps %zmm7, %zmm3, %zmm3 +; AVX512VL-NEXT: vaddps %zmm3, %zmm4, %zmm3 +; AVX512VL-NEXT: retq +entry: + %split = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split1 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split2 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split3 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split4 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split5 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split6 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %split7 = shufflevector <64 x float> %a0, <64 x float> poison, <8 x i32> + %splat.splat = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> zeroinitializer + %0 = fmul <8 x float> %split, %splat.splat + %splat.splat18 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %1 = fmul <8 x float> %split1, %splat.splat18 + %2 = fadd <8 x float> %0, %1 + %splat.splat21 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %3 = fmul <8 x float> %split2, %splat.splat21 + %4 = fadd <8 x float> %2, %3 + 
%splat.splat24 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %5 = fmul <8 x float> %split3, %splat.splat24 + %6 = fadd <8 x float> %4, %5 + %splat.splat27 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %7 = fmul <8 x float> %split4, %splat.splat27 + %8 = fadd <8 x float> %6, %7 + %splat.splat30 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %9 = fmul <8 x float> %split5, %splat.splat30 + %10 = fadd <8 x float> %8, %9 + %splat.splat33 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %11 = fmul <8 x float> %split6, %splat.splat33 + %12 = fadd <8 x float> %10, %11 + %splat.splat36 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %13 = fmul <8 x float> %split7, %splat.splat36 + %14 = fadd <8 x float> %12, %13 + %splat.splat39 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %15 = fmul <8 x float> %split, %splat.splat39 + %splat.splat42 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %16 = fmul <8 x float> %split1, %splat.splat42 + %17 = fadd <8 x float> %15, %16 + %splat.splat45 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %18 = fmul <8 x float> %split2, %splat.splat45 + %19 = fadd <8 x float> %17, %18 + %splat.splat48 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %20 = fmul <8 x float> %split3, %splat.splat48 + %21 = fadd <8 x float> %19, %20 + %splat.splat51 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %22 = fmul <8 x float> %split4, %splat.splat51 + %23 = fadd <8 x float> %21, %22 + %splat.splat54 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %24 = fmul <8 x float> %split5, %splat.splat54 + %25 = fadd <8 x float> %23, %24 + %splat.splat57 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %26 = fmul <8 x float> %split6, %splat.splat57 + %27 = fadd <8 x float> %25, %26 + %splat.splat60 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %28 = fmul <8 x float> %split7, %splat.splat60 + %29 = fadd <8 x float> %27, %28 + %splat.splat63 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %30 = fmul <8 x float> %split, %splat.splat63 + %splat.splat66 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %31 = fmul <8 x float> %split1, %splat.splat66 + %32 = fadd <8 x float> %30, %31 + %splat.splat69 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %33 = fmul <8 x float> %split2, %splat.splat69 + %34 = fadd <8 x float> %32, %33 + %splat.splat72 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %35 = fmul <8 x float> %split3, %splat.splat72 + %36 = fadd <8 x float> %34, %35 + %splat.splat75 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %37 = fmul <8 x float> %split4, %splat.splat75 + %38 = fadd <8 x float> %36, %37 + %splat.splat78 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %39 = fmul <8 x float> %split5, %splat.splat78 + %40 = fadd <8 x float> %38, %39 + %splat.splat81 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %41 = fmul <8 x float> %split6, %splat.splat81 + %42 = fadd <8 x float> %40, %41 + %splat.splat84 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %43 = fmul <8 x float> %split7, %splat.splat84 + %44 = fadd <8 x float> %42, %43 + %splat.splat87 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %45 = fmul <8 x float> %split, %splat.splat87 + %splat.splat90 = shufflevector <64 x float> %a1, <64 x float> 
undef, <8 x i32> + %46 = fmul <8 x float> %split1, %splat.splat90 + %47 = fadd <8 x float> %45, %46 + %splat.splat93 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %48 = fmul <8 x float> %split2, %splat.splat93 + %49 = fadd <8 x float> %47, %48 + %splat.splat96 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %50 = fmul <8 x float> %split3, %splat.splat96 + %51 = fadd <8 x float> %49, %50 + %splat.splat99 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %52 = fmul <8 x float> %split4, %splat.splat99 + %53 = fadd <8 x float> %51, %52 + %splat.splat102 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %54 = fmul <8 x float> %split5, %splat.splat102 + %55 = fadd <8 x float> %53, %54 + %splat.splat105 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %56 = fmul <8 x float> %split6, %splat.splat105 + %57 = fadd <8 x float> %55, %56 + %splat.splat108 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %58 = fmul <8 x float> %split7, %splat.splat108 + %59 = fadd <8 x float> %57, %58 + %splat.splat111 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %60 = fmul <8 x float> %split, %splat.splat111 + %splat.splat114 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %61 = fmul <8 x float> %split1, %splat.splat114 + %62 = fadd <8 x float> %60, %61 + %splat.splat117 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %63 = fmul <8 x float> %split2, %splat.splat117 + %64 = fadd <8 x float> %62, %63 + %splat.splat120 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %65 = fmul <8 x float> %split3, %splat.splat120 + %66 = fadd <8 x float> %64, %65 + %splat.splat123 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %67 = fmul <8 x float> %split4, %splat.splat123 + %68 = fadd <8 x float> %66, %67 + %splat.splat126 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %69 = fmul <8 x float> %split5, %splat.splat126 + %70 = fadd <8 x float> %68, %69 + %splat.splat129 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %71 = fmul <8 x float> %split6, %splat.splat129 + %72 = fadd <8 x float> %70, %71 + %splat.splat132 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %73 = fmul <8 x float> %split7, %splat.splat132 + %74 = fadd <8 x float> %72, %73 + %splat.splat135 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %75 = fmul <8 x float> %split, %splat.splat135 + %splat.splat138 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %76 = fmul <8 x float> %split1, %splat.splat138 + %77 = fadd <8 x float> %75, %76 + %splat.splat141 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %78 = fmul <8 x float> %split2, %splat.splat141 + %79 = fadd <8 x float> %77, %78 + %splat.splat144 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %80 = fmul <8 x float> %split3, %splat.splat144 + %81 = fadd <8 x float> %79, %80 + %splat.splat147 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %82 = fmul <8 x float> %split4, %splat.splat147 + %83 = fadd <8 x float> %81, %82 + %splat.splat150 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %84 = fmul <8 x float> %split5, %splat.splat150 + %85 = fadd <8 x float> %83, %84 + %splat.splat153 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %86 = fmul <8 x float> %split6, %splat.splat153 + %87 = fadd <8 x float> %85, %86 + %splat.splat156 = shufflevector <64 x 
float> %a1, <64 x float> undef, <8 x i32> + %88 = fmul <8 x float> %split7, %splat.splat156 + %89 = fadd <8 x float> %87, %88 + %splat.splat159 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %90 = fmul <8 x float> %split, %splat.splat159 + %splat.splat162 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %91 = fmul <8 x float> %split1, %splat.splat162 + %92 = fadd <8 x float> %90, %91 + %splat.splat165 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %93 = fmul <8 x float> %split2, %splat.splat165 + %94 = fadd <8 x float> %92, %93 + %splat.splat168 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %95 = fmul <8 x float> %split3, %splat.splat168 + %96 = fadd <8 x float> %94, %95 + %splat.splat171 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %97 = fmul <8 x float> %split4, %splat.splat171 + %98 = fadd <8 x float> %96, %97 + %splat.splat174 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %99 = fmul <8 x float> %split5, %splat.splat174 + %100 = fadd <8 x float> %98, %99 + %splat.splat177 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %101 = fmul <8 x float> %split6, %splat.splat177 + %102 = fadd <8 x float> %100, %101 + %splat.splat180 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %103 = fmul <8 x float> %split7, %splat.splat180 + %104 = fadd <8 x float> %102, %103 + %splat.splat183 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %105 = fmul <8 x float> %split, %splat.splat183 + %splat.splat186 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %106 = fmul <8 x float> %split1, %splat.splat186 + %107 = fadd <8 x float> %105, %106 + %splat.splat189 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %108 = fmul <8 x float> %split2, %splat.splat189 + %109 = fadd <8 x float> %107, %108 + %splat.splat192 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %110 = fmul <8 x float> %split3, %splat.splat192 + %111 = fadd <8 x float> %109, %110 + %splat.splat195 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %112 = fmul <8 x float> %split4, %splat.splat195 + %113 = fadd <8 x float> %111, %112 + %splat.splat198 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %114 = fmul <8 x float> %split5, %splat.splat198 + %115 = fadd <8 x float> %113, %114 + %splat.splat201 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %116 = fmul <8 x float> %split6, %splat.splat201 + %117 = fadd <8 x float> %115, %116 + %splat.splat204 = shufflevector <64 x float> %a1, <64 x float> undef, <8 x i32> + %118 = fmul <8 x float> %split7, %splat.splat204 + %119 = fadd <8 x float> %117, %118 + %120 = shufflevector <8 x float> %14, <8 x float> %29, <16 x i32> + %121 = shufflevector <8 x float> %44, <8 x float> %59, <16 x i32> + %122 = shufflevector <8 x float> %74, <8 x float> %89, <16 x i32> + %123 = shufflevector <8 x float> %104, <8 x float> %119, <16 x i32> + %124 = shufflevector <16 x float> %120, <16 x float> %121, <32 x i32> + %125 = shufflevector <16 x float> %122, <16 x float> %123, <32 x i32> + %126 = shufflevector <32 x float> %124, <32 x float> %125, <64 x i32> + ret <64 x float> %126 +} + +define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) nounwind { +; SSE-LABEL: test_mul8x8_f64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: subq $344, %rsp # imm = 0x158 +; SSE-NEXT: movapd %xmm7, %xmm14 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm4, %xmm15 +; SSE-NEXT: movapd %xmm3, %xmm10 +; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm12 +; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm12[0] +; SSE-NEXT: mulpd %xmm3, %xmm10 +; SSE-NEXT: movapd %xmm2, %xmm8 +; SSE-NEXT: mulpd %xmm3, %xmm8 +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: mulpd %xmm3, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1,1] +; SSE-NEXT: movapd %xmm7, %xmm2 +; SSE-NEXT: mulpd %xmm12, %xmm2 +; SSE-NEXT: addpd %xmm10, %xmm2 +; SSE-NEXT: movapd %xmm6, %xmm7 +; SSE-NEXT: mulpd %xmm12, %xmm7 +; SSE-NEXT: addpd %xmm8, %xmm7 +; SSE-NEXT: movapd %xmm5, %xmm8 +; SSE-NEXT: mulpd %xmm12, %xmm8 +; SSE-NEXT: addpd %xmm9, %xmm8 +; SSE-NEXT: mulpd %xmm4, %xmm12 +; SSE-NEXT: addpd %xmm3, %xmm12 +; SSE-NEXT: movapd %xmm11, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] +; SSE-NEXT: movapd %xmm13, %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm12, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: addpd %xmm8, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm11, %xmm2 +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm11, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm11, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: addpd %xmm3, %xmm11 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm11, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm6, %xmm7 +; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm3, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm14, %xmm13 +; SSE-NEXT: movapd %xmm14, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movapd %xmm9, %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: movapd %xmm11, %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm6, %xmm7 +; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd 
{{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm3, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm13, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm9, %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm11, %xmm5 +; SSE-NEXT: movapd %xmm11, %xmm12 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: movapd %xmm8, %xmm11 +; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm4, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: movapd 
{{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm6, %xmm7 +; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm7, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm2, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm3, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm4, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd %xmm10, %xmm8 +; SSE-NEXT: movapd %xmm10, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm13, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movapd %xmm9, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: movapd %xmm10, %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm12, %xmm14 +; SSE-NEXT: movapd %xmm12, %xmm5 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm3, %xmm5 +; SSE-NEXT: mulpd %xmm11, %xmm0 +; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm0[0] 
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm6, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm6 +; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm6, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm6, %xmm7 +; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm6, %xmm5 +; SSE-NEXT: addpd %xmm4, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm7, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movapd %xmm5, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm5[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm3, %xmm6 +; SSE-NEXT: addpd %xmm4, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm5, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: addpd %xmm2, %xmm5 +; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: mulpd %xmm0, %xmm8 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm13, %xmm2 +; SSE-NEXT: movapd %xmm13, %xmm12 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm8, %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: movapd %xmm10, %xmm4 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm9, %xmm4 +; 
SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movapd %xmm5, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm14, %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm3, %xmm6 +; SSE-NEXT: movapd %xmm11, %xmm8 +; SSE-NEXT: mulpd %xmm11, %xmm0 +; SSE-NEXT: movapd %xmm15, %xmm10 +; SSE-NEXT: mulpd %xmm15, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm7, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm7, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm7, %xmm6 +; SSE-NEXT: addpd %xmm4, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm0, %xmm2 +; SSE-NEXT: addpd %xmm7, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm6, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm1, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm7 +; SSE-NEXT: unpcklpd {{.*#+}} xmm7 = xmm7[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm7, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm7, %xmm9 +; SSE-NEXT: addpd %xmm6, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm7, %xmm6 +; SSE-NEXT: addpd %xmm4, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: addpd %xmm2, %xmm7 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm1, %xmm7 +; SSE-NEXT: addpd %xmm6, %xmm7 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm9, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: movapd %xmm11, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm11[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm3, %xmm6 +; SSE-NEXT: addpd %xmm7, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm4, %xmm3 +; SSE-NEXT: addpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm11, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm11, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm11, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm11 +; SSE-NEXT: 
addpd %xmm2, %xmm11 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm12, %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm2 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movapd %xmm15, %xmm6 +; SSE-NEXT: mulpd %xmm1, %xmm6 +; SSE-NEXT: addpd %xmm3, %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm5 +; SSE-NEXT: movapd %xmm14, %xmm4 +; SSE-NEXT: movapd %xmm14, %xmm7 +; SSE-NEXT: mulpd %xmm1, %xmm7 +; SSE-NEXT: addpd %xmm5, %xmm7 +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: mulpd %xmm10, %xmm1 +; SSE-NEXT: movapd %xmm10, %xmm14 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm9, %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm9, %xmm10 +; SSE-NEXT: addpd %xmm7, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm9, %xmm7 +; SSE-NEXT: addpd %xmm6, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addpd %xmm2, %xmm9 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm9, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: addpd %xmm7, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm0, %xmm7 +; SSE-NEXT: addpd %xmm10, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: movapd %xmm6, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: addpd %xmm0, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm3, %xmm10 +; SSE-NEXT: addpd %xmm7, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm3, %xmm7 +; SSE-NEXT: addpd %xmm9, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm1, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm6, %xmm9 +; SSE-NEXT: addpd %xmm7, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm6, %xmm7 +; SSE-NEXT: addpd %xmm10, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: addpd %xmm2, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE-NEXT: movapd %xmm8, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm8[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm2, %xmm1 +; SSE-NEXT: addpd %xmm6, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm2, %xmm10 +; SSE-NEXT: addpd %xmm7, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: movapd %xmm5, %xmm6 +; SSE-NEXT: mulpd %xmm2, %xmm6 +; SSE-NEXT: addpd %xmm9, %xmm6 +; SSE-NEXT: mulpd 
{{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm0, %xmm2 +; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: movapd %xmm7, %xmm0 +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: addpd %xmm2, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm8, %xmm0 +; SSE-NEXT: addpd %xmm10, %xmm0 +; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm0, %xmm8 +; SSE-NEXT: addpd %xmm1, %xmm8 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd %xmm12, %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm13, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movapd %xmm13, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: movapd %xmm15, %xmm10 +; SSE-NEXT: mulpd %xmm1, %xmm10 +; SSE-NEXT: addpd %xmm9, %xmm10 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movapd %xmm12, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: addpd %xmm1, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulpd %xmm0, %xmm15 +; SSE-NEXT: addpd %xmm10, %xmm15 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm2, %xmm10 +; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm15, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulpd %xmm2, %xmm15 +; SSE-NEXT: addpd %xmm1, %xmm15 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm9, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: movapd %xmm9, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm9[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm3 +; SSE-NEXT: addpd %xmm2, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: mulpd %xmm1, %xmm2 +; SSE-NEXT: addpd %xmm15, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulpd %xmm1, %xmm15 +; SSE-NEXT: addpd %xmm0, %xmm15 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm10, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm9, %xmm4 +; SSE-NEXT: addpd %xmm1, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm9, %xmm10 +; SSE-NEXT: addpd %xmm15, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15 +; SSE-NEXT: mulpd %xmm9, %xmm15 +; SSE-NEXT: 
addpd %xmm2, %xmm15 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: addpd %xmm3, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm9, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm3, %xmm9 +; SSE-NEXT: addpd %xmm15, %xmm9 +; SSE-NEXT: movapd %xmm5, %xmm0 +; SSE-NEXT: mulpd %xmm3, %xmm0 +; SSE-NEXT: addpd %xmm10, %xmm0 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: addpd %xmm4, %xmm3 +; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE-NEXT: movapd %xmm7, %xmm15 +; SSE-NEXT: mulpd %xmm2, %xmm15 +; SSE-NEXT: addpd %xmm3, %xmm15 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 +; SSE-NEXT: mulpd %xmm2, %xmm10 +; SSE-NEXT: addpd %xmm0, %xmm10 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 +; SSE-NEXT: mulpd %xmm2, %xmm7 +; SSE-NEXT: addpd %xmm9, %xmm7 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addpd %xmm1, %xmm2 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm9 +; SSE-NEXT: mulpd %xmm0, %xmm13 +; SSE-NEXT: mulpd %xmm0, %xmm12 +; SSE-NEXT: mulpd %xmm6, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm9 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm13, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm5 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm12, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm3 +; SSE-NEXT: mulpd %xmm14, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: mulpd %xmm4, %xmm14 +; SSE-NEXT: addpd %xmm1, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm4, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm4, %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm5 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm4, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm5, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm0, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm3 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm4, %xmm5 +; SSE-NEXT: addpd %xmm0, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm4, %xmm0 +; SSE-NEXT: addpd %xmm3, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm14 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: 
mulpd %xmm4, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm6, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm6 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: mulpd %xmm1, %xmm0 +; SSE-NEXT: addpd %xmm14, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm9 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm4 +; SSE-NEXT: unpcklpd {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulpd %xmm4, %xmm5 +; SSE-NEXT: addpd %xmm1, %xmm5 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: mulpd %xmm4, %xmm1 +; SSE-NEXT: addpd %xmm9, %xmm1 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 +; SSE-NEXT: mulpd %xmm4, %xmm9 +; SSE-NEXT: addpd %xmm6, %xmm9 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: addpd %xmm3, %xmm4 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: addpd %xmm4, %xmm3 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: mulpd %xmm0, %xmm4 +; SSE-NEXT: addpd %xmm9, %xmm4 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: mulpd %xmm0, %xmm6 +; SSE-NEXT: addpd %xmm1, %xmm6 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: addpd %xmm5, %xmm0 +; SSE-NEXT: movapd %xmm3, 496(%rdi) +; SSE-NEXT: movapd %xmm4, 480(%rdi) +; SSE-NEXT: movapd %xmm6, 464(%rdi) +; SSE-NEXT: movapd %xmm0, 448(%rdi) +; SSE-NEXT: movapd %xmm15, 432(%rdi) +; SSE-NEXT: movapd %xmm10, 416(%rdi) +; SSE-NEXT: movapd %xmm7, 400(%rdi) +; SSE-NEXT: movapd %xmm2, 384(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 368(%rdi) +; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 352(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 336(%rdi) +; SSE-NEXT: movapd %xmm8, 320(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 304(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 288(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 272(%rdi) +; SSE-NEXT: movapd %xmm11, 256(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 240(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 224(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 208(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 192(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 176(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 160(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 144(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 128(%rdi) +; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 112(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 96(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 80(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 64(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 48(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 32(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, 16(%rdi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movaps %xmm0, (%rdi) +; SSE-NEXT: addq $344, %rsp # imm = 0x158 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_mul8x8_f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: movq %rsp, %rbp +; AVX1-NEXT: andq $-32, %rsp +; AVX1-NEXT: subq $448, %rsp # imm = 0x1C0 +; AVX1-NEXT: vmovapd %ymm4, %ymm13 +; AVX1-NEXT: vmovapd %ymm3, %ymm9 +; AVX1-NEXT: vmovapd %ymm1, %ymm4 +; AVX1-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: vmovapd 112(%rbp), %ymm12 +; AVX1-NEXT: vmovapd 48(%rbp), %ymm15 +; AVX1-NEXT: vbroadcastsd 272(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm8 +; AVX1-NEXT: vmulpd %ymm0, %ymm10, %ymm1 +; AVX1-NEXT: vbroadcastsd 280(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm11, %ymm8, %ymm3 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm2, %ymm8 +; AVX1-NEXT: vaddpd %ymm1, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 288(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 296(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 304(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 312(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX1-NEXT: vmovapd %ymm12, %ymm14 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 320(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 328(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 336(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX1-NEXT: vbroadcastsd 344(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vmovapd %ymm9, %ymm12 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmovapd (%rsp), %ymm2 # 32-byte Reload +; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; 
AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 352(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 360(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 368(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 376(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 384(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 392(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 400(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX1-NEXT: vmovapd %ymm4, %ymm9 +; AVX1-NEXT: vbroadcastsd 408(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 416(%rbp), %ymm10 +; AVX1-NEXT: vmovapd %ymm13, %ymm4 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm5, %ymm13 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 424(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm7, %ymm5 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vmovapd %ymm6, %ymm7 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 432(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 440(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 448(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 456(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd 208(%rbp), 
%ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 464(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm9, %ymm3 +; AVX1-NEXT: vmovapd %ymm9, %ymm6 +; AVX1-NEXT: vbroadcastsd 472(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX1-NEXT: vmovapd %ymm2, %ymm1 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX1-NEXT: vmovapd %ymm8, %ymm14 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 480(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm11 +; AVX1-NEXT: vmovapd %ymm4, %ymm15 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd %ymm13, %ymm9 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 488(%rbp), %ymm10 +; AVX1-NEXT: vmovapd %ymm5, %ymm8 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 496(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd 48(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 504(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 512(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm13 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 520(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm2 +; AVX1-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovapd 208(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm10, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 528(%rbp), %ymm0 +; AVX1-NEXT: vmovapd %ymm6, %ymm2 +; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm3 +; AVX1-NEXT: vbroadcastsd 536(%rbp), %ymm10 +; AVX1-NEXT: vmovapd %ymm12, %ymm5 +; AVX1-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovapd %ymm1, %ymm12 +; AVX1-NEXT: vmovapd %ymm14, %ymm6 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 544(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 552(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 560(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 568(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 +; 
AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmovapd 80(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 576(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 584(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovapd 208(%rbp), %ymm14 +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 592(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm2, %ymm3 +; AVX1-NEXT: vbroadcastsd 600(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm0, %ymm12, %ymm0 +; AVX1-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 608(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 616(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 624(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmovapd 48(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 632(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 112(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX1-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX1-NEXT: vbroadcastsd 640(%rbp), %ymm10 +; AVX1-NEXT: vmovapd 144(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX1-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX1-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX1-NEXT: vbroadcastsd 648(%rbp), %ymm10 +; AVX1-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX1-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX1-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmulpd %ymm10, %ymm14, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vbroadcastsd 656(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm3 +; AVX1-NEXT: vbroadcastsd 664(%rbp), %ymm0 +; AVX1-NEXT: vmulpd %ymm0, %ymm5, %ymm13 +; AVX1-NEXT: vaddpd %ymm3, %ymm13, %ymm3 +; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm1 +; AVX1-NEXT: vmulpd %ymm0, %ymm6, %ymm0 +; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX1-NEXT: vbroadcastsd 672(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm15, %ymm13 +; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 +; AVX1-NEXT: vmulpd %ymm1, %ymm9, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vbroadcastsd 680(%rbp), %ymm3 +; AVX1-NEXT: vmulpd %ymm3, %ymm8, %ymm13 +; AVX1-NEXT: vaddpd %ymm1, %ymm13, %ymm1 +; AVX1-NEXT: vmulpd %ymm3, %ymm7, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 688(%rbp), %ymm3 +; 
AVX1-NEXT: vmovapd 16(%rbp), %ymm10 +; AVX1-NEXT: vmulpd %ymm3, %ymm10, %ymm13 +; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 696(%rbp), %ymm3 +; AVX1-NEXT: vmulpd 112(%rbp), %ymm3, %ymm13 +; AVX1-NEXT: vaddpd %ymm1, %ymm13, %ymm1 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm3, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 704(%rbp), %ymm3 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm3, %ymm13 +; AVX1-NEXT: vaddpd %ymm0, %ymm13, %ymm0 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 712(%rbp), %ymm13 +; AVX1-NEXT: vmovapd 240(%rbp), %ymm11 +; AVX1-NEXT: vmulpd %ymm13, %ymm11, %ymm3 +; AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm3 +; AVX1-NEXT: vmulpd %ymm13, %ymm14, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd 720(%rbp), %ymm1 +; AVX1-NEXT: vmulpd %ymm1, %ymm2, %ymm4 +; AVX1-NEXT: vmulpd %ymm1, %ymm12, %ymm1 +; AVX1-NEXT: vbroadcastsd 728(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm5, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm4, %ymm4 +; AVX1-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 736(%rbp), %ymm2 +; AVX1-NEXT: vmulpd %ymm2, %ymm15, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd %ymm2, %ymm9, %ymm2 +; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vbroadcastsd 744(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm8, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd %ymm4, %ymm7, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 752(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm10, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd 760(%rbp), %ymm4 +; AVX1-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 768(%rbp), %ymm4 +; AVX1-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vmulpd 176(%rbp), %ymm4, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vbroadcastsd 776(%rbp), %ymm4 +; AVX1-NEXT: vmulpd %ymm4, %ymm11, %ymm5 +; AVX1-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX1-NEXT: vmulpd %ymm4, %ymm14, %ymm4 +; AVX1-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vmovapd %ymm2, 480(%rdi) +; AVX1-NEXT: vmovapd %ymm1, 448(%rdi) +; AVX1-NEXT: vmovapd %ymm3, 416(%rdi) +; AVX1-NEXT: vmovapd %ymm0, 384(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 352(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 320(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 288(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 256(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 224(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 192(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 160(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 128(%rdi) +; AVX1-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 64(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovaps %ymm0, (%rdi) +; AVX1-NEXT: movq %rbp, %rsp +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul8x8_f64: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $448, %rsp # imm = 0x1C0 +; AVX2-NEXT: vmovapd %ymm4, %ymm13 +; AVX2-NEXT: vmovapd %ymm3, %ymm9 +; AVX2-NEXT: vmovapd %ymm1, %ymm4 +; AVX2-NEXT: vmovapd %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: vmovapd 112(%rbp), %ymm12 +; AVX2-NEXT: vmovapd 48(%rbp), %ymm15 +; AVX2-NEXT: vbroadcastsd 272(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm8 +; AVX2-NEXT: vmulpd %ymm0, %ymm10, %ymm1 +; AVX2-NEXT: vbroadcastsd 280(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm11, %ymm8, %ymm3 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd %ymm2, %ymm8 +; AVX2-NEXT: vaddpd %ymm1, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 288(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 296(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 304(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 312(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX2-NEXT: vmovapd %ymm12, %ymm14 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 320(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 328(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 336(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX2-NEXT: vbroadcastsd 344(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vmovapd %ymm9, %ymm12 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmovapd (%rsp), %ymm2 # 32-byte Reload +; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 352(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 360(%rbp), %ymm10 +; AVX2-NEXT: 
vmulpd %ymm7, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 368(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 376(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 384(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 392(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 400(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm4, %ymm3 +; AVX2-NEXT: vmovapd %ymm4, %ymm9 +; AVX2-NEXT: vbroadcastsd 408(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 416(%rbp), %ymm10 +; AVX2-NEXT: vmovapd %ymm13, %ymm4 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd %ymm5, %ymm13 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 424(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm7, %ymm5 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vmovapd %ymm6, %ymm7 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 432(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 440(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 448(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 456(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd 208(%rbp), %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 464(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm9, %ymm3 +; AVX2-NEXT: vmovapd %ymm9, %ymm6 +; AVX2-NEXT: vbroadcastsd 472(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX2-NEXT: vaddpd 
%ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovapd %ymm2, %ymm1 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm10 +; AVX2-NEXT: vmovapd %ymm8, %ymm14 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 480(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm11 +; AVX2-NEXT: vmovapd %ymm4, %ymm15 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmovapd %ymm13, %ymm9 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 488(%rbp), %ymm10 +; AVX2-NEXT: vmovapd %ymm5, %ymm8 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 496(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmovapd 48(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 504(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 512(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm13 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 520(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm2 +; AVX2-NEXT: vmovapd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovapd 208(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm10, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 528(%rbp), %ymm0 +; AVX2-NEXT: vmovapd %ymm6, %ymm2 +; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm3 +; AVX2-NEXT: vbroadcastsd 536(%rbp), %ymm10 +; AVX2-NEXT: vmovapd %ymm12, %ymm5 +; AVX2-NEXT: vmulpd %ymm10, %ymm12, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovapd %ymm1, %ymm12 +; AVX2-NEXT: vmovapd %ymm14, %ymm6 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 544(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 552(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 560(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 568(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 112(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmovapd 80(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 576(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmovapd 176(%rbp), %ymm13 +; AVX2-NEXT: vmulpd %ymm10, 
%ymm13, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 584(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmovapd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovapd 208(%rbp), %ymm14 +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 592(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm2, %ymm3 +; AVX2-NEXT: vbroadcastsd 600(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm5, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm0, %ymm12, %ymm0 +; AVX2-NEXT: vmulpd %ymm6, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 608(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm15, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm9, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 616(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm10, %ymm8, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm7, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 624(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 16(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmovapd 48(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 632(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 112(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm10 +; AVX2-NEXT: vaddpd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vbroadcastsd 640(%rbp), %ymm10 +; AVX2-NEXT: vmovapd 144(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm0, %ymm11, %ymm0 +; AVX2-NEXT: vmulpd %ymm10, %ymm13, %ymm10 +; AVX2-NEXT: vaddpd %ymm3, %ymm10, %ymm3 +; AVX2-NEXT: vbroadcastsd 648(%rbp), %ymm10 +; AVX2-NEXT: vmulpd 240(%rbp), %ymm10, %ymm11 +; AVX2-NEXT: vaddpd %ymm3, %ymm11, %ymm1 +; AVX2-NEXT: vmovapd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmulpd %ymm10, %ymm14, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovapd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vbroadcastsd 656(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vbroadcastsd 664(%rbp), %ymm0 +; AVX2-NEXT: vmulpd %ymm0, %ymm5, %ymm13 +; AVX2-NEXT: vaddpd %ymm3, %ymm13, %ymm3 +; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm1 +; AVX2-NEXT: vmulpd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vbroadcastsd 672(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm15, %ymm13 +; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0 +; AVX2-NEXT: vmulpd %ymm1, %ymm9, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vbroadcastsd 680(%rbp), %ymm3 +; AVX2-NEXT: vmulpd %ymm3, %ymm8, %ymm13 +; AVX2-NEXT: vaddpd %ymm1, %ymm13, %ymm1 +; AVX2-NEXT: vmulpd %ymm3, %ymm7, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 688(%rbp), %ymm3 +; AVX2-NEXT: vmovapd 16(%rbp), %ymm10 +; AVX2-NEXT: vmulpd %ymm3, %ymm10, %ymm13 +; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 696(%rbp), %ymm3 +; AVX2-NEXT: vmulpd 112(%rbp), %ymm3, %ymm13 +; AVX2-NEXT: vaddpd %ymm1, %ymm13, %ymm1 +; AVX2-NEXT: vmulpd 
80(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 704(%rbp), %ymm3 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm3, %ymm13 +; AVX2-NEXT: vaddpd %ymm0, %ymm13, %ymm0 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm3, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 712(%rbp), %ymm13 +; AVX2-NEXT: vmovapd 240(%rbp), %ymm11 +; AVX2-NEXT: vmulpd %ymm13, %ymm11, %ymm3 +; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vmulpd %ymm13, %ymm14, %ymm1 +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcastsd 720(%rbp), %ymm1 +; AVX2-NEXT: vmulpd %ymm1, %ymm2, %ymm4 +; AVX2-NEXT: vmulpd %ymm1, %ymm12, %ymm1 +; AVX2-NEXT: vbroadcastsd 728(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm5, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vmulpd %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 736(%rbp), %ymm2 +; AVX2-NEXT: vmulpd %ymm2, %ymm15, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd %ymm2, %ymm9, %ymm2 +; AVX2-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vbroadcastsd 744(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm8, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmulpd %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 752(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm10, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd 48(%rbp), %ymm4, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastsd 760(%rbp), %ymm4 +; AVX2-NEXT: vmulpd 112(%rbp), %ymm4, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmulpd 80(%rbp), %ymm4, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vbroadcastsd 768(%rbp), %ymm4 +; AVX2-NEXT: vmulpd 144(%rbp), %ymm4, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vmulpd 176(%rbp), %ymm4, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vbroadcastsd 776(%rbp), %ymm4 +; AVX2-NEXT: vmulpd %ymm4, %ymm11, %ymm5 +; AVX2-NEXT: vaddpd %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vmulpd %ymm4, %ymm14, %ymm4 +; AVX2-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vmovapd %ymm2, 480(%rdi) +; AVX2-NEXT: vmovapd %ymm1, 448(%rdi) +; AVX2-NEXT: vmovapd %ymm3, 416(%rdi) +; AVX2-NEXT: vmovapd %ymm0, 384(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 352(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 320(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 288(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 256(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 224(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 192(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 160(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 128(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 96(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 64(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; AVX2-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-NEXT: movq %rbp, %rsp +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_mul8x8_f64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: movq %rsp, %rbp +; AVX512-NEXT: andq $-64, %rsp +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: movq %rdi, %rax +; AVX512-NEXT: vmulpd 16(%rbp){1to8}, %zmm0, %zmm8 +; AVX512-NEXT: vmulpd 24(%rbp){1to8}, %zmm1, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 32(%rbp){1to8}, %zmm2, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 40(%rbp){1to8}, %zmm3, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 48(%rbp){1to8}, %zmm4, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 56(%rbp){1to8}, %zmm5, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 64(%rbp){1to8}, %zmm6, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 72(%rbp){1to8}, %zmm7, %zmm9 +; AVX512-NEXT: vaddpd %zmm9, %zmm8, %zmm8 +; AVX512-NEXT: vmulpd 80(%rbp){1to8}, %zmm0, %zmm9 +; AVX512-NEXT: vmulpd 88(%rbp){1to8}, %zmm1, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 96(%rbp){1to8}, %zmm2, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 104(%rbp){1to8}, %zmm3, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 112(%rbp){1to8}, %zmm4, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 120(%rbp){1to8}, %zmm5, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 128(%rbp){1to8}, %zmm6, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 136(%rbp){1to8}, %zmm7, %zmm10 +; AVX512-NEXT: vaddpd %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmulpd 144(%rbp){1to8}, %zmm0, %zmm10 +; AVX512-NEXT: vmulpd 152(%rbp){1to8}, %zmm1, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 160(%rbp){1to8}, %zmm2, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 168(%rbp){1to8}, %zmm3, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 176(%rbp){1to8}, %zmm4, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 184(%rbp){1to8}, %zmm5, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 192(%rbp){1to8}, %zmm6, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 200(%rbp){1to8}, %zmm7, %zmm11 +; AVX512-NEXT: vaddpd %zmm11, %zmm10, %zmm10 +; AVX512-NEXT: vmulpd 208(%rbp){1to8}, %zmm0, %zmm11 +; AVX512-NEXT: vmulpd 216(%rbp){1to8}, %zmm1, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 224(%rbp){1to8}, %zmm2, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 232(%rbp){1to8}, %zmm3, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 240(%rbp){1to8}, %zmm4, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 248(%rbp){1to8}, %zmm5, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 256(%rbp){1to8}, %zmm6, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 264(%rbp){1to8}, %zmm7, %zmm12 +; AVX512-NEXT: vaddpd %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vmulpd 272(%rbp){1to8}, %zmm0, %zmm12 +; AVX512-NEXT: vmulpd 280(%rbp){1to8}, %zmm1, %zmm13 +; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 288(%rbp){1to8}, %zmm2, %zmm13 +; 
AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 296(%rbp){1to8}, %zmm3, %zmm13 +; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 304(%rbp){1to8}, %zmm4, %zmm13 +; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 312(%rbp){1to8}, %zmm5, %zmm13 +; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 320(%rbp){1to8}, %zmm6, %zmm13 +; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 328(%rbp){1to8}, %zmm7, %zmm13 +; AVX512-NEXT: vaddpd %zmm13, %zmm12, %zmm12 +; AVX512-NEXT: vmulpd 336(%rbp){1to8}, %zmm0, %zmm13 +; AVX512-NEXT: vmulpd 344(%rbp){1to8}, %zmm1, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 352(%rbp){1to8}, %zmm2, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 360(%rbp){1to8}, %zmm3, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 368(%rbp){1to8}, %zmm4, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 376(%rbp){1to8}, %zmm5, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 384(%rbp){1to8}, %zmm6, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 392(%rbp){1to8}, %zmm7, %zmm14 +; AVX512-NEXT: vaddpd %zmm14, %zmm13, %zmm13 +; AVX512-NEXT: vmulpd 400(%rbp){1to8}, %zmm0, %zmm14 +; AVX512-NEXT: vmulpd 408(%rbp){1to8}, %zmm1, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 416(%rbp){1to8}, %zmm2, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 424(%rbp){1to8}, %zmm3, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 432(%rbp){1to8}, %zmm4, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 440(%rbp){1to8}, %zmm5, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 448(%rbp){1to8}, %zmm6, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 456(%rbp){1to8}, %zmm7, %zmm15 +; AVX512-NEXT: vaddpd %zmm15, %zmm14, %zmm14 +; AVX512-NEXT: vmulpd 464(%rbp){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 472(%rbp){1to8}, %zmm1, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 480(%rbp){1to8}, %zmm2, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 488(%rbp){1to8}, %zmm3, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 496(%rbp){1to8}, %zmm4, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 504(%rbp){1to8}, %zmm5, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 512(%rbp){1to8}, %zmm6, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd 520(%rbp){1to8}, %zmm7, %zmm1 +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovapd %zmm0, 448(%rdi) +; AVX512-NEXT: vmovapd %zmm14, 384(%rdi) +; AVX512-NEXT: vmovapd %zmm13, 320(%rdi) +; AVX512-NEXT: vmovapd %zmm12, 256(%rdi) +; AVX512-NEXT: vmovapd %zmm11, 192(%rdi) +; AVX512-NEXT: vmovapd %zmm10, 128(%rdi) +; AVX512-NEXT: vmovapd %zmm9, 64(%rdi) +; AVX512-NEXT: vmovapd %zmm8, (%rdi) +; AVX512-NEXT: movq %rbp, %rsp +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq +entry: + %split = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %split1 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %split2 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %split3 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %split4 = shufflevector 
<64 x double> %a0, <64 x double> poison, <8 x i32> + %split5 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %split6 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %split7 = shufflevector <64 x double> %a0, <64 x double> poison, <8 x i32> + %splat.splat = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> zeroinitializer + %0 = fmul <8 x double> %split, %splat.splat + %splat.splat18 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %1 = fmul <8 x double> %split1, %splat.splat18 + %2 = fadd <8 x double> %0, %1 + %splat.splat21 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %3 = fmul <8 x double> %split2, %splat.splat21 + %4 = fadd <8 x double> %2, %3 + %splat.splat24 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %5 = fmul <8 x double> %split3, %splat.splat24 + %6 = fadd <8 x double> %4, %5 + %splat.splat27 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %7 = fmul <8 x double> %split4, %splat.splat27 + %8 = fadd <8 x double> %6, %7 + %splat.splat30 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %9 = fmul <8 x double> %split5, %splat.splat30 + %10 = fadd <8 x double> %8, %9 + %splat.splat33 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %11 = fmul <8 x double> %split6, %splat.splat33 + %12 = fadd <8 x double> %10, %11 + %splat.splat36 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %13 = fmul <8 x double> %split7, %splat.splat36 + %14 = fadd <8 x double> %12, %13 + %splat.splat39 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %15 = fmul <8 x double> %split, %splat.splat39 + %splat.splat42 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %16 = fmul <8 x double> %split1, %splat.splat42 + %17 = fadd <8 x double> %15, %16 + %splat.splat45 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %18 = fmul <8 x double> %split2, %splat.splat45 + %19 = fadd <8 x double> %17, %18 + %splat.splat48 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %20 = fmul <8 x double> %split3, %splat.splat48 + %21 = fadd <8 x double> %19, %20 + %splat.splat51 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %22 = fmul <8 x double> %split4, %splat.splat51 + %23 = fadd <8 x double> %21, %22 + %splat.splat54 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %24 = fmul <8 x double> %split5, %splat.splat54 + %25 = fadd <8 x double> %23, %24 + %splat.splat57 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %26 = fmul <8 x double> %split6, %splat.splat57 + %27 = fadd <8 x double> %25, %26 + %splat.splat60 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %28 = fmul <8 x double> %split7, %splat.splat60 + %29 = fadd <8 x double> %27, %28 + %splat.splat63 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %30 = fmul <8 x double> %split, %splat.splat63 + %splat.splat66 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %31 = fmul <8 x double> %split1, %splat.splat66 + %32 = fadd <8 x double> %30, %31 + %splat.splat69 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %33 = fmul <8 x double> %split2, %splat.splat69 + %34 = fadd <8 x double> %32, %33 + %splat.splat72 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %35 = fmul <8 x double> %split3, %splat.splat72 + %36 = fadd <8 x double> %34, %35 + %splat.splat75 = 
shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %37 = fmul <8 x double> %split4, %splat.splat75 + %38 = fadd <8 x double> %36, %37 + %splat.splat78 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %39 = fmul <8 x double> %split5, %splat.splat78 + %40 = fadd <8 x double> %38, %39 + %splat.splat81 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %41 = fmul <8 x double> %split6, %splat.splat81 + %42 = fadd <8 x double> %40, %41 + %splat.splat84 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %43 = fmul <8 x double> %split7, %splat.splat84 + %44 = fadd <8 x double> %42, %43 + %splat.splat87 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %45 = fmul <8 x double> %split, %splat.splat87 + %splat.splat90 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %46 = fmul <8 x double> %split1, %splat.splat90 + %47 = fadd <8 x double> %45, %46 + %splat.splat93 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %48 = fmul <8 x double> %split2, %splat.splat93 + %49 = fadd <8 x double> %47, %48 + %splat.splat96 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %50 = fmul <8 x double> %split3, %splat.splat96 + %51 = fadd <8 x double> %49, %50 + %splat.splat99 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %52 = fmul <8 x double> %split4, %splat.splat99 + %53 = fadd <8 x double> %51, %52 + %splat.splat102 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %54 = fmul <8 x double> %split5, %splat.splat102 + %55 = fadd <8 x double> %53, %54 + %splat.splat105 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %56 = fmul <8 x double> %split6, %splat.splat105 + %57 = fadd <8 x double> %55, %56 + %splat.splat108 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %58 = fmul <8 x double> %split7, %splat.splat108 + %59 = fadd <8 x double> %57, %58 + %splat.splat111 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %60 = fmul <8 x double> %split, %splat.splat111 + %splat.splat114 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %61 = fmul <8 x double> %split1, %splat.splat114 + %62 = fadd <8 x double> %60, %61 + %splat.splat117 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %63 = fmul <8 x double> %split2, %splat.splat117 + %64 = fadd <8 x double> %62, %63 + %splat.splat120 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %65 = fmul <8 x double> %split3, %splat.splat120 + %66 = fadd <8 x double> %64, %65 + %splat.splat123 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %67 = fmul <8 x double> %split4, %splat.splat123 + %68 = fadd <8 x double> %66, %67 + %splat.splat126 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %69 = fmul <8 x double> %split5, %splat.splat126 + %70 = fadd <8 x double> %68, %69 + %splat.splat129 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %71 = fmul <8 x double> %split6, %splat.splat129 + %72 = fadd <8 x double> %70, %71 + %splat.splat132 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %73 = fmul <8 x double> %split7, %splat.splat132 + %74 = fadd <8 x double> %72, %73 + %splat.splat135 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %75 = fmul <8 x double> %split, %splat.splat135 + %splat.splat138 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %76 = fmul <8 x double> %split1, 
%splat.splat138 + %77 = fadd <8 x double> %75, %76 + %splat.splat141 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %78 = fmul <8 x double> %split2, %splat.splat141 + %79 = fadd <8 x double> %77, %78 + %splat.splat144 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %80 = fmul <8 x double> %split3, %splat.splat144 + %81 = fadd <8 x double> %79, %80 + %splat.splat147 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %82 = fmul <8 x double> %split4, %splat.splat147 + %83 = fadd <8 x double> %81, %82 + %splat.splat150 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %84 = fmul <8 x double> %split5, %splat.splat150 + %85 = fadd <8 x double> %83, %84 + %splat.splat153 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %86 = fmul <8 x double> %split6, %splat.splat153 + %87 = fadd <8 x double> %85, %86 + %splat.splat156 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %88 = fmul <8 x double> %split7, %splat.splat156 + %89 = fadd <8 x double> %87, %88 + %splat.splat159 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %90 = fmul <8 x double> %split, %splat.splat159 + %splat.splat162 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %91 = fmul <8 x double> %split1, %splat.splat162 + %92 = fadd <8 x double> %90, %91 + %splat.splat165 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %93 = fmul <8 x double> %split2, %splat.splat165 + %94 = fadd <8 x double> %92, %93 + %splat.splat168 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %95 = fmul <8 x double> %split3, %splat.splat168 + %96 = fadd <8 x double> %94, %95 + %splat.splat171 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %97 = fmul <8 x double> %split4, %splat.splat171 + %98 = fadd <8 x double> %96, %97 + %splat.splat174 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %99 = fmul <8 x double> %split5, %splat.splat174 + %100 = fadd <8 x double> %98, %99 + %splat.splat177 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %101 = fmul <8 x double> %split6, %splat.splat177 + %102 = fadd <8 x double> %100, %101 + %splat.splat180 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %103 = fmul <8 x double> %split7, %splat.splat180 + %104 = fadd <8 x double> %102, %103 + %splat.splat183 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %105 = fmul <8 x double> %split, %splat.splat183 + %splat.splat186 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %106 = fmul <8 x double> %split1, %splat.splat186 + %107 = fadd <8 x double> %105, %106 + %splat.splat189 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %108 = fmul <8 x double> %split2, %splat.splat189 + %109 = fadd <8 x double> %107, %108 + %splat.splat192 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %110 = fmul <8 x double> %split3, %splat.splat192 + %111 = fadd <8 x double> %109, %110 + %splat.splat195 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %112 = fmul <8 x double> %split4, %splat.splat195 + %113 = fadd <8 x double> %111, %112 + %splat.splat198 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %114 = fmul <8 x double> %split5, %splat.splat198 + %115 = fadd <8 x double> %113, %114 + %splat.splat201 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %116 = fmul <8 x double> %split6, %splat.splat201 + %117 = 
fadd <8 x double> %115, %116 + %splat.splat204 = shufflevector <64 x double> %a1, <64 x double> undef, <8 x i32> + %118 = fmul <8 x double> %split7, %splat.splat204 + %119 = fadd <8 x double> %117, %118 + %120 = shufflevector <8 x double> %14, <8 x double> %29, <16 x i32> + %121 = shufflevector <8 x double> %44, <8 x double> %59, <16 x i32> + %122 = shufflevector <8 x double> %74, <8 x double> %89, <16 x i32> + %123 = shufflevector <8 x double> %104, <8 x double> %119, <16 x i32> + %124 = shufflevector <16 x double> %120, <16 x double> %121, <32 x i32> + %125 = shufflevector <16 x double> %122, <16 x double> %123, <32 x i32> + %126 = shufflevector <32 x double> %124, <32 x double> %125, <64 x i32> + ret <64 x double> %126 +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} -- 2.7.4
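
For reference, the <64 x double> arguments of test_mul8x8_f64 hold 8x8 matrices in column-major order (the layout used by clang's matrix_type lowering), and the checked code computes the product one result column at a time: each column of %a0 is scaled by a broadcast element of %a1 (vbroadcastsd on AVX1/AVX2, the {1to8} embedded broadcast on AVX512) and accumulated with vmulpd/vaddpd. The plain-C sketch below is illustration only, with a hypothetical helper name, and is not part of the test file.

/*
 * Reference sketch: the scalar computation that the <64 x double> IR above
 * performs column by column. Matrices are stored column-major, so element
 * (row, col) lives at col*N + row, matching the <64 x double> operand layout
 * assumed here.
 */
#include <stdio.h>

#define N 8

static void mul8x8_ref(double *c, const double *a, const double *b) {
  for (int j = 0; j < N; ++j) {          /* result column */
    for (int r = 0; r < N; ++r)
      c[j * N + r] = 0.0;
    for (int k = 0; k < N; ++k) {        /* column k of A, row k of B */
      double bkj = b[j * N + k];         /* the broadcast scalar B[k][j] */
      for (int r = 0; r < N; ++r)        /* multiply-accumulate into column j */
        c[j * N + r] += a[k * N + r] * bkj;
    }
  }
}

int main(void) {
  double a[N * N], b[N * N], c[N * N];
  for (int i = 0; i < N * N; ++i) {
    a[i] = (double)(i % 7);
    b[i] = (double)(i % 5);
  }
  mul8x8_ref(c, a, b);
  printf("c[0][0] = %f\n", c[0]);       /* column-major: c[0] is C[0][0] */
  return 0;
}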