--- /dev/null
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "l2param.h"
+
+#define A_PRE 256
+
+#define VMOVUPS_A1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
+#define VMOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) vmovups OFF(ADDR, BASE, SCALE), REGS
+#define VMOVUPS_YL1(OFF, ADDR, REGS) vmovups OFF(ADDR), REGS
+#define VMOVUPS_YS1(OFF, ADDR, REGS) vmovups REGS, OFF(ADDR)
+
+#if GEMV_UNROLL < 2
+#undef GEMV_UNROLL
+#define GEMV_UNROLL 2
+#endif
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 64
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define OLD_A %rcx
+#define OLD_LDA %r8
+#define STACK_INCX 8 + STACKSIZE(%rsp)
+#define STACK_Y 16 + STACKSIZE(%rsp)
+#define STACK_INCY 24 + STACKSIZE(%rsp)
+#define STACK_BUFFER 32 + STACKSIZE(%rsp)
+#define ALPHA 48 (%rsp)
+
+#else
+
+#define STACKSIZE 256
+
+#define OLD_M %rcx
+#define OLD_N %rdx
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_LDA 48 + STACKSIZE(%rsp)
+#define OLD_X 56 + STACKSIZE(%rsp)
+#define STACK_INCX 64 + STACKSIZE(%rsp)
+#define STACK_Y 72 + STACKSIZE(%rsp)
+#define STACK_INCY 80 + STACKSIZE(%rsp)
+#define STACK_BUFFER 88 + STACKSIZE(%rsp)
+#define ALPHA 224 (%rsp)
+
+#endif
+
+#define LDA %r8
+#define X %r9
+
+#define INCX %rsi
+#define INCY %rdi
+
+#define M %r10
+#define N %r11
+#define A %r12
+#define Y %r14
+#define BUFFER %r13
+
+#define I %rax
+#define A1 %rbx
+#define A2 %rcx
+#define LDA3 %rdx
+#define Y1 %rbp
+
+#ifdef ALIGNED_ACCESS
+#define MM %r15
+#else
+#define MM M
+#endif
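+
+/* Under ALIGNED_ACCESS, MM holds the rows left after one scalar element is */
+/* peeled off to bring A to 16-byte alignment; otherwise MM is simply M. */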
+
+ PROLOGUE
+ PROFCODE
+
+ subq $STACKSIZE, %rsp
+ movq %rbx, 0(%rsp)
+ movq %rbp, 8(%rsp)
+ movq %r12, 16(%rsp)
+ movq %r13, 24(%rsp)
+ movq %r14, 32(%rsp)
+ movq %r15, 40(%rsp)
+
+#ifdef WINDOWS_ABI
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ vmovups %xmm6, 64(%rsp)
+ vmovups %xmm7, 80(%rsp)
+ vmovups %xmm8, 96(%rsp)
+ vmovups %xmm9, 112(%rsp)
+ vmovups %xmm10, 128(%rsp)
+ vmovups %xmm11, 144(%rsp)
+ vmovups %xmm12, 160(%rsp)
+ vmovups %xmm13, 176(%rsp)
+ vmovups %xmm14, 192(%rsp)
+ vmovups %xmm15, 208(%rsp)
+
+ movq OLD_M, M
+ movq OLD_N, N
+ movq OLD_A, A
+ movq OLD_LDA, LDA
+ movq OLD_X, X
+#else
+ movq OLD_M, M
+ movq OLD_N, N
+ movq OLD_A, A
+ movq OLD_LDA, LDA
+#endif
+
+ movq STACK_INCX, INCX
+ movq STACK_Y, Y
+ movq STACK_INCY, INCY
+ movq STACK_BUFFER, BUFFER
+
+#ifndef WINDOWS_ABI
+ vmovsd %xmm0, ALPHA
+#else
+ vmovsd %xmm3, ALPHA
+#endif
+
+ leaq -1(INCY), %rax
+
+ leaq (,INCX, SIZE), INCX
+ leaq (,INCY, SIZE), INCY
+ leaq (,LDA, SIZE), LDA
+
+ leaq (LDA, LDA, 2), LDA3
+
+ subq $-16 * SIZE, A
+
+#ifdef ALIGNED_ACCESS
+ leaq -1 (M), MM
+ testq $SIZE, A
+ cmoveq M, MM
+#endif
+
+ testq N, N # if n <= 0 goto END
+ jle .L999
+ testq M, M # if m <= 0 goto END
+ jle .L999
+
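+/* If Y can be used in place (incy == 1 and, unless NOCOPY_UNALIGNED is set, */
+/* Y is 16-byte aligned), point BUFFER at Y and skip the clearing loop. */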
+#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS)
+#ifndef NOCOPY_UNALIGNED
+ movq Y, Y1
+ andq $0xf, Y1
+ orq Y1, %rax
+#endif
+ testq %rax, %rax
+ cmoveq Y, BUFFER
+ je .L10
+#endif
+
+ movq BUFFER, Y1
+
+ vxorpd %xmm4, %xmm4, %xmm4
+
+ movq M, %rax
+ addq $16, %rax
+ sarq $4, %rax
+ ALIGN_3
+
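+/* Clear the accumulation buffer, 16 doubles per iteration. */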
+.L01:
+ vmovups %xmm4, 0 * SIZE(Y1)
+ vmovups %xmm4, 2 * SIZE(Y1)
+ vmovups %xmm4, 4 * SIZE(Y1)
+ vmovups %xmm4, 6 * SIZE(Y1)
+ vmovups %xmm4, 8 * SIZE(Y1)
+ vmovups %xmm4, 10 * SIZE(Y1)
+ vmovups %xmm4, 12 * SIZE(Y1)
+ vmovups %xmm4, 14 * SIZE(Y1)
+ subq $-16 * SIZE, Y1
+ decq %rax
+ jg .L01
+ ALIGN_3
+
+.L10:
+
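+/* Match BUFFER's 16-byte (mis)alignment to that of A, so the one-element */
+/* head peel below realigns both at once. If LDA is an odd number of */
+/* doubles the columns alternate alignment; that case takes the shufpd */
+/* path at .L50. */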
+#ifdef ALIGNED_ACCESS
+ leaq SIZE(BUFFER), %rax
+ testq $SIZE, A
+ cmovne %rax, BUFFER
+
+ testq $SIZE, LDA
+ jne .L50
+#endif
+
+#if GEMV_UNROLL >= 8
+
+ cmpq $8, N
+ jl .L20
+ ALIGN_3
+
+.L11:
+ subq $8, N
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+ leaq (A, LDA, 4), A2
+ leaq (A, LDA, 8), A
+
+ vmovddup (X), %xmm8
+ addq INCX, X
+ vmovddup (X), %xmm9
+ addq INCX, X
+ vmovddup (X), %xmm10
+ addq INCX, X
+ vmovddup (X), %xmm11
+ addq INCX, X
+ vmovddup (X), %xmm12
+ addq INCX, X
+ vmovddup (X), %xmm13
+ addq INCX, X
+ vmovddup (X), %xmm14
+ addq INCX, X
+ vmovddup (X), %xmm15
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm8 , %xmm8
+ vmulpd %xmm0, %xmm9 , %xmm9
+ vmulpd %xmm0, %xmm10 , %xmm10
+ vmulpd %xmm0, %xmm11 , %xmm11
+ vmulpd %xmm0, %xmm12 , %xmm12
+ vmulpd %xmm0, %xmm13 , %xmm13
+ vmulpd %xmm0, %xmm14 , %xmm14
+ vmulpd %xmm0, %xmm15 , %xmm15
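+
+/* xmm8..xmm15 now hold alpha * x[j], broadcast, for the eight active columns. */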
+
+#ifdef ALIGNED_ACCESS
+ testq $SIZE, A
+ je .L1X
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(A1, LDA), %xmm5
+ vmovsd -16 * SIZE(A1, LDA, 2), %xmm6
+ vmovsd -16 * SIZE(A1, LDA3), %xmm7
+
+
+ vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0
+ vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0
+
+ vmovsd -16 * SIZE(A2), %xmm4
+ vmovsd -16 * SIZE(A2, LDA), %xmm5
+ vmovsd -16 * SIZE(A2, LDA, 2), %xmm6
+ vmovsd -16 * SIZE(A2, LDA3), %xmm7
+
+ vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0
+ vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, A2
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L1X:
+#endif
+
+ movq MM, I
+ sarq $3, I
+ jle .L15
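+
+/* Main loop: 8 rows of Y per iteration across the 8 columns, with */
+/* prefetchnta issued ahead of each column of A and of Y. */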
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm5)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm6)
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm7)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
+
+ decq I
+ jle .L14
+ ALIGN_3
+
+.L13:
+
+
+ vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
+ prefetchnta A_PRE(A1,LDA,1)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)
+ prefetchnta A_PRE(A1,LDA,2)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5)
+ prefetchnta A_PRE(A1,LDA3,1)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+ prefetchnta A_PRE(A2)
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm6)
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
+ prefetchnta A_PRE(A2,LDA,1)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm13, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)
+ prefetchnta A_PRE(A2,LDA,2)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5)
+ prefetchnta A_PRE(A2,LDA3,1)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7)
+
+
+ vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3
+
+ VMOVUPS_A1( -8 * SIZE, A1, %xmm4)
+ VMOVUPS_A1( -6 * SIZE, A1, %xmm5)
+ prefetchnta A_PRE(A1)
+ VMOVUPS_A1( -4 * SIZE, A1, %xmm6)
+ VMOVUPS_A1( -2 * SIZE, A1, %xmm7)
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
+
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
+ prefetchnta A_PRE(Y1)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L13
+ ALIGN_3
+
+.L14:
+ vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm8 , %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm8 , %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm9 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm9 , %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm9 , %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm9 , %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm10, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm10, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm11, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm11, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm11, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm11, %xmm7 , %xmm3
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm6)
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm12, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm12, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm13, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm13, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm13, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm13, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm14, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm14, %xmm7 , %xmm3
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm15, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm15, %xmm5 , %xmm1
+ vfmaddpd %xmm2 , %xmm15, %xmm6 , %xmm2
+ vfmaddpd %xmm3 , %xmm15, %xmm7 , %xmm3
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
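+/* Tail: handle the remaining rows in chunks of 4, 2 and finally 1. */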
+.L15:
+ testq $4, MM
+ je .L16
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm5)
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
+
+ vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm8 , %xmm5 , %xmm1
+ vfmaddpd %xmm0 , %xmm9 , %xmm6 , %xmm0
+ vfmaddpd %xmm1 , %xmm9 , %xmm7 , %xmm1
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5)
+ VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm10, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm10, %xmm5 , %xmm1
+ vfmaddpd %xmm0 , %xmm11, %xmm6 , %xmm0
+ vfmaddpd %xmm1 , %xmm11, %xmm7 , %xmm1
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm12, %xmm5 , %xmm1
+ vfmaddpd %xmm0 , %xmm13, %xmm6 , %xmm0
+ vfmaddpd %xmm1 , %xmm13, %xmm7 , %xmm1
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5)
+ VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm14, %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm14, %xmm5 , %xmm1
+ vfmaddpd %xmm0 , %xmm15, %xmm6 , %xmm0
+ vfmaddpd %xmm1 , %xmm15, %xmm7 , %xmm1
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, A2
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L16:
+ testq $2, MM
+ je .L17
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5)
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6)
+ VMOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm8 , %xmm4 , %xmm0
+ vfmaddpd %xmm0 , %xmm9 , %xmm5 , %xmm0
+ vfmaddpd %xmm0 , %xmm10, %xmm6 , %xmm0
+ vfmaddpd %xmm0 , %xmm11, %xmm7 , %xmm0
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5)
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6)
+ VMOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7)
+
+ vfmaddpd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddpd %xmm0 , %xmm13, %xmm5 , %xmm0
+ vfmaddpd %xmm0 , %xmm14, %xmm6 , %xmm0
+ vfmaddpd %xmm0 , %xmm15, %xmm7 , %xmm0
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, A2
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L17:
+ testq $1, MM
+ je .L18
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(A1, LDA), %xmm5
+ vmovsd -16 * SIZE(A1, LDA, 2), %xmm6
+ vmovsd -16 * SIZE(A1, LDA3), %xmm7
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+
+ vfmaddsd %xmm0 , %xmm8 , %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm9 , %xmm5 , %xmm0
+ vfmaddsd %xmm0 , %xmm10, %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm11, %xmm7 , %xmm0
+
+ vmovsd -16 * SIZE(A2), %xmm4
+ vmovsd -16 * SIZE(A2, LDA), %xmm5
+ vmovsd -16 * SIZE(A2, LDA, 2), %xmm6
+ vmovsd -16 * SIZE(A2, LDA3), %xmm7
+
+ vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0
+ vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0
+
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+ ALIGN_3
+
+.L18:
+ cmpq $8, N
+ jge .L11
+ ALIGN_3
+
+.L20:
+#endif
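+
+/* The column blocks below repeat the same scheme for 4, 2 and 1 columns. */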
+
+#if GEMV_UNROLL >= 4
+
+ cmpq $4, N
+ jl .L30
+
+#if GEMV_UNROLL == 4
+ ALIGN_3
+
+.L21:
+#endif
+
+ subq $4, N
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+ leaq (A, LDA, 2), A2
+ leaq (A, LDA, 4), A
+
+ vmovddup (X), %xmm12
+ addq INCX, X
+ vmovddup (X), %xmm13
+ addq INCX, X
+ vmovddup (X), %xmm14
+ addq INCX, X
+ vmovddup (X), %xmm15
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm12 , %xmm12
+ vmulpd %xmm0, %xmm13 , %xmm13
+ vmulpd %xmm0, %xmm14 , %xmm14
+ vmulpd %xmm0, %xmm15 , %xmm15
+
+#ifdef ALIGNED_ACCESS
+ testq $SIZE, A
+ je .L2X
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(A1, LDA), %xmm5
+ vmovsd -16 * SIZE(A2), %xmm6
+ vmovsd -16 * SIZE(A2, LDA), %xmm7
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0
+ vfmaddsd %xmm0 , %xmm14, %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm15, %xmm7 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, A2
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L2X:
+#endif
+
+ movq MM, I
+ sarq $3, I
+ jle .L25
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm2)
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm3)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7)
+
+ decq I
+ jle .L24
+ ALIGN_3
+
+.L23:
+
+
+
+ vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm1)
+ prefetchnta A_PRE(A2)
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm2)
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm3)
+
+ vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9
+ vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10
+ vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
+ prefetchnta A_PRE(A2, LDA, 1)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
+
+ vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11
+
+ VMOVUPS_A1( -8 * SIZE, A1, %xmm0)
+ VMOVUPS_A1( -6 * SIZE, A1, %xmm1)
+ prefetchnta A_PRE(A1)
+ VMOVUPS_A1( -4 * SIZE, A1, %xmm2)
+ VMOVUPS_A1( -2 * SIZE, A1, %xmm3)
+
+ vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9
+ vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10
+ vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11
+
+ VMOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4)
+ VMOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5)
+ prefetchnta A_PRE(A1, LDA, 1)
+ VMOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6)
+ VMOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7)
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
+ prefetchnta A_PRE(Y1)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L23
+ ALIGN_3
+
+.L24:
+
+ vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm12, %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm12, %xmm3 , %xmm11
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm1)
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm2)
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm3)
+
+ vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9
+ vfmaddpd %xmm10, %xmm13, %xmm6 , %xmm10
+ vfmaddpd %xmm11, %xmm13, %xmm7 , %xmm11
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
+ VMOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6)
+ VMOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7)
+
+ vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm14, %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm14, %xmm3 , %xmm11
+
+ vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9
+ vfmaddpd %xmm10, %xmm15, %xmm6 , %xmm10
+ vfmaddpd %xmm11, %xmm15, %xmm7 , %xmm11
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
+.L25:
+ testq $4, MM
+ je .L26
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+
+ vfmaddpd %xmm8 , %xmm12, %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12, %xmm1 , %xmm9
+
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5)
+
+ vfmaddpd %xmm8 , %xmm13, %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm13, %xmm5 , %xmm9
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm1)
+
+ vfmaddpd %xmm8 , %xmm14, %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm14, %xmm1 , %xmm9
+
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4)
+ VMOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5)
+
+ vfmaddpd %xmm8 , %xmm15, %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm15, %xmm5 , %xmm9
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, A2
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L26:
+ testq $2, MM
+ je .L27
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm8)
+ VMOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9)
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm10)
+ VMOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm12, %xmm8 , %xmm0
+ vfmaddpd %xmm0 , %xmm13, %xmm9 , %xmm0
+ vfmaddpd %xmm0 , %xmm14, %xmm10, %xmm0
+ vfmaddpd %xmm0 , %xmm15, %xmm11, %xmm0
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, A2
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L27:
+ testq $1, MM
+#if GEMV_UNROLL == 4
+ je .L28
+#else
+ je .L30
+#endif
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vmovsd -16 * SIZE(A1), %xmm8
+ vmovsd -16 * SIZE(A1, LDA), %xmm9
+ vmovsd -16 * SIZE(A2), %xmm10
+ vmovsd -16 * SIZE(A2, LDA), %xmm11
+
+ vfmaddsd %xmm0 , %xmm12, %xmm8 , %xmm0
+ vfmaddsd %xmm0 , %xmm13, %xmm9 , %xmm0
+ vfmaddsd %xmm0 , %xmm14, %xmm10, %xmm0
+ vfmaddsd %xmm0 , %xmm15, %xmm11, %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+ ALIGN_3
+
+#if GEMV_UNROLL == 4
+.L28:
+ cmpq $4, N
+ jge .L21
+ ALIGN_3
+
+#endif
+
+.L30:
+#endif
+
+#if GEMV_UNROLL >= 2
+
+ cmpq $2, N
+ jl .L40
+
+#if GEMV_UNROLL == 2
+ ALIGN_3
+
+.L31:
+#endif
+
+ subq $2, N
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+ leaq (A, LDA), A2
+ leaq (A, LDA, 2), A
+
+ vmovddup (X), %xmm12
+ addq INCX, X
+ vmovddup (X), %xmm13
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm12 , %xmm12
+ vmulpd %xmm0, %xmm13 , %xmm13
+
+#ifdef ALIGNED_ACCESS
+ testq $SIZE, A
+ je .L3X
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(A2), %xmm5
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vfmaddsd %xmm0 , %xmm12, %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13, %xmm5 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, A2
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L3X:
+#endif
+
+ movq MM, I
+ sarq $3, I
+ jle .L35
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm2)
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm3)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm6)
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm7)
+
+ decq I
+ jle .L34
+ ALIGN_3
+
+.L33:
+
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11
+
+ vmovups -8 * SIZE(A1), %xmm0
+ vmovups -6 * SIZE(A1), %xmm1
+ prefetchnta A_PRE(A1)
+ vmovups -4 * SIZE(A1), %xmm2
+ vmovups -2 * SIZE(A1), %xmm3
+
+
+ vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9
+ prefetchnta A_PRE(A2)
+ vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10
+ vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11
+
+ vmovups -8 * SIZE(A2), %xmm4
+ vmovups -6 * SIZE(A2), %xmm5
+ vmovups -4 * SIZE(A2), %xmm6
+ vmovups -2 * SIZE(A2) , %xmm7
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
+ prefetchnta A_PRE(Y1)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L33
+ ALIGN_3
+
+.L34:
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11
+
+ vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9
+ vfmaddpd %xmm10, %xmm13 , %xmm6 , %xmm10
+ vfmaddpd %xmm11, %xmm13 , %xmm7 , %xmm11
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
+.L35:
+ testq $4, MM
+ je .L36
+
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+
+ vfmaddpd %xmm8 , %xmm13 , %xmm4 , %xmm8
+ vfmaddpd %xmm9 , %xmm13 , %xmm5 , %xmm9
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, A2
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L36:
+ testq $2, MM
+ je .L37
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm8)
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm9)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0
+ vfmaddpd %xmm0 , %xmm13 , %xmm9 , %xmm0
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, A2
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L37:
+ testq $1, MM
+#if GEMV_UNROLL == 2
+ je .L38
+#else
+ je .L40
+#endif
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vmovsd -16 * SIZE(A1), %xmm8
+ vmovsd -16 * SIZE(A2), %xmm9
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0
+ vfmaddsd %xmm0 , %xmm13 , %xmm9 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+ ALIGN_3
+
+#if GEMV_UNROLL == 2
+.L38:
+ cmpq $2, N
+ jge .L31
+ ALIGN_3
+
+#endif
+
+.L40:
+ cmpq $1, N
+ jl .L900
+#endif
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+
+ vmovddup (X), %xmm12
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm12 , %xmm12
+
+#ifdef ALIGNED_ACCESS
+ testq $SIZE, A
+ je .L4X
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L4X:
+#endif
+
+ movq MM, I
+ sarq $3, I
+ jle .L45
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm2)
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm3)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
+
+ decq I
+ jle .L44
+ ALIGN_3
+
+.L43:
+
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11
+
+ VMOVUPS_A1( -8 * SIZE, A1, %xmm0)
+ VMOVUPS_A1( -6 * SIZE, A1, %xmm1)
+ VMOVUPS_A1( -4 * SIZE, A1, %xmm2)
+ VMOVUPS_A1( -2 * SIZE, A1, %xmm3)
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L43
+ ALIGN_3
+
+.L44:
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10
+ vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
+.L45:
+ testq $4, MM
+ je .L46
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L46:
+ testq $2, MM
+ je .L47
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm8)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L47:
+ testq $1, MM
+ je .L900
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+ vmovsd -16 * SIZE(A1), %xmm8
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+ ALIGN_3
+
+#ifdef ALIGNED_ACCESS
+ jmp .L900
+ ALIGN_3
+
+.L50:
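+
+/* ALIGNED_ACCESS path for LDA an odd number of doubles: even columns are */
+/* 16-byte aligned but odd columns are not, so odd-column data is loaded at */
+/* a one-element offset and stitched together with shufpd before the FMAs. */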
+#if GEMV_UNROLL >= 4
+
+ cmpq $4, N
+ jl .L60
+ ALIGN_3
+
+.L51:
+
+ subq $4, N
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+ leaq (A, LDA, 2), A2
+ leaq (A, LDA, 4), A
+
+ vmovddup (X), %xmm12
+ addq INCX, X
+ vmovddup (X), %xmm13
+ addq INCX, X
+ vmovddup (X), %xmm14
+ addq INCX, X
+ vmovddup (X), %xmm15
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm12 , %xmm12
+ vmulpd %xmm0, %xmm13 , %xmm13
+ vmulpd %xmm0, %xmm14 , %xmm14
+ vmulpd %xmm0, %xmm15 , %xmm15
+
+ testq $SIZE, A
+ je .L5X
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(A1, LDA), %xmm5
+ vmovsd -16 * SIZE(A2), %xmm6
+ vmovsd -16 * SIZE(A2, LDA), %xmm7
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0
+ vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm15 , %xmm7 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, A2
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L5X:
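+/* Preload the first element of each odd-aligned column into the high half */
+/* of xmm8 / xmm9; the shufpd sequences below rotate it into place. */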
+ movhpd -16 * SIZE(A1, LDA), %xmm8
+ movhpd -16 * SIZE(A2, LDA), %xmm9
+
+ movq MM, I
+ sarq $3, I
+ jle .L55
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm5)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm6)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
+
+ decq I
+ jle .L54
+ ALIGN_3
+
+.L53:
+
+
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm7)
+ vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1
+ VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4)
+
+ prefetchnta A_PRE(A1, LDA, 1)
+ vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2
+ VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5)
+ vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3
+ VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6)
+
+
+ shufpd $1, %xmm4, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
+ shufpd $1, %xmm5, %xmm4
+ vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+
+ prefetchnta A_PRE(A2)
+ shufpd $1, %xmm6, %xmm5
+ vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+ shufpd $1, %xmm8, %xmm6
+ vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm6)
+
+
+ vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm7)
+ vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1
+ VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4)
+
+ prefetchnta A_PRE(A2, LDA, 1)
+ vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2
+ VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5)
+ vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3
+ VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6)
+
+
+ shufpd $1, %xmm4, %xmm9
+ vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0
+ VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
+ shufpd $1, %xmm5, %xmm4
+ vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1
+ VMOVUPS_A1( -8 * SIZE, A1, %xmm4)
+
+ prefetchnta A_PRE(A1)
+ shufpd $1, %xmm6, %xmm5
+ vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2
+ VMOVUPS_A1( -6 * SIZE, A1, %xmm5)
+ shufpd $1, %xmm9, %xmm6
+ vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3
+ VMOVUPS_A1( -4 * SIZE, A1, %xmm6)
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
+ prefetchnta A_PRE(Y1)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L53
+ ALIGN_3
+
+
+.L54:
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm7)
+ vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1
+ VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4)
+
+ vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2
+ VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5)
+ vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3
+ VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6)
+
+ shufpd $1, %xmm4, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
+ shufpd $1, %xmm5, %xmm4
+ vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+
+ shufpd $1, %xmm6, %xmm5
+ vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+ shufpd $1, %xmm8, %xmm6
+ vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3
+ VMOVUPS_A1(-12 * SIZE, A2, %xmm6)
+
+ vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0
+ VMOVUPS_A1(-10 * SIZE, A2, %xmm7)
+ vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1
+ VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4)
+
+ vfmaddpd %xmm2 , %xmm14 , %xmm6 , %xmm2
+ VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5)
+ vfmaddpd %xmm3 , %xmm14 , %xmm7 , %xmm3
+ VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6)
+
+ shufpd $1, %xmm4, %xmm9
+ vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0
+ VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
+
+ shufpd $1, %xmm5, %xmm4
+ vfmaddpd %xmm1 , %xmm15 , %xmm4 , %xmm1
+ shufpd $1, %xmm6, %xmm5
+ vfmaddpd %xmm2 , %xmm15 , %xmm5 , %xmm2
+ shufpd $1, %xmm9, %xmm6
+ vfmaddpd %xmm3 , %xmm15 , %xmm6 , %xmm3
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
+.L55:
+ testq $4, MM
+ je .L56
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm5)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1
+
+ VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6)
+ VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7)
+
+ shufpd $1, %xmm6, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ movaps %xmm7, %xmm8
+ shufpd $1, %xmm7, %xmm6
+ vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1
+
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A2, %xmm5)
+
+ vfmaddpd %xmm0 , %xmm14 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm14 , %xmm5 , %xmm1
+
+ VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6)
+ VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7)
+
+ shufpd $1, %xmm6, %xmm9
+ vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0
+ movaps %xmm7, %xmm9
+ shufpd $1, %xmm7, %xmm6
+ vfmaddpd %xmm1 , %xmm15 , %xmm6 , %xmm1
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, A2
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L56:
+ testq $2, MM
+ je .L57
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
+ VMOVUPS_A1(-16 * SIZE, A2, %xmm6)
+ VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ shufpd $1, %xmm5, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ movaps %xmm5, %xmm8
+ vfmaddpd %xmm0 , %xmm14 , %xmm6 , %xmm0
+ shufpd $1, %xmm7, %xmm9
+ vfmaddpd %xmm0 , %xmm15 , %xmm9 , %xmm0
+ movaps %xmm7, %xmm9
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, A2
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L57:
+ testq $1, MM
+ je .L58
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ shufpd $1, %xmm8, %xmm8
+ vmovsd -16 * SIZE(A2), %xmm6
+ shufpd $1, %xmm9, %xmm9
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ vfmaddsd %xmm0 , %xmm14 , %xmm6 , %xmm0
+ vfmaddsd %xmm0 , %xmm15 , %xmm9 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+ ALIGN_3
+
+.L58:
+ cmpq $4, N
+ jge .L51
+ ALIGN_3
+
+.L60:
+#endif
+
+#if GEMV_UNROLL >= 2
+
+ cmpq $2, N
+ jl .L70
+
+#if GEMV_UNROLL == 2
+ ALIGN_3
+
+.L61:
+#endif
+
+ subq $2, N
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+ leaq (A, LDA), A2
+ leaq (A, LDA, 2), A
+
+ vmovddup (X), %xmm12
+ addq INCX, X
+ vmovddup (X), %xmm13
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm12 , %xmm12
+ vmulpd %xmm0, %xmm13 , %xmm13
+
+ testq $SIZE, A
+ je .L6X
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(A2), %xmm5
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13 , %xmm5 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, A2
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L6X:
+ movhpd -16 * SIZE(A2), %xmm8
+
+ movq MM, I
+ sarq $3, I
+ jle .L65
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm5)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm6)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm3)
+
+ decq I
+ jle .L64
+ ALIGN_3
+
+.L63:
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm7)
+ vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1
+ VMOVUPS_A1(-15 * SIZE, A2, %xmm4)
+
+ prefetchnta A_PRE(A2)
+ vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2
+ VMOVUPS_A1(-13 * SIZE, A2, %xmm5)
+ vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3
+ VMOVUPS_A1(-11 * SIZE, A2, %xmm6)
+
+
+ shufpd $1, %xmm4, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ VMOVUPS_A1( -9 * SIZE, A2, %xmm8)
+ shufpd $1, %xmm5, %xmm4
+ vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1
+ VMOVUPS_A1( -8 * SIZE, A1, %xmm4)
+
+ prefetchnta A_PRE(A1)
+ shufpd $1, %xmm6, %xmm5
+ vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2
+ VMOVUPS_A1( -6 * SIZE, A1, %xmm5)
+ shufpd $1, %xmm8, %xmm6
+ vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3
+ VMOVUPS_A1( -4 * SIZE, A1, %xmm6)
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm1)
+ prefetchnta A_PRE(Y1)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm2)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm3)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L63
+ ALIGN_3
+
+.L64:
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm7)
+ vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1
+ VMOVUPS_A1(-15 * SIZE, A2, %xmm4)
+
+ vfmaddpd %xmm2 , %xmm12 , %xmm6 , %xmm2
+ VMOVUPS_A1(-13 * SIZE, A2, %xmm5)
+ vfmaddpd %xmm3 , %xmm12 , %xmm7 , %xmm3
+ VMOVUPS_A1(-11 * SIZE, A2, %xmm6)
+
+ shufpd $0x01, %xmm4, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ VMOVUPS_A1( -9 * SIZE, A2, %xmm8)
+ shufpd $0x01, %xmm5, %xmm4
+ vfmaddpd %xmm1 , %xmm13 , %xmm4 , %xmm1
+
+ shufpd $0x01, %xmm6, %xmm5
+ vfmaddpd %xmm2 , %xmm13 , %xmm5 , %xmm2
+ shufpd $0x01, %xmm8, %xmm6
+ vfmaddpd %xmm3 , %xmm13 , %xmm6 , %xmm3
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm2)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm3)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, A2
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
+.L65:
+ testq $4, MM
+ je .L66
+
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm5)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm1)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ vfmaddpd %xmm1 , %xmm12 , %xmm5 , %xmm1
+
+ VMOVUPS_A1(-15 * SIZE, A2, %xmm6)
+ VMOVUPS_A1(-13 * SIZE, A2, %xmm7)
+
+ shufpd $0x01, %xmm6, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ movaps %xmm7, %xmm8
+ shufpd $0x01, %xmm7, %xmm6
+ vfmaddpd %xmm1 , %xmm13 , %xmm6 , %xmm1
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm1)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, A2
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L66:
+ testq $2, MM
+ je .L67
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm4)
+ VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ shufpd $0x01, %xmm5, %xmm8
+ vfmaddpd %xmm0 , %xmm13 , %xmm8 , %xmm0
+ movaps %xmm5, %xmm8
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, A2
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L67:
+ testq $1, MM
+#if GEMV_UNROLL == 2
+ je .L68
+#else
+ je .L70
+#endif
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vshufpd $0x01, %xmm8, %xmm8 , %xmm8
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0
+ vfmaddsd %xmm0 , %xmm13 , %xmm8 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+ ALIGN_3
+
+#if GEMV_UNROLL == 2
+.L68:
+ cmpq $2, N
+ jge .L61
+ ALIGN_3
+
+#endif
+
+.L70:
+ cmpq $1, N
+ jl .L900
+
+#endif
+
+ leaq 16 * SIZE(BUFFER), Y1
+ movq A, A1
+
+ vmovddup (X), %xmm12
+ addq INCX, X
+
+ vmovddup ALPHA, %xmm0
+
+ vmulpd %xmm0, %xmm12 , %xmm12
+
+ testq $SIZE, A
+ je .L7X
+
+ vmovsd -16 * SIZE(A1), %xmm4
+ vmovsd -16 * SIZE(Y1), %xmm0
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm4 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+
+ addq $SIZE, A1
+ addq $SIZE, Y1
+ ALIGN_3
+
+.L7X:
+
+ movq MM, I
+ sarq $3, I
+ jle .L75
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+ VMOVUPS_A1(-12 * SIZE, A1, %xmm2)
+ VMOVUPS_A1(-10 * SIZE, A1, %xmm3)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YL1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1(-10 * SIZE, Y1, %xmm11)
+
+ decq I
+ jle .L74
+ ALIGN_3
+
+.L73:
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ VMOVUPS_A1( -8 * SIZE, A1, %xmm0)
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ VMOVUPS_A1( -6 * SIZE, A1, %xmm1)
+
+ prefetchnta A_PRE(A1)
+ vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10
+ VMOVUPS_A1( -4 * SIZE, A1, %xmm2)
+ vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11
+ VMOVUPS_A1( -2 * SIZE, A1, %xmm3)
+
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ VMOVUPS_YL1( -8 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1( -6 * SIZE, Y1, %xmm9)
+ prefetchnta A_PRE(Y1)
+ VMOVUPS_YL1( -4 * SIZE, Y1, %xmm10)
+ VMOVUPS_YL1( -2 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, Y1
+
+ subq $1, I
+ BRANCH
+ jg .L73
+ ALIGN_3
+
+.L74:
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+ vfmaddpd %xmm10, %xmm12 , %xmm2 , %xmm10
+ VMOVUPS_YS1(-12 * SIZE, Y1, %xmm10)
+ vfmaddpd %xmm11, %xmm12 , %xmm3 , %xmm11
+ VMOVUPS_YS1(-10 * SIZE, Y1, %xmm11)
+
+ subq $-8 * SIZE, A1
+ subq $-8 * SIZE, Y1
+ ALIGN_3
+
+.L75:
+ testq $4, MM
+ je .L76
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm0)
+ VMOVUPS_A1(-14 * SIZE, A1, %xmm1)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm8)
+ VMOVUPS_YL1(-14 * SIZE, Y1, %xmm9)
+
+ vfmaddpd %xmm8 , %xmm12 , %xmm0 , %xmm8
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm8)
+ vfmaddpd %xmm9 , %xmm12 , %xmm1 , %xmm9
+ VMOVUPS_YS1(-14 * SIZE, Y1, %xmm9)
+
+ addq $4 * SIZE, A1
+ addq $4 * SIZE, Y1
+ ALIGN_3
+
+.L76:
+ testq $2, MM
+ je .L77
+
+ VMOVUPS_A1(-16 * SIZE, A1, %xmm8)
+
+ VMOVUPS_YL1(-16 * SIZE, Y1, %xmm0)
+
+ vfmaddpd %xmm0 , %xmm12 , %xmm8 , %xmm0
+
+ VMOVUPS_YS1(-16 * SIZE, Y1, %xmm0)
+
+ addq $2 * SIZE, A1
+ addq $2 * SIZE, Y1
+ ALIGN_3
+
+.L77:
+ testq $1, MM
+ je .L900
+
+ vmovsd -16 * SIZE(Y1), %xmm0
+ vmovsd -16 * SIZE(A1), %xmm8
+
+ vfmaddsd %xmm0 , %xmm12 , %xmm8 , %xmm0
+
+ vmovsd %xmm0, -16 * SIZE(Y1)
+#endif
+ ALIGN_3
+
+
+.L900:
+#ifndef COPY_FORCE
+ cmpq Y, BUFFER
+ je .L999
+#endif
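+
+/* Fold the accumulated buffer back into Y: contiguous incy == 1 below */
+/* (aligned and misaligned-buffer variants), strided incy at .L950. */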
+
+ cmpq $SIZE, INCY
+ jne .L950
+
+ testq $SIZE, Y
+ je .L910
+
+ vmovsd (Y), %xmm0
+ addsd (BUFFER), %xmm0
+ vmovsd %xmm0, (Y)
+
+ addq $SIZE, Y
+ addq $SIZE, BUFFER
+
+ decq M
+ jle .L999
+ ALIGN_4
+
+.L910:
+ testq $SIZE, BUFFER
+ jne .L920
+
+ movq M, %rax
+ sarq $3, %rax
+ jle .L914
+ ALIGN_3
+
+.L912:
+
+ vmovups 0 * SIZE(Y), %xmm0
+ vmovups 2 * SIZE(Y), %xmm1
+ vmovups 4 * SIZE(Y), %xmm2
+ vmovups 6 * SIZE(Y), %xmm3
+
+ vmovups 0 * SIZE(BUFFER), %xmm4
+ vmovups 2 * SIZE(BUFFER), %xmm5
+ vmovups 4 * SIZE(BUFFER), %xmm6
+ vmovups 6 * SIZE(BUFFER), %xmm7
+
+
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+ addpd %xmm6, %xmm2
+ addpd %xmm7, %xmm3
+
+ vmovups %xmm0, 0 * SIZE(Y)
+ vmovups %xmm1, 2 * SIZE(Y)
+ vmovups %xmm2, 4 * SIZE(Y)
+ vmovups %xmm3, 6 * SIZE(Y)
+
+ addq $8 * SIZE, Y
+ addq $8 * SIZE, BUFFER
+
+ decq %rax
+ jg .L912
+ ALIGN_3
+
+.L914:
+ testq $7, M
+ jle .L999
+
+ testq $4, M
+ jle .L915
+
+ vmovups 0 * SIZE(Y), %xmm0
+ vmovups 2 * SIZE(Y), %xmm1
+
+ vmovups 0 * SIZE(BUFFER), %xmm4
+ vmovups 2 * SIZE(BUFFER), %xmm5
+
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+
+ vmovups %xmm0, 0 * SIZE(Y)
+ vmovups %xmm1, 2 * SIZE(Y)
+
+ addq $4 * SIZE, Y
+ addq $4 * SIZE, BUFFER
+ ALIGN_3
+
+.L915:
+ testq $2, M
+ jle .L916
+
+ vmovups (Y), %xmm0
+
+ vmovups (BUFFER), %xmm4
+
+ addpd %xmm4, %xmm0
+
+ vmovups %xmm0, (Y)
+
+ addq $2 * SIZE, Y
+ addq $2 * SIZE, BUFFER
+ ALIGN_3
+
+.L916:
+ testq $1, M
+ jle .L999
+
+ vmovsd (Y), %xmm0
+
+ vmovsd 0 * SIZE(BUFFER), %xmm4
+
+ addsd %xmm4, %xmm0
+
+ vmovsd %xmm0, (Y)
+ ALIGN_3
+
+ jmp .L999
+ ALIGN_4
+
+.L920:
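+/* BUFFER is offset by one double relative to Y: re-pair elements with */
+/* shufpd while adding them into Y. */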
+ vmovups -1 * SIZE(BUFFER), %xmm4
+
+ movq M, %rax
+ sarq $3, %rax
+ jle .L924
+ ALIGN_3
+
+.L922:
+
+ vmovups 0 * SIZE(Y), %xmm0
+ vmovups 2 * SIZE(Y), %xmm1
+ vmovups 4 * SIZE(Y), %xmm2
+ vmovups 6 * SIZE(Y), %xmm3
+
+ vmovups 1 * SIZE(BUFFER), %xmm5
+ vmovups 3 * SIZE(BUFFER), %xmm6
+ vmovups 5 * SIZE(BUFFER), %xmm7
+ vmovups 7 * SIZE(BUFFER), %xmm8
+
+ shufpd $0x01, %xmm5, %xmm4
+ shufpd $0x01, %xmm6, %xmm5
+ shufpd $0x01, %xmm7, %xmm6
+ shufpd $0x01, %xmm8, %xmm7
+
+
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+ addpd %xmm6, %xmm2
+ addpd %xmm7, %xmm3
+
+ vmovups %xmm0, 0 * SIZE(Y)
+ vmovups %xmm1, 2 * SIZE(Y)
+ vmovups %xmm2, 4 * SIZE(Y)
+ vmovups %xmm3, 6 * SIZE(Y)
+
+ vmovups %xmm8, %xmm4
+
+ addq $8 * SIZE, Y
+ addq $8 * SIZE, BUFFER
+
+ decq %rax
+ jg .L922
+ ALIGN_3
+
+.L924:
+ testq $7, M
+ jle .L999
+
+ testq $4, M
+ jle .L925
+
+ vmovups 0 * SIZE(Y), %xmm0
+ vmovups 2 * SIZE(Y), %xmm1
+
+ vmovups 1 * SIZE(BUFFER), %xmm5
+ vmovups 3 * SIZE(BUFFER), %xmm6
+
+ shufpd $0x01, %xmm5, %xmm4
+ shufpd $0x01, %xmm6, %xmm5
+
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+
+ vmovups %xmm0, 0 * SIZE(Y)
+ vmovups %xmm1, 2 * SIZE(Y)
+
+ vmovups %xmm6, %xmm4
+
+ addq $4 * SIZE, Y
+ addq $4 * SIZE, BUFFER
+ ALIGN_3
+
+.L925:
+ testq $2, M
+ jle .L926
+
+ vmovups (Y), %xmm0
+
+ vmovups 1 * SIZE(BUFFER), %xmm5
+
+ shufpd $0x01, %xmm5, %xmm4
+
+ addpd %xmm4, %xmm0
+
+ vmovups %xmm0, (Y)
+
+ movaps %xmm5, %xmm4
+
+ addq $2 * SIZE, Y
+ addq $2 * SIZE, BUFFER
+ ALIGN_3
+
+.L926:
+ testq $1, M
+ jle .L999
+
+ vmovsd (Y), %xmm0
+
+ vshufpd $0x01, %xmm4 ,%xmm4, %xmm4
+
+ addsd %xmm4, %xmm0
+
+ vmovsd %xmm0, (Y)
+ ALIGN_3
+
+ jmp .L999
+ ALIGN_4
+
+.L950:
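+/* Strided Y (incy != 1): gather and scatter two elements at a time with */
+/* movsd / movhpd. */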
+ testq $SIZE, BUFFER
+ je .L960
+
+ vmovsd (Y), %xmm0
+ addsd (BUFFER), %xmm0
+ vmovsd %xmm0, (Y)
+
+ addq INCY, Y
+ addq $SIZE, BUFFER
+
+ decq M
+ jle .L999
+ ALIGN_4
+
+.L960:
+ movq Y, Y1
+
+ movq M, %rax
+ sarq $3, %rax
+ jle .L964
+ ALIGN_3
+
+.L962:
+ vmovsd (Y), %xmm0
+ addq INCY, Y
+ movhpd (Y), %xmm0
+ addq INCY, Y
+
+ vmovups 0 * SIZE(BUFFER), %xmm4
+
+ vmovsd (Y), %xmm1
+ addq INCY, Y
+ movhpd (Y), %xmm1
+ addq INCY, Y
+
+ vmovups 2 * SIZE(BUFFER), %xmm5
+
+ vmovsd (Y), %xmm2
+ addq INCY, Y
+ movhpd (Y), %xmm2
+ addq INCY, Y
+
+ vmovups 4 * SIZE(BUFFER), %xmm6
+
+ addpd %xmm4, %xmm0
+
+ vmovsd (Y), %xmm3
+ addq INCY, Y
+ movhpd (Y), %xmm3
+ addq INCY, Y
+
+ vmovups 6 * SIZE(BUFFER), %xmm7
+
+ addpd %xmm5, %xmm1
+
+ vmovsd %xmm0, (Y1)
+ addq INCY, Y1
+ movhpd %xmm0, (Y1)
+ addq INCY, Y1
+
+ addpd %xmm6, %xmm2
+
+ vmovsd %xmm1, (Y1)
+ addq INCY, Y1
+ movhpd %xmm1, (Y1)
+ addq INCY, Y1
+
+ addpd %xmm7, %xmm3
+
+ vmovsd %xmm2, (Y1)
+ addq INCY, Y1
+ movhpd %xmm2, (Y1)
+ addq INCY, Y1
+ vmovsd %xmm3, (Y1)
+ addq INCY, Y1
+ movhpd %xmm3, (Y1)
+ addq INCY, Y1
+
+ addq $8 * SIZE, BUFFER
+ decq %rax
+ jg .L962
+ ALIGN_3
+
+.L964:
+ testq $7, M
+ jle .L999
+
+ testq $4, M
+ jle .L965
+
+ vmovsd (Y), %xmm0
+ addq INCY, Y
+ movhpd (Y), %xmm0
+ addq INCY, Y
+
+ vmovups 0 * SIZE(BUFFER), %xmm4
+
+ vmovsd (Y), %xmm1
+ addq INCY, Y
+ movhpd (Y), %xmm1
+ addq INCY, Y
+
+ vmovups 2 * SIZE(BUFFER), %xmm5
+
+ addpd %xmm4, %xmm0
+ addpd %xmm5, %xmm1
+
+ vmovsd %xmm0, (Y1)
+ addq INCY, Y1
+ movhpd %xmm0, (Y1)
+ addq INCY, Y1
+ vmovsd %xmm1, (Y1)
+ addq INCY, Y1
+ movhpd %xmm1, (Y1)
+ addq INCY, Y1
+
+ addq $4 * SIZE, BUFFER
+ ALIGN_3
+
+.L965:
+ testq $2, M
+ jle .L966
+
+ vmovsd (Y), %xmm0
+ addq INCY, Y
+ movhpd (Y), %xmm0
+ addq INCY, Y
+
+ vmovups 0 * SIZE(BUFFER), %xmm4
+
+ addpd %xmm4, %xmm0
+
+ vmovsd %xmm0, (Y1)
+ addq INCY, Y1
+ movhpd %xmm0, (Y1)
+ addq INCY, Y1
+
+ addq $2 * SIZE, BUFFER
+ ALIGN_3
+
+.L966:
+ testq $1, M
+ jle .L999
+
+ vmovsd (Y), %xmm0
+
+ vmovsd 0 * SIZE(BUFFER), %xmm4
+
+ addsd %xmm4, %xmm0
+
+ vmovsd %xmm0, (Y1)
+ ALIGN_3
+
+.L999:
+ movq 0(%rsp), %rbx
+ movq 8(%rsp), %rbp
+ movq 16(%rsp), %r12
+ movq 24(%rsp), %r13
+ movq 32(%rsp), %r14
+ movq 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ vmovups 64(%rsp), %xmm6
+ vmovups 80(%rsp), %xmm7
+ vmovups 96(%rsp), %xmm8
+ vmovups 112(%rsp), %xmm9
+ vmovups 128(%rsp), %xmm10
+ vmovups 144(%rsp), %xmm11
+ vmovups 160(%rsp), %xmm12
+ vmovups 176(%rsp), %xmm13
+ vmovups 192(%rsp), %xmm14
+ vmovups 208(%rsp), %xmm15
+#endif
+
+ addq $STACKSIZE, %rsp
+
+ ret
+ EPILOGUE