#ifdef SMP
nthreads = num_cpu_avail(1);
+ //Disable multi-threading when incx == 0 or incy == 0:
+ //all threads would access the same x/y element, making them data-dependent.
+ if (incx == 0 || incy == 0)
+ nthreads = 1;
+
if (nthreads == 1) {
#endif
#ifdef SMP
nthreads = num_cpu_avail(1);
+ //Disable multi-threading when incx == 0 or incy == 0:
+ //all threads would access the same x/y element, making them data-dependent.
+ if (incx == 0 || incy == 0)
+ nthreads = 1;
+
if (nthreads == 1) {
#endif
.L50:
movq M, %rax
movq Y, YY
+//If incx==0 || incy==0, skip the unrolled loop (its stride assumptions do not hold).
+ cmpq $0, INCX
+ je .L56
+ cmpq $0, INCY
+ je .L56
+
sarq $3, %rax
jle .L55
ALIGN_3
.L40:
movq Y, YY
movq M, %rax
+//If incx==0 || incy==0, skip the unrolled loop (its stride assumptions do not hold).
+ cmpq $0, INCX
+ je .L46
+ cmpq $0, INCY
+ je .L46
+
sarq $3, %rax
jle .L45
ALIGN_3
unpcklps %xmm13, %xmm15
#endif
+//If incx==0 || incy==0, skip the unrolled loop and jump to the scalar path at the end.
+ cmpq $0, INCX
+ je .L200
+ cmpq $0, INCY
+ je .L200
+
movq Y, YY
movq M, %rax
addps %xmm1, %xmm8
movsd %xmm8, (Y)
+ jmp .L999
ALIGN_3
+
+.L200:
+ movq M, %rax
+ cmpq $0, %rax
+ jle .L999
+ ALIGN_3
+
+.L201:
+ movsd (X), %xmm0
+ addq INCX, X
+
+#ifdef HAVE_SSE3
+ movshdup %xmm0, %xmm1
+ movsldup %xmm0, %xmm0
+#else
+ pshufd $0xf5, %xmm0, %xmm1
+ shufps $0xa0, %xmm0, %xmm0
+#endif
+
+ mulps %xmm14, %xmm0
+ mulps %xmm15, %xmm1
+
+ movsd (Y), %xmm8
+
+ addps %xmm0, %xmm8
+ addps %xmm1, %xmm8
+ movsd %xmm8, (Y)
+ addq INCY, Y
+
+ decq %rax
+ jg .L201
+ ALIGN_3
+
.L999:
xorq %rax, %rax
movq Y, YY
movq M, %rax
+//If incx==0 || incy==0, skip the unrolled loop and jump to the scalar path at the end.
+ cmpq $0, INCX
+ je .L58
+ cmpq $0, INCY
+ je .L58
+
sarq $3, %rax
jle .L55
andq $1, %rax
jle .L999
+.L58:
MOVDDUP( 0 * SIZE, X, %xmm0)
MOVDDUP( 1 * SIZE, X, %xmm1)
movlpd %xmm8, 0 * SIZE(YY)
movhpd %xmm8, 1 * SIZE(YY)
+
+ decq %rax
+ jg .L58
ALIGN_3
.L999: