;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
- lea lenq, [lend*4 - 2*mmsize]
+ lea lenq, [lend*4 - 64]
ALIGN 16
.loop:
- mova m0, [src0q + lenq]
- mova m1, [src0q + lenq + mmsize]
- mulps m0, m0, [src1q + lenq]
- mulps m1, m1, [src1q + lenq + mmsize]
- mova [dstq + lenq], m0
- mova [dstq + lenq + mmsize], m1
+%assign a 0
+%rep 32/mmsize
+ mova m0, [src0q + lenq + (a+0)*mmsize]
+ mova m1, [src0q + lenq + (a+1)*mmsize]
+ mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
+ mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
+ mova [dstq + lenq + (a+0)*mmsize], m0
+ mova [dstq + lenq + (a+1)*mmsize], m1
+%assign a a+2
+%endrep
- sub lenq, 2*mmsize
+ sub lenq, 64
jge .loop
REP_RET
%endmacro
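+
+; For reference, a minimal C sketch of what vector_fmul computes (the name
+; vector_fmul_c is illustrative). Since the unrolled loop now consumes a
+; fixed 64 bytes per iteration, len is assumed to be a multiple of 16 floats:
+;
+;   static void vector_fmul_c(float *dst, const float *src0,
+;                             const float *src1, int len)
+;   {
+;       for (int i = 0; i < len; i++)
+;           dst[i] = src0[i] * src1[i];  // elementwise product
+;   }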
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
-cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
-cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
- mova xmm0, xmm2
+ SWAP 0, 2
%endif
- shufps xmm0, xmm0, 0
+ shufps xm0, xm0, 0
%if cpuflag(avx)
- vinsertf128 m0, m0, xmm0, 1
+ vinsertf128 m0, m0, xm0, 1
%endif
%endif
- lea lenq, [lend*4-2*mmsize]
+ lea lenq, [lend*4-64]
.loop:
- mulps m1, m0, [srcq+lenq ]
- mulps m2, m0, [srcq+lenq+mmsize]
- addps m1, m1, [dstq+lenq ]
- addps m2, m2, [dstq+lenq+mmsize]
- mova [dstq+lenq ], m1
- mova [dstq+lenq+mmsize], m2
- sub lenq, 2*mmsize
+%if cpuflag(fma3)
+ mova m1, [dstq+lenq]
+ mova m2, [dstq+lenq+1*mmsize]
+ fmaddps m1, m0, [srcq+lenq], m1
+ fmaddps m2, m0, [srcq+lenq+1*mmsize], m2
+%else ; cpuflag
+ mulps m1, m0, [srcq+lenq]
+ mulps m2, m0, [srcq+lenq+1*mmsize]
+%if mmsize < 32
+ mulps m3, m0, [srcq+lenq+2*mmsize]
+ mulps m4, m0, [srcq+lenq+3*mmsize]
+%endif ; mmsize
+ addps m1, m1, [dstq+lenq]
+ addps m2, m2, [dstq+lenq+1*mmsize]
+%if mmsize < 32
+ addps m3, m3, [dstq+lenq+2*mmsize]
+ addps m4, m4, [dstq+lenq+3*mmsize]
+%endif ; mmsize
+%endif ; cpuflag
+ mova [dstq+lenq], m1
+ mova [dstq+lenq+1*mmsize], m2
+%if mmsize < 32
+ mova [dstq+lenq+2*mmsize], m3
+ mova [dstq+lenq+3*mmsize], m4
+%endif ; mmsize
+ sub lenq, 64
jge .loop
REP_RET
%endmacro
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMAC_SCALAR
+%endif
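+
+; For reference, a minimal C sketch of the fmac semantics (illustrative
+; name). The FMA3 path fuses each mulps/addps pair into one fmaddps, while
+; the non-FMA xmm path unrolls to four registers, so every variant retires
+; 64 bytes (16 floats) per iteration:
+;
+;   static void vector_fmac_scalar_c(float *dst, const float *src,
+;                                    float mul, int len)
+;   {
+;       for (int i = 0; i < len; i++)
+;           dst[i] += src[i] * mul;  // multiply-accumulate into dst
+;   }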
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
VBROADCASTSD m0, mulm
%else
%if WIN64
- movlhps xmm2, xmm2
-%if cpuflag(avx)
- vinsertf128 ymm2, ymm2, xmm2, 1
-%endif
SWAP 0, 2
-%else
- movlhps xmm0, xmm0
-%if cpuflag(avx)
- vinsertf128 ymm0, ymm0, xmm0, 1
%endif
+ movlhps xm0, xm0
+%if cpuflag(avx)
+ vinsertf128 ym0, ym0, xm0, 1
%endif
%endif
lea lenq, [lend*8-2*mmsize]
%endif
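+
+; This hunk appears to belong to the double-precision scalar multiply: the
+; lend*8 stride and VBROADCASTSD suggest doubles, though that reading is an
+; inference from the surrounding context. movlhps duplicates the scalar
+; across the low xmm lane and vinsertf128 copies it into the high ymm lane,
+; so every element of m0 holds mul. A C sketch with an illustrative name:
+;
+;   static void vector_dmul_scalar_c(double *dst, const double *src,
+;                                    double mul, int len)
+;   {
+;       for (int i = 0; i < len; i++)
+;           dst[i] = src[i] * mul;
+;   }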
;-----------------------------------------------------------------------------
+; vector_fmul_window(float *dst, const float *src0,
+; const float *src1, const float *win, int len);
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL_WINDOW 0
+cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
+ shl lend, 2
+ lea len1q, [lenq - mmsize]
+ add src0q, lenq
+ add dstq, lenq
+ add winq, lenq
+ neg lenq
+.loop:
+ mova m0, [winq + lenq]
+ mova m4, [src0q + lenq]
+%if cpuflag(sse)
+ mova m1, [winq + len1q]
+ mova m5, [src1q + len1q]
+ shufps m1, m1, 0x1b
+ shufps m5, m5, 0x1b
+ mova m2, m0
+ mova m3, m1
+ mulps m2, m4
+ mulps m3, m5
+ mulps m1, m4
+ mulps m0, m5
+ addps m2, m3
+ subps m1, m0
+ shufps m2, m2, 0x1b
+%else
+ pswapd m1, [winq + len1q]
+ pswapd m5, [src1q + len1q]
+ mova m2, m0
+ mova m3, m1
+ pfmul m2, m4
+ pfmul m3, m5
+ pfmul m1, m4
+ pfmul m0, m5
+ pfadd m2, m3
+ pfsub m1, m0
+ pswapd m2, m2
+%endif
+ mova [dstq + lenq], m1
+ mova [dstq + len1q], m2
+ sub len1q, mmsize
+ add lenq, mmsize
+ jl .loop
+%if mmsize == 8
+ femms
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX 3dnowext
+VECTOR_FMUL_WINDOW
+INIT_XMM sse
+VECTOR_FMUL_WINDOW
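+
+; For reference, a minimal C sketch of the overlap-window semantics the sse
+; and 3dnowext loops above implement (illustrative name). src0 and win are
+; read forward while src1 and win are also read backward, which is why the
+; loop keeps a second, downward-counting index in len1q:
+;
+;   static void vector_fmul_window_c(float *dst, const float *src0,
+;                                    const float *src1, const float *win,
+;                                    int len)
+;   {
+;       dst += len; win += len; src0 += len;
+;       for (int i = -len, j = len - 1; i < 0; i++, j--) {
+;           float s0 = src0[i], s1 = src1[j];
+;           float wi = win[i],  wj = win[j];
+;           dst[i] = s0 * wj - s1 * wi;
+;           dst[j] = s0 * wi + s1 * wj;
+;       }
+;   }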
+
+;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
; const float *src2, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL_ADD 0
-cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
+%if cpuflag(fma3)
+ mova m2, [src2q + lenq]
+ mova m3, [src2q + lenq + mmsize]
+ fmaddps m0, m0, [src1q + lenq], m2
+ fmaddps m1, m1, [src1q + lenq + mmsize], m3
+%else
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
addps m0, m0, [src2q + lenq]
addps m1, m1, [src2q + lenq + mmsize]
+%endif
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
+%if HAVE_FMA3_EXTERNAL
+INIT_YMM fma3
+VECTOR_FMUL_ADD
+%endif
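+
+; For reference, a minimal C sketch of vector_fmul_add (illustrative name).
+; The FMA3 path folds the multiply and add into a single fmaddps; len is
+; assumed to be a multiple of the elements consumed per iteration
+; (2*mmsize bytes):
+;
+;   static void vector_fmul_add_c(float *dst, const float *src0,
+;                                 const float *src1, const float *src2,
+;                                 int len)
+;   {
+;       for (int i = 0; i < len; i++)
+;           dst[i] = src0[i] * src1[i] + src2[i];
+;   }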
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,