From 952e87219815b0d8a698e0c098e4fb7984f8b19d Mon Sep 17 00:00:00 2001
From: =?utf8?q?M=C3=A5ns=20Rullg=C3=A5rd?= <mans@mansr.com>
Date: Sun, 27 Sep 2009 16:51:54 +0000
Subject: [PATCH] Drop unused args from vector_fmul_add_add, simplify code, and
 rename

The src3 and step arguments to vector_fmul_add_add() are always zero
and one, respectively.  This removes these arguments from the function,
simplifies the code accordingly, and renames the function to better
match the new operation.

Originally committed as revision 20061 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/dsputil.c           |  6 ++--
 libavcodec/dsputil.h           |  4 +--
 libavcodec/ppc/float_altivec.c | 66 +++---------------------------------
 libavcodec/wmadec.c            |  8 ++---
 libavcodec/x86/dsputil_mmx.c   | 76 ++++--------------------------------------
 5 files changed, 18 insertions(+), 142 deletions(-)

diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 270c583..894e592 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4068,10 +4068,10 @@ static void vector_fmul_reverse_c(float *dst, const float *src0, const float *sr
         dst[i] = src0[i] * src1[-i];
 }
 
-void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
+static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
     int i;
     for(i=0; i<len; i++)
-        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
+        dst[i] = src0[i] * src1[i] + src2[i];
 }
 
 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
@@ -4787,7 +4787,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
 #endif
     c->vector_fmul = vector_fmul_c;
     c->vector_fmul_reverse = vector_fmul_reverse_c;
-    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
+    c->vector_fmul_add = vector_fmul_add_c;
     c->vector_fmul_window = ff_vector_fmul_window_c;
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->vector_clipf = vector_clipf_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 58d5b49..dd7b22d 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -66,8 +66,6 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
 void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 
-void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
-                              const float *src2, int src3, int blocksize, int step);
 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                              const float *win, float add_bias, int len);
 void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
@@ -391,7 +389,7 @@ typedef struct DSPContext {
     void (*vector_fmul)(float *dst, const float *src, int len);
     void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
     /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
-    void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
+    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c
index 096f75f..48d093c 100644
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -66,71 +66,15 @@ static void vector_fmul_reverse_altivec(float *dst, const float *src0,
     }
 }
 
-static void vector_fmul_add_add_altivec(float *dst, const float *src0,
-                                        const float *src1, const float *src2,
-                                        int src3, int len, int step)
+static void vector_fmul_add_altivec(float *dst, const float *src0,
+                                    const float *src1, const float *src2,
+                                    int len)
 {
     int i;
     vector float d, s0, s1, s2, t0, t1, edges;
     vector unsigned char align = vec_lvsr(0,dst),
                          mask = vec_lvsl(0, dst);
 
-#if 0 //FIXME: there is still something wrong
-    if (step == 2) {
-        int y;
-        vector float d0, d1, s3, t2;
-        vector unsigned int sel =
-                vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0));
-        t1 = vec_ld(16, dst);
-        for (i=0,y=0; i<len-3; i+=4,y+=8) {
-
-            s0 = vec_ld(0,src0+i);
-            s1 = vec_ld(0,src1+i);
-            s2 = vec_ld(0,src2+i);
-
-//          t0 = vec_ld(0, dst+y);  //[x x x|a]
-//          t1 = vec_ld(16, dst+y); //[b c d|e]
-            t2 = vec_ld(31, dst+y); //[f g h|x]
-
-            d = vec_madd(s0,s1,s2); // [A B C D]
-
-                                                 // [A A B B]
-
-                                                 // [C C D D]
-
-            d0 = vec_perm(t0, t1, mask); // [a b c d]
-
-            d0 = vec_sel(vec_mergeh(d, d), d0, sel);   // [A b B d]
-
-            edges = vec_perm(t1, t0, mask);
-
-            t0 = vec_perm(edges, d0, align); // [x x x|A]
-
-            t1 = vec_perm(d0, edges, align); // [b B d|e]
-
-            vec_stl(t0, 0, dst+y);
-
-            d1 = vec_perm(t1, t2, mask); // [e f g h]
-
-            d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h]
-
-            edges = vec_perm(t2, t1, mask);
-
-            t1 = vec_perm(edges, d1, align); // [b B d|C]
-
-            t2 = vec_perm(d1, edges, align); // [f D h|x]
-
-            vec_stl(t1, 16, dst+y);
-
-            t0 = t1;
-
-            vec_stl(t2, 31, dst+y);
-
-            t1 = t2;
-        }
-    } else
-    #endif
-    if (step == 1 && src3 == 0)
         for (i=0; i<len-3; i+=4) {
             t0 = vec_ld(0, dst+i);
             t1 = vec_ld(15, dst+i);
@@ -144,8 +88,6 @@ static void vector_fmul_add_add_altivec(float *dst, const float *src0,
             vec_st(t1, 15, dst+i);
             vec_st(t0, 0, dst+i);
         }
-    else
-        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
 static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
@@ -299,7 +241,7 @@ void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
     c->vector_fmul = vector_fmul_altivec;
     c->vector_fmul_reverse = vector_fmul_reverse_altivec;
-    c->vector_fmul_add_add = vector_fmul_add_add_altivec;
+    c->vector_fmul_add = vector_fmul_add_altivec;
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
     if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
         c->vector_fmul_window = vector_fmul_window_altivec;
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index ca3bf0a..a4d5ad6 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -301,16 +301,16 @@ static void wma_window(WMACodecContext *s, float *out)
         block_len = s->block_len;
         bsize = s->frame_len_bits - s->block_len_bits;
 
-        s->dsp.vector_fmul_add_add(out, in, s->windows[bsize],
-                                   out, 0, block_len, 1);
+        s->dsp.vector_fmul_add(out, in, s->windows[bsize],
+                               out, block_len);
 
     } else {
         block_len = 1 << s->prev_block_len_bits;
         n = (s->block_len - block_len) / 2;
         bsize = s->frame_len_bits - s->prev_block_len_bits;
 
-        s->dsp.vector_fmul_add_add(out+n, in+n, s->windows[bsize],
-                                   out+n, 0, block_len, 1);
+        s->dsp.vector_fmul_add(out+n, in+n, s->windows[bsize],
+                               out+n, block_len);
 
         memcpy(out+n+block_len, in+n+block_len, n*sizeof(float));
     }
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index f430abc..79ceb15 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2125,34 +2125,9 @@ static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *
     );
 }
 
-static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
-                                      const float *src2, int src3, int len, int step){
+static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
+                                  const float *src2, int len){
     x86_reg i = (len-4)*4;
-    if(step == 2 && src3 == 0){
-        dst += (len-4)*2;
-        __asm__ volatile(
-            "1: \n\t"
-            "movq   (%2,%0),  %%mm0 \n\t"
-            "movq  8(%2,%0),  %%mm1 \n\t"
-            "pfmul  (%3,%0),  %%mm0 \n\t"
-            "pfmul 8(%3,%0),  %%mm1 \n\t"
-            "pfadd  (%4,%0),  %%mm0 \n\t"
-            "pfadd 8(%4,%0),  %%mm1 \n\t"
-            "movd     %%mm0,   (%1) \n\t"
-            "movd     %%mm1, 16(%1) \n\t"
-            "psrlq      $32,  %%mm0 \n\t"
-            "psrlq      $32,  %%mm1 \n\t"
-            "movd     %%mm0,  8(%1) \n\t"
-            "movd     %%mm1, 24(%1) \n\t"
-            "sub  $32, %1 \n\t"
-            "sub  $16, %0 \n\t"
-            "jge  1b \n\t"
-            :"+r"(i), "+r"(dst)
-            :"r"(src0), "r"(src1), "r"(src2)
-            :"memory"
-        );
-    }
-    else if(step == 1 && src3 == 0){
         __asm__ volatile(
             "1: \n\t"
             "movq    (%2,%0), %%mm0 \n\t"
@@ -2169,47 +2144,11 @@ static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float
             :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
             :"memory"
         );
-    }
-    else
-        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
     __asm__ volatile("femms");
 }
-static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
-                                    const float *src2, int src3, int len, int step){
+static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
+                                const float *src2, int len){
     x86_reg i = (len-8)*4;
-    if(step == 2 && src3 == 0){
-        dst += (len-8)*2;
-        __asm__ volatile(
-            "1: \n\t"
-            "movaps   (%2,%0), %%xmm0 \n\t"
-            "movaps 16(%2,%0), %%xmm1 \n\t"
-            "mulps    (%3,%0), %%xmm0 \n\t"
-            "mulps  16(%3,%0), %%xmm1 \n\t"
-            "addps    (%4,%0), %%xmm0 \n\t"
-            "addps  16(%4,%0), %%xmm1 \n\t"
-            "movss     %%xmm0,   (%1) \n\t"
-            "movss     %%xmm1, 32(%1) \n\t"
-            "movhlps   %%xmm0, %%xmm2 \n\t"
-            "movhlps   %%xmm1, %%xmm3 \n\t"
-            "movss     %%xmm2, 16(%1) \n\t"
-            "movss     %%xmm3, 48(%1) \n\t"
-            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
-            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
-            "movss     %%xmm0,  8(%1) \n\t"
-            "movss     %%xmm1, 40(%1) \n\t"
-            "movhlps   %%xmm0, %%xmm2 \n\t"
-            "movhlps   %%xmm1, %%xmm3 \n\t"
-            "movss     %%xmm2, 24(%1) \n\t"
-            "movss     %%xmm3, 56(%1) \n\t"
-            "sub  $64, %1 \n\t"
-            "sub  $32, %0 \n\t"
-            "jge  1b \n\t"
-            :"+r"(i), "+r"(dst)
-            :"r"(src0), "r"(src1), "r"(src2)
-            :"memory"
-        );
-    }
-    else if(step == 1 && src3 == 0){
         __asm__ volatile(
             "1: \n\t"
             "movaps   (%2,%0), %%xmm0 \n\t"
@@ -2226,9 +2165,6 @@ static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *
             :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
             :"memory"
         );
-    }
-    else
-        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
 }
 
 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
@@ -3077,7 +3013,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->ac3_downmix = ac3_downmix_sse;
             c->vector_fmul = vector_fmul_sse;
             c->vector_fmul_reverse = vector_fmul_reverse_sse;
-            c->vector_fmul_add_add = vector_fmul_add_add_sse;
+            c->vector_fmul_add = vector_fmul_add_sse;
             c->vector_fmul_window = vector_fmul_window_sse;
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
             c->vector_clipf = vector_clipf_sse;
@@ -3085,7 +3021,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
         }
         if(mm_flags & FF_MM_3DNOW)
-            c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+            c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
         if(mm_flags & FF_MM_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = float_to_int16_sse2;
-- 
2.7.4