dnn: prefer to use v_fma() instead of v_c += v_a * v_b

author Alexander Alekhin <alexander.a.alekhin@gmail.com>

Sat, 5 Dec 2020 11:51:03 +0000 (11:51 +0000)

committer Alexander Alekhin <alexander.a.alekhin@gmail.com>

Sat, 5 Dec 2020 11:51:03 +0000 (11:51 +0000)
author Alexander Alekhin <alexander.a.alekhin@gmail.com>
Sat, 5 Dec 2020 11:51:03 +0000 (11:51 +0000)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Sat, 5 Dec 2020 11:51:03 +0000 (11:51 +0000)
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp

index c8245c4..63bd386 100644 (file)
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -1370,15 +1370,15 @@ public:
                                      v_float32x4 r2 = v_load_aligned(rptr + vsz_a*2);
                                      v_float32x4 r3 = v_load_aligned(rptr + vsz_a*3);
  
-                                    vs00 += w0*r0;
-                                    vs01 += w0*r1;
-                                    vs02 += w0*r2;
-                                    vs03 += w0*r3;
-
-                                    vs10 += w1*r0;
-                                    vs11 += w1*r1;
-                                    vs12 += w1*r2;
-                                    vs13 += w1*r3;
+                                    vs00 = v_fma(w0, r0, vs00);
+                                    vs01 = v_fma(w0, r1, vs01);
+                                    vs02 = v_fma(w0, r2, vs02);
+                                    vs03 = v_fma(w0, r3, vs03);
+
+                                    vs10 = v_fma(w1, r0, vs10);
+                                    vs11 = v_fma(w1, r1, vs11);
+                                    vs12 = v_fma(w1, r2, vs12);
+                                    vs13 = v_fma(w1, r3, vs13);
                                  }
                                  s0 += v_reduce_sum4(vs00, vs01, vs02, vs03);
                                  s1 += v_reduce_sum4(vs10, vs11, vs12, vs13);
@@ -2035,20 +2035,21 @@ public:
  
                      for( ; n <= nmax - 4; n += 4 )
                      {
+                        v_float32x4 d0 = v_load(dst0 + n);
+                        v_float32x4 d1 = v_load(dst1 + n);
                          v_float32x4 b0 = v_load(bptr0 + n);
                          v_float32x4 b1 = v_load(bptr1 + n);
                          v_float32x4 b2 = v_load(bptr2 + n);
                          v_float32x4 b3 = v_load(bptr3 + n);
-                        v_float32x4 d0 = v_load(dst0 + n);
-                        v_float32x4 d1 = v_load(dst1 + n);
-                        d0 += b0*a00;
-                        d1 += b0*a01;
-                        d0 += b1*a10;
-                        d1 += b1*a11;
-                        d0 += b2*a20;
-                        d1 += b2*a21;
-                        d0 += b3*a30;
-                        d1 += b3*a31;
+                        // TODO try to improve pipeline width
+                        d0 = v_fma(b0, a00, d0);
+                        d1 = v_fma(b0, a01, d1);
+                        d0 = v_fma(b1, a10, d0);
+                        d1 = v_fma(b1, a11, d1);
+                        d0 = v_fma(b2, a20, d0);
+                        d1 = v_fma(b2, a21, d1);
+                        d0 = v_fma(b3, a30, d0);
+                        d1 = v_fma(b3, a31, d1);
                          v_store(dst0 + n, d0);
                          v_store(dst1 + n, d1);
                      }
@@ -2056,8 +2057,10 @@ public:
  
                      for( ; n < nmax; n++ )
                      {
-                        float b0 = bptr0[n], b1 = bptr1[n];
-                        float b2 = bptr2[n], b3 = bptr3[n];
+                        float b0 = bptr0[n];
+                        float b1 = bptr1[n];
+                        float b2 = bptr2[n];
+                        float b3 = bptr3[n];
                          float d0 = dst0[n] + alpha00*b0 + alpha10*b1 + alpha20*b2 + alpha30*b3;
                          float d1 = dst1[n] + alpha01*b0 + alpha11*b1 + alpha21*b2 + alpha31*b3;
                          dst0[n] = d0;
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp

index 4746403..d9c6f6e 100644 (file)
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -241,16 +241,18 @@ public:
              #if CV_SIMD128
                      for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
                      {
-                        v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
-                        v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);
+                        v_float32x4 vs0 = v_setall_f32(0.f);
+                        v_float32x4 vs1 = v_setall_f32(0.f);
+                        v_float32x4 vs2 = v_setall_f32(0.f);
+                        v_float32x4 vs3 = v_setall_f32(0.f);
  
                          for( k = 0; k < vecsize; k += 4 )
                          {
                              v_float32x4 v = v_load_aligned(sptr + k);
-                            vs0 += v*v_load_aligned(wptr + k);
-                            vs1 += v*v_load_aligned(wptr + wstep + k);
-                            vs2 += v*v_load_aligned(wptr + wstep*2 + k);
-                            vs3 += v*v_load_aligned(wptr + wstep*3 + k);
+                            vs0 = v_fma(v, v_load_aligned(wptr + k), vs0);
+                            vs1 = v_fma(v, v_load_aligned(wptr + wstep + k), vs1);
+                            vs2 = v_fma(v, v_load_aligned(wptr + wstep*2 + k), vs2);
+                            vs3 = v_fma(v, v_load_aligned(wptr + wstep*3 + k), vs3);
                          }
  
                          v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
author	Alexander Alekhin <alexander.a.alekhin@gmail.com>
	Sat, 5 Dec 2020 11:51:03 +0000 (11:51 +0000)
committer	Alexander Alekhin <alexander.a.alekhin@gmail.com>
	Sat, 5 Dec 2020 11:51:03 +0000 (11:51 +0000)
modules/dnn/src/layers/convolution_layer.cpp		patch | blob | history
modules/dnn/src/layers/fully_connected_layer.cpp		patch | blob | history