ocl_calcOpticalFlowPyrLK optimizations
authorkrodyush <konstantin.rodyushkin@intel.com>
Wed, 19 Mar 2014 15:31:14 +0000 (19:31 +0400)
committerkrodyush <konstantin.rodyushkin@intel.com>
Wed, 19 Mar 2014 15:31:14 +0000 (19:31 +0400)
1. decrease branch number in CL code by replacing them into weights
2. decrease local mem pressure in reduce operation by using private variables
3. decrease image sampler pressure by caching data into local memory
4. remove unnecessary sync point on the HOST side.

modules/video/src/lkpyramid.cpp
modules/video/src/opencl/pyrlk.cl

index cd57585..c95835d 100644 (file)
@@ -989,7 +989,7 @@ namespace cv
             idxArg = kernel.set(idxArg, (int)winSize.height); // int c_winSize_y
             idxArg = kernel.set(idxArg, (int)iters); // int c_iters
             idxArg = kernel.set(idxArg, (char)calcErr); //char calcErr
-            return kernel.run(2, globalThreads, localThreads, true);
+            return kernel.run(2, globalThreads, localThreads, false);
         }
     private:
         inline static bool isDeviceCPU()
index c018554..45571c7 100644 (file)
 //
 //M*/
 
-#define        BUFFER  64
-#define        BUFFER2 BUFFER>>1
+#define GRIDSIZE    3
+#define LSx 8
+#define LSy 8
+#define BUFFER  (LSx*LSy)
+#define BUFFER2 BUFFER>>1
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
+
 #ifdef CPU
 
 inline void reduce3(float val1, float val2, float val3,  __local float* smem1,  __local float* smem2,  __local float* smem3, int tid)
@@ -128,24 +132,21 @@ inline void reduce3(float val1, float val2, float val3,
 #if WAVE_SIZE <16
     }
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
+    if (tid<1)
     {
 #endif
-        smem1[tid] += smem1[tid + 8];
-        smem2[tid] += smem2[tid + 8];
-        smem3[tid] += smem3[tid + 8];
-
-        smem1[tid] += smem1[tid + 4];
-        smem2[tid] += smem2[tid + 4];
-        smem3[tid] += smem3[tid + 4];
-
-        smem1[tid] += smem1[tid + 2];
-        smem2[tid] += smem2[tid + 2];
-        smem3[tid] += smem3[tid + 2];
-
-        smem1[tid] += smem1[tid + 1];
-        smem2[tid] += smem2[tid + 1];
-        smem3[tid] += smem3[tid + 1];
+        local float8* m1 = (local float8*)smem1;
+        local float8* m2 = (local float8*)smem2;
+        local float8* m3 = (local float8*)smem3;
+        float8 t1 = m1[0]+m1[1];
+        float8 t2 = m2[0]+m2[1];
+        float8 t3 = m3[0]+m3[1];
+        float4 t14 = t1.lo + t1.hi;
+        float4 t24 = t2.lo + t2.hi;
+        float4 t34 = t3.lo + t3.hi;
+        smem1[0] = t14.x+t14.y+t14.z+t14.w;
+        smem2[0] = t24.x+t24.y+t24.z+t24.w;
+        smem3[0] = t34.x+t34.y+t34.z+t34.w;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -171,20 +172,17 @@ inline void reduce2(float val1, float val2, __local volatile float* smem1, __loc
 #if WAVE_SIZE <16
     }
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
+    if (tid<1)
     {
 #endif
-        smem1[tid] += smem1[tid + 8];
-        smem2[tid] += smem2[tid + 8];
-
-        smem1[tid] += smem1[tid + 4];
-        smem2[tid] += smem2[tid + 4];
-
-        smem1[tid] += smem1[tid + 2];
-        smem2[tid] += smem2[tid + 2];
-
-        smem1[tid] += smem1[tid + 1];
-        smem2[tid] += smem2[tid + 1];
+        local float8* m1 = (local float8*)smem1;
+        local float8* m2 = (local float8*)smem2;
+        float8 t1 = m1[0]+m1[1];
+        float8 t2 = m2[0]+m2[1];
+        float4 t14 = t1.lo + t1.hi;
+        float4 t24 = t2.lo + t2.hi;
+        smem1[0] = t14.x+t14.y+t14.z+t14.w;
+        smem2[0] = t24.x+t24.y+t24.z+t24.w;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -207,13 +205,13 @@ inline void reduce1(float val1, __local volatile float* smem1, int tid)
 #if WAVE_SIZE <16
     }
     barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 8)
+    if (tid<1)
     {
 #endif
-        smem1[tid] += smem1[tid + 8];
-        smem1[tid] += smem1[tid + 4];
-        smem1[tid] += smem1[tid + 2];
-        smem1[tid] += smem1[tid + 1];
+        local float8* m1 = (local float8*)smem1;
+        float8 t1 = m1[0]+m1[1];
+        float4 t14 = t1.lo + t1.hi;
+        smem1[0] = t14.x+t14.y+t14.z+t14.w;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 }
@@ -225,18 +223,21 @@ inline void reduce1(float val1, __local volatile float* smem1, int tid)
 // Image read mode
 __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
 
-inline void SetPatch(image2d_t I, float x, float y,
+// macro to get pixel value from local memory
+#define VAL(_y,_x,_yy,_xx)    (IPatchLocal[yid+((_y)*LSy)+1+(_yy)][xid+((_x)*LSx)+1+(_xx)])
+inline void SetPatch(local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2], int TileY, int TileX,
               float* Pch, float* Dx, float* Dy,
-              float* A11, float* A12, float* A22)
+              float* A11, float* A12, float* A22, float w)
 {
-    *Pch = read_imagef(I, sampler, (float2)(x, y)).x;
-
-    float dIdx = 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x + 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
-                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x - 1, y)).x + 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x);
+    unsigned int xid=get_local_id(0);
+    unsigned int yid=get_local_id(1);
+    *Pch = VAL(TileY,TileX,0,0);
 
-    float dIdy = 3.0f * read_imagef(I, sampler, (float2)(x - 1, y + 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y + 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y + 1)).x -
-                 (3.0f * read_imagef(I, sampler, (float2)(x - 1, y - 1)).x + 10.0f * read_imagef(I, sampler, (float2)(x, y - 1)).x + 3.0f * read_imagef(I, sampler, (float2)(x + 1, y - 1)).x);
+    float dIdx = (3.0f*VAL(TileY,TileX,-1,1)+10.0f*VAL(TileY,TileX,0,1)+3.0f*VAL(TileY,TileX,+1,1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,0,-1)+3.0f*VAL(TileY,TileX,+1,-1));
+    float dIdy = (3.0f*VAL(TileY,TileX,1,-1)+10.0f*VAL(TileY,TileX,1,0)+3.0f*VAL(TileY,TileX,1,+1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,-1,0)+3.0f*VAL(TileY,TileX,-1,+1));
 
+    dIdx *= w;
+    dIdy *= w;
 
     *Dx = dIdx;
     *Dy = dIdy;
@@ -245,6 +246,7 @@ inline void SetPatch(image2d_t I, float x, float y,
     *A12 += dIdx * dIdy;
     *A22 += dIdy * dIdy;
 }
+#undef VAL
 
 inline void GetPatch(image2d_t J, float x, float y,
               float* Pch, float* Dx, float* Dy,
@@ -303,7 +305,38 @@ inline void GetError4(image2d_t J, const float x, const float y, const float4* P
     *errval += fabs(diff.x) + fabs(diff.y) + fabs(diff.z);
 }
 
-#define        GRIDSIZE        3
+
+//macro to read pixel value into local memory.
+#define READI(_y,_x)    IPatchLocal[yid+((_y)*LSy)][xid+((_x)*LSx)] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;
+void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2])
+{
+    unsigned int xid=get_local_id(0);
+    unsigned int yid=get_local_id(1);
+    //read (3*LSx)*(3*LSy) window. each macro call read LSx*LSy pixels block
+    READI(0,0);READI(0,1);READI(0,2);
+    READI(1,0);READI(1,1);READI(1,2);
+    READI(2,0);READI(2,1);READI(2,2);
+    if(xid<2)
+    {// read last 2 columns border. each macro call reads 2*LSy pixels block
+        READI(0,3);
+        READI(1,3);
+        READI(2,3);
+    }
+
+    if(yid<2)
+    {// read last 2 row. each macro call reads LSx*2 pixels block
+        READI(3,0);READI(3,1);READI(3,2);
+    }
+
+    if(yid<2 && xid<2)
+    {// read right bottom 2x2 corner. one macro call reads 2*2 pixels block
+        READI(3,3);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+#undef READI
+
+__attribute__((reqd_work_group_size(LSx, LSy, 1)))
 __kernel void lkSparse(image2d_t I, image2d_t J,
                        __global const float2* prevPts, int prevPtsStep, __global float2* nextPts, int nextPtsStep, __global uchar* status, __global float* err,
                        const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)
@@ -318,6 +351,8 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
     unsigned int xsize=get_local_size(0);
     unsigned int ysize=get_local_size(1);
     int xBase, yBase, k;
+    float wx = ((xid+2*xsize)<c_winSize_x)?1:0;
+    float wy = ((yid+2*ysize)<c_winSize_y)?1:0;
 
     float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);
 
@@ -346,65 +381,54 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
     float dIdx_patch[GRIDSIZE][GRIDSIZE];
     float dIdy_patch[GRIDSIZE][GRIDSIZE];
 
-    yBase=yid;
+    // local memory to read image with border to calc sobels
+    local float IPatchLocal[LSy*GRIDSIZE+2][LSx*GRIDSIZE+2];
+    ReadPatchIToLocalMem(I,prevPt,IPatchLocal);
+
     {
-        xBase=xid;
-        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+        SetPatch(IPatchLocal, 0, 0,
                  &I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],
-                 &A11, &A12, &A22);
+                 &A11, &A12, &A22,1);
 
 
-        xBase+=xsize;
-        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+        SetPatch(IPatchLocal, 0, 1,
                  &I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],
-                 &A11, &A12, &A22);
+                 &A11, &A12, &A22,1);
 
-        xBase+=xsize;
-        if(xBase<c_winSize_x)
-            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                     &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
-                     &A11, &A12, &A22);
+        SetPatch(IPatchLocal, 0, 2,
+                    &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                    &A11, &A12, &A22,wx);
     }
-    yBase+=ysize;
     {
-        xBase=xid;
-        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+        SetPatch(IPatchLocal, 1, 0,
                  &I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],
-                 &A11, &A12, &A22);
+                 &A11, &A12, &A22,1);
 
 
-        xBase+=xsize;
-        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+        SetPatch(IPatchLocal, 1,1,
                  &I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],
-                 &A11, &A12, &A22);
+                 &A11, &A12, &A22,1);
 
-        xBase+=xsize;
-        if(xBase<c_winSize_x)
-            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                     &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
-                     &A11, &A12, &A22);
+        SetPatch(IPatchLocal, 1,2,
+                    &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                    &A11, &A12, &A22,wx);
     }
-    yBase+=ysize;
-    if(yBase<c_winSize_y)
     {
-        xBase=xid;
-        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+        SetPatch(IPatchLocal, 2,0,
                  &I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],
-                 &A11, &A12, &A22);
+                 &A11, &A12, &A22,wy);
 
 
-        xBase+=xsize;
-        SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+        SetPatch(IPatchLocal, 2,1,
                  &I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],
-                 &A11, &A12, &A22);
+                 &A11, &A12, &A22,wy);
 
-        xBase+=xsize;
-        if(xBase<c_winSize_x)
-            SetPatch(I, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                     &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
-                     &A11, &A12, &A22);
+        SetPatch(IPatchLocal, 2,2,
+                    &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                    &A11, &A12, &A22,wx*wy);
     }
 
+
     reduce3(A11, A12, A22, smem1, smem2, smem3, tid);
 
     A11 = smem1[0];
@@ -434,7 +458,7 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
         {
             if (tid == 0 && level == 0)
                 status[gid] = 0;
-            return;
+            break;
         }
 
         float b1 = 0;
@@ -454,10 +478,9 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
                      &b1, &b2);
 
             xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                         &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
-                         &b1, &b2);
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                        &I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],
+                        &b1, &b2);
         }
         yBase+=ysize;
         {
@@ -473,13 +496,11 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
                      &b1, &b2);
 
             xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                         &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
-                         &b1, &b2);
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                        &I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],
+                        &b1, &b2);
         }
         yBase+=ysize;
-        if(yBase<c_winSize_y)
         {
             xBase=xid;
             GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
@@ -493,10 +514,9 @@ __kernel void lkSparse(image2d_t I, image2d_t J,
                      &b1, &b2);
 
             xBase+=xsize;
-            if(xBase<c_winSize_x)
-                GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
-                         &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
-                         &b1, &b2);
+            GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,
+                        &I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],
+                        &b1, &b2);
         }
 
         reduce2(b1, b2, smem1, smem2, tid);