From da96d8f8219abf6b5b2d59905c1cee1b0a0fbc81 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=A8=D0=B5=D0=B4?=
 =?utf8?q?=D1=8C=D0=BA=D0=BE/AI=20Tools=20Lab=20/SRR/Engineer/=EC=82=BC?=
 =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= <a.shedko@samsung.com>
Date: Wed, 10 Apr 2019 06:54:57 +0300
Subject: [PATCH] [NNFW] optimized Conv on CPU (#4964)

Moved the bounds check and offset calculation out of the innermost
loop in the CPU convolution kernels (float and uint8). Achieved a 2x+
speedup on MobileNet on neurun.

Signed-off-by: Andrei Shedko <a.shedko@samsung.com>
---
 libs/cker/include/cker/operation/Conv.h | 40 ++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/libs/cker/include/cker/operation/Conv.h b/libs/cker/include/cker/operation/Conv.h
index e494f0e..d232dbc 100644
--- a/libs/cker/include/cker/operation/Conv.h
+++ b/libs/cker/include/cker/operation/Conv.h
@@ -95,18 +95,18 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const float
           {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x)
             {
-              for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              // If the location is outside the bounds of the input image,
+              // use zero as a default value.
+              if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
               {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y = in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+                const int in_offset = Offset(input_shape, batch, in_y, in_x, 0);
+                const int filter_offset = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+                for (int in_channel = 0; in_channel < input_depth; ++in_channel)
                 {
-                  float input_value =
-                      input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
-                  float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y,
-                                                          filter_x, in_channel)];
+                  float input_value = input_data[in_offset + in_channel];
+                  float filter_value = filter_data[filter_offset + in_channel];
                   total += (input_value * filter_value);
                 }
               }
@@ -176,18 +176,18 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8
           {
             for (int filter_x = 0; filter_x < filter_width; ++filter_x)
             {
-              for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              const int in_y = in_y_origin + dilation_height_factor * filter_y;
+              // If the location is outside the bounds of the input image,
+              // use zero as a default value.
+              if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
               {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y = in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+                const int in_base = Offset(input_shape, batch, in_y, in_x, 0);
+                const int filter_base = Offset(filter_shape, out_channel, filter_y, filter_x, 0);
+                for (int in_channel = 0; in_channel < input_depth; ++in_channel)
                 {
-                  int32_t input_val =
-                      input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
-                  int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y,
-                                                          filter_x, in_channel)];
+                  int32_t input_val = input_data[in_channel + in_base];
+                  int32_t filter_val = filter_data[in_channel + filter_base];
                   acc += (filter_val + filter_offset) * (input_val + input_offset);
                 }
               }
-- 
2.7.4