src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp

   1 /*
   2  * Copyright (c) 2017 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24
  25 #include "transforms/output.hpp"
  26 #include "winograd_gemm.hpp"
  27 #include "arm.hpp"
  28
  29 namespace winograd
  30 {
  31
  32 using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;  // Shorthand for the float output transform of WinogradGEMM<4, 4, 3, 3>, specialised below
  33
  34 template <>
  35 template <>
  36 int Transform::ops_performed(const Tensor4DShape &shape)
  37 {
  38   // NOTE: Cost in FLOPs rather than instructions or uops.
  39   const int tile_M = iceildiv(shape.n_rows, 4);
  40   const int tile_N = iceildiv(shape.n_cols, 4);
  41   return 170 * tile_M * tile_N * shape.n_channels;
  42 }
  43
  44 /* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
  45  * enough tiles to cover the output space each output tile may contain up to 3
  46  * padded values to the right and bottom columns or rows of the tile, e.g.:
  47 *
  48 *      ________    ________   ________   ________
  49 *     |       |   |      X|  |    X X|  |  X X X|
  50 *     |       |   |      X|  |    X X|  |  X X X|
  51 *     |       |   |      X|  |    X X|  |  X X X|
  52 *     |_______|   |______X|  |____X_X|  |__X_X_X|
  53 *
  54 *      ________    ________   ________   ________
  55 *     |       |   |      X|  |    X X|  |  X X X|
  56 *     |       |   |      X|  |    X X|  |  X X X|
  57 *     |       |   |      X|  |    X X|  |  X X X|
  58 *     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
  59 *
  60 *      ________    ________   ________   ________
  61 *     |       |   |      X|  |    X X|  |  X X X|
  62 *     |       |   |      X|  |    X X|  |  X X X|
  63 *     |X X X X|   |X X X X|  |X X X X|  |X X X X|
  64 *     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
  65 *
  66 *      ________    ________   ________   ________
  67 *     |       |   |      X|  |    X X|  |  X X X|
  68 *     |X X X X|   |X X X X|  |X X X X|  |X X X X|
  69 *     |X X X X|   |X X X X|  |X X X X|  |X X X X|
  70 *     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
  71 *
  72 *
  73 * We provide a specialised output transform for each of these instances.
  74 */
  75 template <>
  76 template <>
  77 template <int pad_bottom, int pad_right>
  78 void Transform::process_tile(
  79   const int n_channels,
  80   const float* const matrix_base,
  81   const int matrix_stride,
  82   float* const output,
  83   const int output_row_stride,
  84   const int output_col_stride
  85 )
  86 {
  87   constexpr int cells_i = 4 - pad_bottom;
  88   constexpr int cells_j = 4 - pad_right;
  89
  90   // Construct a map to the output cells
  91   float *outptrs[cells_i][cells_j];
  92   for (int i = 0; i < cells_i; i++)
  93   {
  94     for (int j = 0; j < cells_j; j++)
  95     {
  96       outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
  97     }
  98   }
  99   const float *inptr = matrix_base;
 100
 101   // For each channel of the output
 102   int channels_remaining = n_channels;
 103 #ifdef __aarch64__
 104   for (; channels_remaining >= 4; channels_remaining -= 4)
 105   {
 106     // Matrices used and computed during this transform
 107     float32x4_t F[6][6], FZ[6][4], f[4][4];
 108
 109     // Read a 6x6 tile in the Winograd domain
 110     for (int i = 0, m = 0; i < 6; i++)
 111     {
 112       for (int j = 0; j < 6; j++, m++)
 113       {
 114         F[i][j] = vld1q_f32(inptr + m*matrix_stride);
 115       }
 116     }
 117     inptr += 4;
 118
 119     // Compute the matrix F Z
 120     for (int i = 0; i < 6; i++)
 121     {
 122       // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
 123       FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
 124
 125       // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
 126       FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
 127
 128       // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
 129       FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
 130
 131       // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
 132       FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
 133     }
 134
 135     // Compute the output tile f = ZT F Z
 136     for (int j = 0; j < 4; j++)
 137     {
 138       // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
 139       f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
 140
 141       // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
 142       f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
 143
 144       // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
 145       f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
 146
 147       // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
 148       f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
 149     }
 150
 151     // Write out the output tile
 152     for (int i = 0; i < cells_i; i++)
 153     {
 154       for (int j = 0; j < cells_j; j++)
 155       {
 156         vst1q_f32(outptrs[i][j], f[i][j]);
 157         outptrs[i][j] += 4;
 158       }
 159     }
 160   }
 161 #endif  // __aarch64__
 162 #ifdef __arm_any__
 163   for (; channels_remaining >= 2; channels_remaining -= 2)
 164   {
 165     // Matrices used and computed during this transform
 166     float32x2_t F[6][6], FZ[6][4], f[4][4];
 167
 168     // Read a 6x6 tile in the Winograd domain
 169     for (int i = 0, m = 0; i < 6; i++)
 170     {
 171       for (int j = 0; j < 6; j++, m++)
 172       {
 173         F[i][j] = vld1_f32(inptr + m*matrix_stride);
 174       }
 175     }
 176     inptr += 2;
 177
 178     // Compute the matrix F Z
 179     for (int i = 0; i < 6; i++)
 180     {
 181       // FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
 182       FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
 183
 184       // FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
 185       FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
 186
 187       // FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
 188       FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
 189
 190       // FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
 191       FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
 192     }
 193
 194     // Compute the output tile f = ZT F Z
 195     for (int j = 0; j < 4; j++)
 196     {
 197       // f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
 198       f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
 199
 200       // f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
 201       f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
 202
 203       // f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
 204       f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
 205
 206       // f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
 207       f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
 208     }
 209
 210     // Write out the output tile
 211     for (int i = 0; i < cells_i; i++)
 212     {
 213       for (int j = 0; j < cells_j; j++)
 214       {
 215         vst1_f32(outptrs[i][j], f[i][j]);
 216         outptrs[i][j] += 2;
 217       }
 218     }
 219   }
 220 #endif
 221   for (; channels_remaining; channels_remaining--)
 222   {
 223     // Matrices used and computed during this transform
 224     float F[6][6], FZ[6][4], f[4][4];
 225
 226     // Read a 6x6 tile in the Winograd domain
 227     for (int i = 0, m = 0; i < 6; i++)
 228     {
 229       for (int j = 0; j < 6; j++, m++)
 230       {
 231         F[i][j] = *(inptr + m*matrix_stride);
 232       }
 233     }
 234     inptr++;
 235
 236     // Compute the matrix F Z
 237     for (int i = 0; i < 6; i++)
 238     {
 239       FZ[i][0] =  1*F[i][0] +  1*F[i][1] +  1*F[i][2] +  1*F[i][3] +  1*F[i][4];
 240       FZ[i][1] =  1*F[i][1] + -1*F[i][2] +  2*F[i][3] + -2*F[i][4];
 241       FZ[i][2] =  1*F[i][1] +  1*F[i][2] +  4*F[i][3] +  4*F[i][4];
 242       FZ[i][3] =  1*F[i][1] + -1*F[i][2] +  8*F[i][3] + -8*F[i][4] +  1*F[i][5];
 243     }
 244
 245     // Compute the output tile f = ZT F Z
 246     for (int j = 0; j < 4; j++)
 247     {
 248       f[0][j] =  1*FZ[0][j] +  1*FZ[1][j] +  1*FZ[2][j] +  1*FZ[3][j] +  1*FZ[4][j];
 249       f[1][j] =  1*FZ[1][j] + -1*FZ[2][j] +  2*FZ[3][j] + -2*FZ[4][j];
 250       f[2][j] =  1*FZ[1][j] +  1*FZ[2][j] +  4*FZ[3][j] +  4*FZ[4][j];
 251       f[3][j] =  1*FZ[1][j] + -1*FZ[2][j] +  8*FZ[3][j] + -8*FZ[4][j] +  1*FZ[5][j];
 252     }
 253
 254     // Write out the output tile
 255     for (int i = 0; i < cells_i; i++)
 256     {
 257       for (int j = 0; j < cells_j; j++)
 258       {
 259         *(outptrs[i][j]++) = f[i][j];
 260       }
 261     }
 262   }
 263 }
 264
 265 template <>
 266 template <>
 267 const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
 268 {
 269   {
 270     Transform::template process_tile<0, 0>,
 271     Transform::template process_tile<0, 1>,
 272     Transform::template process_tile<0, 2>,
 273     Transform::template process_tile<0, 3>,
 274   },
 275   {
 276     Transform::template process_tile<1, 0>,
 277     Transform::template process_tile<1, 1>,
 278     Transform::template process_tile<1, 2>,
 279     Transform::template process_tile<1, 3>,
 280   },
 281   {
 282     Transform::template process_tile<2, 0>,
 283     Transform::template process_tile<2, 1>,
 284     Transform::template process_tile<2, 2>,
 285     Transform::template process_tile<2, 3>,
 286   },
 287   {
 288     Transform::template process_tile<3, 0>,
 289     Transform::template process_tile<3, 1>,
 290     Transform::template process_tile<3, 2>,
 291     Transform::template process_tile<3, 3>,
 292   }
 293 };
 294
 295 template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;  // Explicit instantiation of the float output transform specialised above
 296 }  // namespace winograd