arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp

   1 /*
   2  * Copyright (c) 2018 ARM Limited.
   3  *
   4  * SPDX-License-Identifier: MIT
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to
   8  * deal in the Software without restriction, including without limitation the
   9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  10  * sell copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in all
  14  * copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22  * SOFTWARE.
  23  */
  24 #pragma once
  25
  26 #ifdef __arm__
  27
  28 #include <arm_neon.h>
  29
  30 #include "../../asmlib.hpp"
  31
  32 // Kernel implementation.
  33 //
  34 // Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
  35 // Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
  36 // Assume that "Cpanel" points to a chunk of C output blocks (each size
  37 // 8x6), the chunks being arranged in a row major fashion.
  38 //
  39 // Note that the intent of this is that either ablocks or bblocks will be 1
  40 // - this construction allows the output loop to proceed in either order.
  41
  42 inline void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
  43     const float *a_ptr = Apanel;
  44     float *c_ptr = Cpanel;
  45
  46     /* Work out starting values for "k" and "tails" in the inner loop. */
  47     int tails_initial = (K & 3);
  48     if (tails_initial == 0) {
  49         tails_initial = 4;
  50     }
  51
  52     int k_initial = ((K+3)/4) - 1;
  53
  54     for (int yb=0; yb<ablocks; yb++) {
  55         const float *a_ptr0 = a_ptr;
  56         const float *b_ptr = Bpanel;
  57
  58         for (int xb=0; xb<bblocks; xb++) {
  59             int tails = tails_initial;
  60             int k = k_initial;
  61
  62             a_ptr = a_ptr0;
  63
  64             __asm __volatile (
  65                 "vldr           d0, [%[a_ptr]]\n"
  66                 "vmov.i32       q4, #0\n"
  67                 "vldr           d1, [%[a_ptr], #0x08]\n"
  68                 "vmov.i32       q5, #0\n"
  69                 "vldr           d4, [%[b_ptr]]\n"
  70                 "vmov.i32       q6, #0\n"
  71                 "vldr           d5, [%[b_ptr], #0x08]\n"
  72                 "vmov.i32       q7, #0\n"
  73                 "vldr           d2, [%[a_ptr], #0x10]\n"
  74                 "vmov.i32       q8, #0\n"
  75                 ASM_PREFETCH("[%[b_ptr], #0x40]")
  76                 "vmov.i32       q9, #0\n"
  77                 ASM_PREFETCH("[%[a_ptr], #0x40]")
  78                 "vmov.i32       q10, #0\n"
  79                 ASM_PREFETCH("[%[b_ptr], #0x80]")
  80                 "vmov.i32       q11, #0\n"
  81                 ASM_PREFETCH("[%[a_ptr], #0x80]")
  82                 "vmov.i32       q12, #0\n"
  83                 ASM_PREFETCH("[%[b_ptr], #0XC0]")
  84                 "vmov.i32       q13, #0\n"
  85                 ASM_PREFETCH("[%[a_ptr], #0xC0]")
  86                 "vmov.i32       q14, #0\n"
  87                 ASM_PREFETCH("[%[b_ptr], #0x100]")
  88                 "vmov.i32       q15, #0\n"
  89                 ASM_PREFETCH("[%[a_ptr], #0x100]")
  90                 "cmp            %[k], #0\n"
  91                 ASM_PREFETCH("[%[b_ptr], #0x140]")
  92                 "beq            6f\n"
  93                 ASM_PREFETCH("[%[b_ptr], #0x180]")
  94
  95                 "1:\n"
  96                 // Unroll 0
  97                 "vmla.f32       q4, q2, d0[0]\n"
  98                 "vldr           d6, [%[b_ptr], #0x10]\n"
  99                 "vmla.f32       q5, q2, d0[1]\n"
 100                 "vldr           d7, [%[b_ptr], #0x18]\n"
 101                 "vmla.f32       q6, q2, d1[0]\n"
 102                 "vldr           d3, [%[a_ptr], #0x18]\n"
 103                 "vmla.f32       q7, q2, d1[1]\n"
 104                 ASM_PREFETCH("[%[a_ptr], #0x140]")
 105                 "vmla.f32       q8, q2, d2[0]\n"
 106                 "subs           %[k], %[k], #1\n"
 107                 "vmla.f32       q9, q2, d2[1]\n"
 108                 "vldr           d4, [%[b_ptr], #0x20]\n"
 109                 "vmla.f32       q10, q3, d0[0]\n"
 110                 "vldr           d5, [%[b_ptr], #0x28]\n"
 111                 "vmla.f32       q11, q3, d0[1]\n"
 112                 "vldr           d0, [%[a_ptr], #0x20]\n"
 113                 "vmla.f32       q12, q3, d1[0]\n"
 114
 115                 "vmla.f32       q13, q3, d1[1]\n"
 116                 "vldr           d1, [%[a_ptr], #0x28]\n"
 117                 "vmla.f32       q14, q3, d2[0]\n"
 118
 119                 "vmla.f32       q15, q3, d2[1]\n"
 120                 "vldr           d6, [%[b_ptr], #0x30]\n"
 121
 122                 // Unroll 1
 123                 "vmla.f32       q4, q2, d3[0]\n"
 124                 "vldr           d7, [%[b_ptr], #0x38]\n"
 125                 "vmla.f32       q5, q2, d3[1]\n"
 126                 "vldr           d2, [%[a_ptr], #0x30]\n"
 127                 "vmla.f32       q6, q2, d0[0]\n"
 128
 129                 "vmla.f32       q7, q2, d0[1]\n"
 130                 ASM_PREFETCH("[%[b_ptr], #0x1C0]")
 131                 "vmla.f32       q8, q2, d1[0]\n"
 132
 133                 "vmla.f32       q9, q2, d1[1]\n"
 134                 "vldr           d4, [%[b_ptr], #0x40]\n"
 135                 "vmla.f32       q10, q3, d3[0]\n"
 136                 "vldr           d5, [%[b_ptr], #0x48]\n"
 137                 "vmla.f32       q11, q3, d3[1]\n"
 138                 "vldr           d3, [%[a_ptr], #0x38]\n"
 139                 "vmla.f32       q12, q3, d0[0]\n"
 140
 141                 "vmla.f32       q13, q3, d0[1]\n"
 142                 "vldr           d0, [%[a_ptr], #0x40]\n"
 143                 "vmla.f32       q14, q3, d1[0]\n"
 144
 145                 "vmla.f32       q15, q3, d1[1]\n"
 146                 "vldr           d6, [%[b_ptr], #0x50]\n"
 147
 148                 // Unroll 2
 149                 "vmla.f32       q4, q2, d2[0]\n"
 150                 "vldr           d7, [%[b_ptr], #0x58]\n"
 151                 "vmla.f32       q5, q2, d2[1]\n"
 152                 "vldr           d1, [%[a_ptr], #0x48]\n"
 153                 "vmla.f32       q6, q2, d3[0]\n"
 154
 155                 "vmla.f32       q7, q2, d3[1]\n"
 156                 ASM_PREFETCH("[%[a_ptr], #0x180]")
 157                 "vmla.f32       q8, q2, d0[0]\n"
 158
 159                 "vmla.f32       q9, q2, d0[1]\n"
 160                 "vldr           d4, [%[b_ptr], #0x60]\n"
 161                 "vmla.f32       q10, q3, d2[0]\n"
 162                 "vldr           d5, [%[b_ptr], #0x68]\n"
 163                 "vmla.f32       q11, q3, d2[1]\n"
 164                 "vldr           d2, [%[a_ptr], #0x50]\n"
 165                 "vmla.f32       q12, q3, d3[0]\n"
 166
 167                 "vmla.f32       q13, q3, d3[1]\n"
 168                 "vldr           d3, [%[a_ptr], #0x58]\n"
 169                 "vmla.f32       q14, q3, d0[0]\n"
 170                 "add            %[a_ptr], %[a_ptr], #0x60\n"
 171                 "vmla.f32       q15, q3, d0[1]\n"
 172                 "vldr           d6, [%[b_ptr], #0x70]\n"
 173
 174                 // Unroll 3
 175                 "vmla.f32       q4, q2, d1[0]\n"
 176                 "vldr           d7, [%[b_ptr], #0x78]\n"
 177                 "vmla.f32       q5, q2, d1[1]\n"
 178                 "add            %[b_ptr], %[b_ptr], #0x80\n"
 179                 "vmla.f32       q6, q2, d2[0]\n"
 180                 "vldr           d0, [%[a_ptr], #0x00]\n"
 181                 "vmla.f32       q7, q2, d2[1]\n"
 182                 ASM_PREFETCH("[%[b_ptr], #0x180]")
 183                 "vmla.f32       q8, q2, d3[0]\n"
 184
 185                 "vmla.f32       q9, q2, d3[1]\n"
 186                 "vldr           d4, [%[b_ptr], #0x00]\n"
 187                 "vmla.f32       q10, q3, d1[0]\n"
 188                 "vldr           d5, [%[b_ptr], #0x08]\n"
 189                 "vmla.f32       q11, q3, d1[1]\n"
 190                 "vldr           d1, [%[a_ptr], #0x08]\n"
 191                 "vmla.f32       q12, q3, d2[0]\n"
 192
 193                 "vmla.f32       q13, q3, d2[1]\n"
 194                 "vldr           d2, [%[a_ptr], #0x10]\n"
 195                 "vmla.f32       q14, q3, d3[0]\n"
 196
 197                 "vmla.f32       q15, q3, d3[1]\n"
 198                 "bne            1b\n"
 199
 200                 // "Tails" shows how many multiply blocks are needed at the
 201                 // end, must be 1-4 inclusive.  Bail out to alternative tail
 202                 // immediately if it's 1.
 203                 "6:\n"
 204                 "subs           %[tails], %[tails], #1\n"
 205                 "beq            3f\n"
 206
 207                 // Detached final iteration
 208
 209                 // Unroll 0
 210                 "vmla.f32       q4, q2, d0[0]\n"
 211                 "vldr           d6, [%[b_ptr], #0x10]\n"
 212                 "vmla.f32       q5, q2, d0[1]\n"
 213                 "vldr           d7, [%[b_ptr], #0x18]\n"
 214                 "vmla.f32       q6, q2, d1[0]\n"
 215                 "vldr           d3, [%[a_ptr], #0x18]\n"
 216                 "vmla.f32       q7, q2, d1[1]\n"
 217                 "subs           %[tails], %[tails], #1\n"
 218                 "vmla.f32       q8, q2, d2[0]\n"
 219                 "vmla.f32       q9, q2, d2[1]\n"
 220                 "vldr           d4, [%[b_ptr], #0x20]\n"
 221
 222                 "vmla.f32       q10, q3, d0[0]\n"
 223                 "vldr           d5, [%[b_ptr], #0x28]\n"
 224                 "vmla.f32       q11, q3, d0[1]\n"
 225                 "vldr           d0, [%[a_ptr], #0x20]\n"
 226                 "vmla.f32       q12, q3, d1[0]\n"
 227                 "add            %[b_ptr], %[b_ptr], #0x30\n"
 228                 "vmla.f32       q13, q3, d1[1]\n"
 229                 "vldr           d1, [%[a_ptr], #0x28]\n"
 230                 "vmla.f32       q14, q3, d2[0]\n"
 231                 "vmla.f32       q15, q3, d2[1]\n"
 232                 "beq            4f\n"
 233
 234                 // Unroll 1
 235                 "vmla.f32       q4, q2, d3[0]\n"
 236                 "vldr           d6, [%[b_ptr], #0x30]\n"
 237                 "vmla.f32       q5, q2, d3[1]\n"
 238                 "vldr           d7, [%[b_ptr], #0x38]\n"
 239                 "vmla.f32       q6, q2, d0[0]\n"
 240                 "vldr           d2, [%[a_ptr], #0x30]\n"
 241                 "vmla.f32       q7, q2, d0[1]\n"
 242                 "subs           %[tails], %[tails], #1\n"
 243                 "vmla.f32       q8, q2, d1[0]\n"
 244
 245                 "vmla.f32       q9, q2, d1[1]\n"
 246
 247                 "vmla.f32       q10, q3, d3[0]\n"
 248                 "vldr           d4, [%[b_ptr], #0x40]\n"
 249                 "vmla.f32       q11, q3, d3[1]\n"
 250                 "vldr           d5, [%[b_ptr], #0x48]\n"
 251                 "vmla.f32       q12, q3, d0[0]\n"
 252                 "vldr           d3, [%[a_ptr], #0x38]\n"
 253                 "vmla.f32       q13, q3, d0[1]\n"
 254                 "vldr           d0, [%[a_ptr], #0x40]\n"
 255                 "vmla.f32       q14, q3, d1[0]\n"
 256                 "vmla.f32       q15, q3, d1[1]\n"
 257                 "beq            5f\n"
 258
 259                 // Unroll 2
 260                 "vmla.f32       q4, q2, d2[0]\n"
 261                 "vldr           d6, [%[b_ptr], #0x50]\n"
 262                 "vmla.f32       q5, q2, d2[1]\n"
 263                 "vldr           d7, [%[b_ptr], #0x58]\n"
 264                 "vmla.f32       q6, q2, d3[0]\n"
 265                 "vldr           d1, [%[a_ptr], #0x48]\n"
 266                 "vmla.f32       q7, q2, d3[1]\n"
 267                 "vmla.f32       q8, q2, d0[0]\n"
 268                 "vmla.f32       q9, q2, d0[1]\n"
 269
 270                 "vmla.f32       q10, q3, d2[0]\n"
 271                 "vldr           d4, [%[b_ptr], #0x60]\n"
 272                 "vmla.f32       q11, q3, d2[1]\n"
 273                 "vldr           d5, [%[b_ptr], #0x68]\n"
 274                 "vmla.f32       q12, q3, d3[0]\n"
 275                 "vldr           d2, [%[a_ptr], #0x50]\n"
 276                 "vmla.f32       q13, q3, d3[1]\n"
 277                 "vldr           d3, [%[a_ptr], #0x58]\n"
 278                 "vmla.f32       q14, q3, d0[0]\n"
 279                 "vmla.f32       q15, q3, d0[1]\n"
 280
 281                 // Unroll 3
 282                 "vmla.f32       q4, q2, d1[0]\n"
 283                 "vldr           d6, [%[b_ptr], #0x70]\n"
 284                 "vmla.f32       q5, q2, d1[1]\n"
 285                 "vldr           d7, [%[b_ptr], #0x78]\n"
 286                 "vmla.f32       q10, q3, d1[0]\n"
 287                 "vst1.32        {d8-d9}, [%[c_ptr] :128]!\n"
 288                 "vmla.f32       q11, q3, d1[1]\n"
 289                 "vst1.32        {d20-d21}, [%[c_ptr] :128]!\n"
 290                 "vmla.f32       q6, q2, d2[0]\n"
 291                 "vst1.32        {d10-d11}, [%[c_ptr] :128]!\n"
 292                 "vmla.f32       q12, q3, d2[0]\n"
 293                 "vst1.32        {d22-d23}, [%[c_ptr] :128]!\n"
 294                 "vmla.f32       q7, q2, d2[1]\n"
 295                 "vst1.32        {d12-d13}, [%[c_ptr] :128]!\n"
 296                 "vmla.f32       q13, q3, d2[1]\n"
 297                 "vst1.32        {d24-d25}, [%[c_ptr] :128]!\n"
 298                 "vmla.f32       q8, q2, d3[0]\n"
 299                 "vst1.32        {d14-d15}, [%[c_ptr] :128]!\n"
 300                 "vmla.f32       q14, q3, d3[0]\n"
 301                 "vst1.32        {d26-d27}, [%[c_ptr] :128]!\n"
 302                 "vmla.f32       q9, q2, d3[1]\n"
 303                 "vst1.32        {d16-d17}, [%[c_ptr] :128]!\n"
 304                 "vmla.f32       q15, q3, d3[1]\n"
 305                 "vst1.32        {d28-d29}, [%[c_ptr] :128]!\n"
 306                 "add            %[a_ptr], %[a_ptr], #0x60\n"
 307                 "vst1.32        {d18-d19}, [%[c_ptr] :128]!\n"
 308                 "add            %[b_ptr], %[b_ptr], #0x80\n"
 309                 "b              2f\n"
 310
 311                 // tails==1 final tail
 312                 "3:\n"
 313                 "vmla.f32       q4, q2, d0[0]\n"
 314                 "vldr           d6, [%[b_ptr], #0x10]\n"
 315                 "vmla.f32       q5, q2, d0[1]\n"
 316                 "vldr           d7, [%[b_ptr], #0x18]\n"
 317                 "vmla.f32       q6, q2, d1[0]\n"
 318                 "vst1.32        {d8-d9}, [%[c_ptr] :128]!\n"
 319                 "vmla.f32       q10, q3, d0[0]\n"
 320                 "vst1.32        {d20-d21}, [%[c_ptr] :128]!\n"
 321                 "vmla.f32       q11, q3, d0[1]\n"
 322                 "vst1.32        {d10-d11}, [%[c_ptr] :128]!\n"
 323                 "vmla.f32       q12, q3, d1[0]\n"
 324                 "vst1.32        {d22-d23}, [%[c_ptr] :128]!\n"
 325                 "vmla.f32       q7, q2, d1[1]\n"
 326                 "vst1.32        {d12-d13}, [%[c_ptr] :128]!\n"
 327                 "vmla.f32       q13, q3, d1[1]\n"
 328                 "vst1.32        {d24-d25}, [%[c_ptr] :128]!\n"
 329                 "vmla.f32       q8, q2, d2[0]\n"
 330                 "vst1.32        {d14-d15}, [%[c_ptr] :128]!\n"
 331                 "vmla.f32       q14, q3, d2[0]\n"
 332                 "vst1.32        {d26-d27}, [%[c_ptr] :128]!\n"
 333                 "vmla.f32       q9, q2, d2[1]\n"
 334                 "vst1.32        {d16-d17}, [%[c_ptr] :128]!\n"
 335                 "vmla.f32       q15, q3, d2[1]\n"
 336                 "vst1.32        {d28-d29}, [%[c_ptr] :128]!\n"
 337                 "add            %[a_ptr], %[a_ptr], #0x18\n"
 338                 "vst1.32        {d18-d19}, [%[c_ptr] :128]!\n"
 339                 "add            %[b_ptr], %[b_ptr], #0x20\n"
 340                 "b              2f\n"
 341
 342                 // tails==2 final tail
 343                 "4:\n"
 344                 "vmla.f32       q4, q2, d3[0]\n"
 345                 "vldr           d6, [%[b_ptr], #0x30]\n"
 346                 "vmla.f32       q5, q2, d3[1]\n"
 347                 "vldr           d7, [%[b_ptr], #0x38]\n"
 348                 "vmla.f32       q10, q3, d3[0]\n"
 349                 "vst1.32        {d8-d9}, [%[c_ptr] :128]!\n"
 350                 "vmla.f32       q11, q3, d3[1]\n"
 351                 "vst1.32        {d20-d21}, [%[c_ptr] :128]!\n"
 352                 "vmla.f32       q6, q2, d0[0]\n"
 353                 "vst1.32        {d10-d11}, [%[c_ptr] :128]!\n"
 354                 "vmla.f32       q12, q3, d0[0]\n"
 355                 "vst1.32        {d22-d23}, [%[c_ptr] :128]!\n"
 356                 "vmla.f32       q7, q2, d0[1]\n"
 357                 "vst1.32        {d12-d13}, [%[c_ptr] :128]!\n"
 358                 "vmla.f32       q13, q3, d0[1]\n"
 359                 "vst1.32        {d24-d25}, [%[c_ptr] :128]!\n"
 360                 "vmla.f32       q8, q2, d1[0]\n"
 361                 "vst1.32        {d14-d15}, [%[c_ptr] :128]!\n"
 362                 "vmla.f32       q14, q3, d1[0]\n"
 363                 "vst1.32        {d26-d27}, [%[c_ptr] :128]!\n"
 364                 "vmla.f32       q9, q2, d1[1]\n"
 365                 "vst1.32        {d16-d17}, [%[c_ptr] :128]!\n"
 366                 "vmla.f32       q15, q3, d1[1]\n"
 367                 "vst1.32        {d28-d29}, [%[c_ptr] :128]!\n"
 368                 "add            %[b_ptr], %[b_ptr], #0x40\n"
 369                 "vst1.32        {d18-d19}, [%[c_ptr] :128]!\n"
 370                 "add            %[a_ptr], %[a_ptr], #0x30\n"
 371                 "b              2f\n"
 372
 373                 // tails==3 final tail
 374                 "5:\n"
 375                 "vmla.f32       q4, q2, d2[0]\n"
 376                 "vldr           d6, [%[b_ptr], #0x50]\n"
 377                 "vmla.f32       q5, q2, d2[1]\n"
 378                 "vldr           d7, [%[b_ptr], #0x58]\n"
 379                 "vmla.f32       q6, q2, d3[0]\n"
 380                 "vst1.32        {d8-d9}, [%[c_ptr] :128]!\n"
 381                 "vmla.f32       q10, q3, d2[0]\n"
 382                 "vst1.32        {d20-d21}, [%[c_ptr] :128]!\n"
 383                 "vmla.f32       q11, q3, d2[1]\n"
 384                 "vst1.32        {d10-d11}, [%[c_ptr] :128]!\n"
 385                 "vmla.f32       q12, q3, d3[0]\n"
 386                 "vst1.32        {d22-d23}, [%[c_ptr] :128]!\n"
 387                 "vmla.f32       q7, q2, d3[1]\n"
 388                 "vst1.32        {d12-d13}, [%[c_ptr] :128]!\n"
 389                 "vmla.f32       q13, q3, d3[1]\n"
 390                 "vst1.32        {d24-d25}, [%[c_ptr] :128]!\n"
 391                 "vmla.f32       q8, q2, d0[0]\n"
 392                 "vst1.32        {d14-d15}, [%[c_ptr] :128]!\n"
 393                 "vmla.f32       q14, q3, d0[0]\n"
 394                 "vst1.32        {d26-d27}, [%[c_ptr] :128]!\n"
 395                 "vmla.f32       q9, q2, d0[1]\n"
 396                 "vst1.32        {d16-d17}, [%[c_ptr] :128]!\n"
 397                 "vmla.f32       q15, q3, d0[1]\n"
 398                 "vst1.32        {d28-d29}, [%[c_ptr] :128]!\n"
 399                 "add            %[a_ptr], %[a_ptr], #0x48\n"
 400                 "vst1.32        {d18-d19}, [%[c_ptr] :128]!\n"
 401                 "add            %[b_ptr], %[b_ptr], #0x60\n"
 402
 403                 "2:\n"
 404                 "vst1.32        {d30-d31}, [%[c_ptr] :128]!\n"
 405             : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
 406             :
 407             : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
 408             );
 409         }
 410     }
 411 }
 412
 413 #endif /* __arm__ */