kernel/arm/cdot_vfp.S

   1 /***************************************************************************
   2 Copyright (c) 2013, The OpenBLAS Project
   3 All rights reserved.
   4 Redistribution and use in source and binary forms, with or without
   5 modification, are permitted provided that the following conditions are
   6 met:
   7 1. Redistributions of source code must retain the above copyright
   8 notice, this list of conditions and the following disclaimer.
   9 2. Redistributions in binary form must reproduce the above copyright
  10 notice, this list of conditions and the following disclaimer in
  11 the documentation and/or other materials provided with the
  12 distribution.
  13 3. Neither the name of the OpenBLAS project nor the names of
  14 its contributors may be used to endorse or promote products
  15 derived from this software without specific prior written permission.
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26 *****************************************************************************/
  27
  28 /**************************************************************************************
  29 * 2013/11/11 Saar
  30 *        BLASTEST               : OK
  31 *        CTEST                  : OK
  32 *        TEST                   : OK
  33 *
  34 **************************************************************************************/
  35
  36 #define ASSEMBLER
  37 #include "common.h"
  38
   39 #define STACKSIZE 256           /* bytes subtracted from sp in the prologue */
   40
   41 #define N       r0              /* element count; drives both loop counters */
   42 #define X       r1              /* pointer walked through the first input vector */
   43 #define INC_X   r2              /* stride of X; shifted left by 3 (to bytes) on the strided path */
   44 #define OLD_Y   r3              /* incoming pointer to the second vector, copied into Y */
   45
   46
   47 /******************************************************
   48 * [fp, #-128] - [fp, #-64] is reserved
   49 * for store and restore of floating point
   50 * registers; this kernel saves s8 - s15 starting at [fp, #-128]
   51 *******************************************************/
   52
   53 #define OLD_INC_Y       [fp, #4 ]       /* stack location INC_Y is loaded from */
   54
   55 #define I       r5              /* loop counter for the unrolled and remainder loops */
   56 #define Y       r6              /* pointer walked through the second input vector */
   57 #define INC_Y   r7              /* stride of Y; shifted left by 3 (to bytes) on the strided path */
   58
   59 #define X_PRE   512             /* byte offset used by the pld prefetches in KERNEL_F4 */
  60
  61 /**************************************************************************************
  62 * Macro definitions
  63 **************************************************************************************/
  64
  65 .macro KERNEL_F4
  66
  67         pld     [ X, #X_PRE  ]
  68         pld     [ Y, #X_PRE  ]
  69
  70         fldmias X!, { s4 - s5 }
  71         fldmias Y!, { s8 - s9 }
  72         fmacs   s0  , s4,  s8
  73         fmacs   s1  , s4,  s9
  74         fldmias X!, { s6 - s7 }
  75         fmacs   s2  , s5,  s9
  76         fmacs   s3  , s5,  s8
  77
  78         fldmias Y!, { s10 - s11 }
  79         fmacs   s0  , s6,  s10
  80         fmacs   s1  , s6,  s11
  81         fmacs   s2  , s7,  s11
  82         fmacs   s3  , s7,  s10
  83
  84
  85         fldmias X!, { s4 - s5 }
  86         fldmias Y!, { s8 - s9 }
  87         fmacs   s0  , s4,  s8
  88         fmacs   s1  , s4,  s9
  89         fldmias X!, { s6 - s7 }
  90         fmacs   s2  , s5,  s9
  91         fmacs   s3  , s5,  s8
  92
  93         fldmias Y!, { s10 - s11 }
  94         fmacs   s0  , s6,  s10
  95         fmacs   s1  , s6,  s11
  96         fmacs   s2  , s7,  s11
  97         fmacs   s3  , s7,  s10
  98
  99 .endm
 100
 101 .macro KERNEL_F1
 102
 103         fldmias X!, { s4 - s5 }
 104         fldmias Y!, { s8 - s9 }
 105         fmacs   s0  , s4,  s8
 106         fmacs   s1  , s4,  s9
 107         fmacs   s2  , s5,  s9
 108         fmacs   s3  , s5,  s8
 109
 110 .endm
 111
 112
 113 /*************************************************************************************************************************/
 114
 115 .macro KERNEL_S4
 116
 117         nop
 118
 119         fldmias X, { s4 - s5 }
 120         fldmias Y, { s8 - s9 }
 121         fmacs   s0  , s4,  s8
 122         fmacs   s1  , s4,  s9
 123         fmacs   s2  , s5,  s9
 124         fmacs   s3  , s5,  s8
 125         add     X, X, INC_X
 126         add     Y, Y, INC_Y
 127
 128         fldmias X, { s4 - s5 }
 129         fldmias Y, { s8 - s9 }
 130         fmacs   s0  , s4,  s8
 131         fmacs   s1  , s4,  s9
 132         fmacs   s2  , s5,  s9
 133         fmacs   s3  , s5,  s8
 134         add     X, X, INC_X
 135         add     Y, Y, INC_Y
 136
 137         fldmias X, { s4 - s5 }
 138         fldmias Y, { s8 - s9 }
 139         fmacs   s0  , s4,  s8
 140         fmacs   s1  , s4,  s9
 141         fmacs   s2  , s5,  s9
 142         fmacs   s3  , s5,  s8
 143         add     X, X, INC_X
 144         add     Y, Y, INC_Y
 145
 146         fldmias X, { s4 - s5 }
 147         fldmias Y, { s8 - s9 }
 148         fmacs   s0  , s4,  s8
 149         fmacs   s1  , s4,  s9
 150         fmacs   s2  , s5,  s9
 151         fmacs   s3  , s5,  s8
 152         add     X, X, INC_X
 153         add     Y, Y, INC_Y
 154
 155 .endm
 156
 157
 158 .macro KERNEL_S1
 159
 160         fldmias X, { s4 - s5 }
 161         fldmias Y, { s8 - s9 }
 162         fmacs   s0  , s4,  s8
 163         fmacs   s1  , s4,  s9
 164         fmacs   s2  , s5,  s9
 165         fmacs   s3  , s5,  s8
 166         add     X, X, INC_X
 167         add     Y, Y, INC_Y
 168
 169 .endm
 170
 171
 172
 173 /**************************************************************************************
 174 * End of macro definitions
 175 **************************************************************************************/
 176
 177         PROLOGUE
 178
 179         .align 5
 180
 181         push    {r4 - r9, fp}
 182         add     fp, sp, #24
 183         sub     sp, sp, #STACKSIZE                              // reserve stack
 184
 185         sub     r4, fp, #128
 186         vstm    r4, { s8 - s15}                                 // store floating point registers
 187
 188         movs    r4, #0                                          // clear floating point register
 189         vmov    s0, r4
 190         vmov    s1, s0
 191         vmov    s2, s0
 192         vmov    s3, s0
 193
 194         mov     Y, OLD_Y
 195         ldr     INC_Y, OLD_INC_Y
 196
 197         cmp     N, #0
 198         ble     cdot_kernel_L999
 199
 200         cmp     INC_X, #0
 201         beq     cdot_kernel_L999
 202
 203         cmp     INC_Y, #0
 204         beq     cdot_kernel_L999
 205
 206         cmp     INC_X, #1
 207         bne     cdot_kernel_S_BEGIN
 208
 209         cmp     INC_Y, #1
 210         bne     cdot_kernel_S_BEGIN
 211
 212 cdot_kernel_F_BEGIN:
 213
 214         asrs    I, N, #2                                        // I = N / 4
 215         ble     cdot_kernel_F1
 216
 217 cdot_kernel_F4:
 218
 219         KERNEL_F4
 220
 221         subs    I, I, #1
 222         bne     cdot_kernel_F4
 223
 224 cdot_kernel_F1:
 225
 226         ands    I, N, #3
 227         ble     cdot_kernel_L999
 228
 229 cdot_kernel_F10:
 230
 231         KERNEL_F1
 232
 233         subs    I, I, #1
 234         bne     cdot_kernel_F10
 235
 236         b       cdot_kernel_L999
 237
 238 cdot_kernel_S_BEGIN:
 239
 240         lsl     INC_X, INC_X, #3                                // INC_X * SIZE * 2
 241         lsl     INC_Y, INC_Y, #3                                // INC_Y * SIZE * 2
 242
 243         asrs    I, N, #2                                        // I = N / 4
 244         ble     cdot_kernel_S1
 245
 246 cdot_kernel_S4:
 247
 248         KERNEL_S4
 249
 250         subs    I, I, #1
 251         bne     cdot_kernel_S4
 252
 253 cdot_kernel_S1:
 254
 255         ands    I, N, #3
 256         ble     cdot_kernel_L999
 257
 258 cdot_kernel_S10:
 259
 260         KERNEL_S1
 261
 262         subs    I, I, #1
 263         bne     cdot_kernel_S10
 264
 265
 266
 267 cdot_kernel_L999:
 268
 269         sub     r3, fp, #128
 270         vldm    r3, { s8 - s15}                                 // restore floating point registers
 271
 272 #if !defined(CONJ)
 273         vsub.f32        s0 , s0, s2
 274         vadd.f32        s1 , s1, s3
 275 #else
 276         vadd.f32        s0 , s0, s2
 277         vsub.f32        s1 , s1, s3
 278 #endif
 279
 280         sub     sp, fp, #24
 281         pop     {r4 - r9, fp}
 282         bx      lr
 283
 284         EPILOGUE
 285