ARM64: Convert all labels to local labels
authorAshwin Sekhar T K <ashwin.sekhar@cavium.com>
Tue, 24 Oct 2017 10:47:11 +0000 (10:47 +0000)
committerAshwin Sekhar T K <ashwin.sekhar@cavium.com>
Tue, 24 Oct 2017 11:40:05 +0000 (11:40 +0000)
When applications are debugged or profiled using perf or other tools, the
kernels appear scattered in the profile reports. This is because the labels
within the kernels are not local, so each label is shown as a separate
function.

To avoid this, all the labels within the kernels are changed to local
labels.

50 files changed:
kernel/arm64/amax.S
kernel/arm64/asum.S
kernel/arm64/axpy.S
kernel/arm64/casum.S
kernel/arm64/cgemm_kernel_4x4.S
kernel/arm64/cgemm_kernel_8x4.S
kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
kernel/arm64/copy.S
kernel/arm64/ctrmm_kernel_4x4.S
kernel/arm64/ctrmm_kernel_8x4.S
kernel/arm64/daxpy_thunderx2t99.S
kernel/arm64/dgemm_kernel_4x4.S
kernel/arm64/dgemm_kernel_4x8.S
kernel/arm64/dgemm_kernel_8x4.S
kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S
kernel/arm64/dgemm_ncopy_4.S
kernel/arm64/dgemm_ncopy_8.S
kernel/arm64/dgemm_tcopy_4.S
kernel/arm64/dgemm_tcopy_8.S
kernel/arm64/dot.S
kernel/arm64/dtrmm_kernel_4x4.S
kernel/arm64/dtrmm_kernel_4x8.S
kernel/arm64/dtrmm_kernel_8x4.S
kernel/arm64/gemv_n.S
kernel/arm64/gemv_t.S
kernel/arm64/iamax.S
kernel/arm64/izamax.S
kernel/arm64/nrm2.S
kernel/arm64/rot.S
kernel/arm64/scal.S
kernel/arm64/sgemm_kernel_16x4.S
kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
kernel/arm64/sgemm_kernel_4x4.S
kernel/arm64/sgemm_kernel_8x8.S
kernel/arm64/strmm_kernel_16x4.S
kernel/arm64/strmm_kernel_4x4.S
kernel/arm64/strmm_kernel_8x8.S
kernel/arm64/swap.S
kernel/arm64/zamax.S
kernel/arm64/zasum.S
kernel/arm64/zaxpy.S
kernel/arm64/zdot.S
kernel/arm64/zgemm_kernel_4x4.S
kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
kernel/arm64/zgemv_n.S
kernel/arm64/zgemv_t.S
kernel/arm64/znrm2.S
kernel/arm64/zrot.S
kernel/arm64/zscal.S
kernel/arm64/ztrmm_kernel_4x4.S

index c02321a..f535ddf 100644 (file)
@@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     amax_kernel_zero
+       ble     .Lamax_kernel_zero
        cmp     INC_X, xzr
-       ble     amax_kernel_zero
+       ble     .Lamax_kernel_zero
 
        cmp     INC_X, #1
-       bne     amax_kernel_S_BEGIN
+       bne     .Lamax_kernel_S_BEGIN
 
-amax_kernel_F_BEGIN:
+.Lamax_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     amax_kernel_F1_INIT
+       beq     .Lamax_kernel_F1_INIT
 
        INIT_F4
        subs    I, I, #1
-       beq     amax_kernel_F1
+       beq     .Lamax_kernel_F1
 
-amax_kernel_F4:
+.Lamax_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     amax_kernel_F4
+       bne     .Lamax_kernel_F4
 
-amax_kernel_F1:
+.Lamax_kernel_F1:
 
        ands    I, N, #3
-       ble     amax_kernel_L999
+       ble     .Lamax_kernel_L999
 
-amax_kernel_F10:
+.Lamax_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     amax_kernel_F10
+        bne     .Lamax_kernel_F10
 
        ret
 
-amax_kernel_F1_INIT:
+.Lamax_kernel_F1_INIT:
 
        INIT_F1
        subs    N, N, #1
-       b       amax_kernel_F1
+       b       .Lamax_kernel_F1
 
-amax_kernel_S_BEGIN:
+.Lamax_kernel_S_BEGIN:
 
        INIT_S
 
        subs    N, N, #1
-       ble     amax_kernel_L999
+       ble     .Lamax_kernel_L999
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     amax_kernel_S1
+       ble     .Lamax_kernel_S1
 
-amax_kernel_S4:
+.Lamax_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -223,25 +223,25 @@ amax_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     amax_kernel_S4
+       bne     .Lamax_kernel_S4
 
-amax_kernel_S1:
+.Lamax_kernel_S1:
 
        ands    I, N, #3
-       ble     amax_kernel_L999
+       ble     .Lamax_kernel_L999
 
-amax_kernel_S10:
+.Lamax_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     amax_kernel_S10
+        bne     .Lamax_kernel_S10
 
-amax_kernel_L999:
+.Lamax_kernel_L999:
 
        ret
 
-amax_kernel_zero:
+.Lamax_kernel_zero:
 
        fmov    MAXF, REG0
        ret
index bee8927..e88eb07 100644 (file)
@@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        cmp     N, xzr
-       ble     asum_kernel_L999
+       ble     .Lasum_kernel_L999
        cmp     INC_X, xzr
-       ble     asum_kernel_L999
+       ble     .Lasum_kernel_L999
 
        cmp     INC_X, #1
-       bne     asum_kernel_S_BEGIN
+       bne     .Lasum_kernel_S_BEGIN
 
-asum_kernel_F_BEGIN:
+.Lasum_kernel_F_BEGIN:
 
        asr     I, N, #3
        cmp     I, xzr
-       beq     asum_kernel_F1
+       beq     .Lasum_kernel_F1
 
-asum_kernel_F8:
+.Lasum_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     asum_kernel_F8
+       bne     .Lasum_kernel_F8
 
        KERNEL_F8_FINALIZE
 
-asum_kernel_F1:
+.Lasum_kernel_F1:
 
        ands    I, N, #7
-       ble     asum_kernel_L999
+       ble     .Lasum_kernel_L999
 
-asum_kernel_F10:
+.Lasum_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     asum_kernel_F10
+        bne     .Lasum_kernel_F10
 
-asum_kernel_L999:
+.Lasum_kernel_L999:
        ret
 
-asum_kernel_S_BEGIN:
+.Lasum_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     asum_kernel_S1
+       ble     .Lasum_kernel_S1
 
-asum_kernel_S4:
+.Lasum_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -175,19 +175,19 @@ asum_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     asum_kernel_S4
+       bne     .Lasum_kernel_S4
 
-asum_kernel_S1:
+.Lasum_kernel_S1:
 
        ands    I, N, #3
-       ble     asum_kernel_L999
+       ble     .Lasum_kernel_L999
 
-asum_kernel_S10:
+.Lasum_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     asum_kernel_S10
+        bne     .Lasum_kernel_S10
 
        ret
 
index 554902c..8094351 100644 (file)
@@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     axpy_kernel_L999
+       ble     .Laxpy_kernel_L999
 
        fcmp    DA, #0.0
-       beq     axpy_kernel_L999
+       beq     .Laxpy_kernel_L999
 
        cmp     INC_X, #1
-       bne     axpy_kernel_S_BEGIN
+       bne     .Laxpy_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     axpy_kernel_S_BEGIN
+       bne     .Laxpy_kernel_S_BEGIN
 
-axpy_kernel_F_BEGIN:
+.Laxpy_kernel_F_BEGIN:
 
        asr     I, N, #3
        cmp     I, xzr
-       beq     axpy_kernel_F1
+       beq     .Laxpy_kernel_F1
 
-axpy_kernel_F8:
+.Laxpy_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     axpy_kernel_F8
+       bne     .Laxpy_kernel_F8
 
-axpy_kernel_F1:
+.Laxpy_kernel_F1:
 
        ands    I, N, #7
-       ble     axpy_kernel_L999
+       ble     .Laxpy_kernel_L999
 
-axpy_kernel_F10:
+.Laxpy_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     axpy_kernel_F10
+        bne     .Laxpy_kernel_F10
 
        mov     w0, wzr
        ret
 
-axpy_kernel_S_BEGIN:
+.Laxpy_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     axpy_kernel_S1
+       ble     .Laxpy_kernel_S1
 
-axpy_kernel_S4:
+.Laxpy_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -189,21 +189,21 @@ axpy_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     axpy_kernel_S4
+       bne     .Laxpy_kernel_S4
 
-axpy_kernel_S1:
+.Laxpy_kernel_S1:
 
        ands    I, N, #3
-       ble     axpy_kernel_L999
+       ble     .Laxpy_kernel_L999
 
-axpy_kernel_S10:
+.Laxpy_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     axpy_kernel_S10
+        bne     .Laxpy_kernel_S10
 
-axpy_kernel_L999:
+.Laxpy_kernel_L999:
 
        mov     w0, wzr
        ret
index 8f09eec..7c82827 100644 (file)
@@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        fmov    s1, SUMF
 
        cmp     N, xzr
-       ble     asum_kernel_L999
+       ble     .Lcasum_kernel_L999
        cmp     INC_X, xzr
-       ble     asum_kernel_L999
+       ble     .Lcasum_kernel_L999
 
        cmp     INC_X, #1
-       bne     asum_kernel_S_BEGIN
+       bne     .Lcasum_kernel_S_BEGIN
 
-asum_kernel_F_BEGIN:
+.Lcasum_kernel_F_BEGIN:
 
        asr     I, N, #3
        cmp     I, xzr
-       beq     asum_kernel_F1
+       beq     .Lcasum_kernel_F1
 
-asum_kernel_F8:
+.Lcasum_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     asum_kernel_F8
+       bne     .Lcasum_kernel_F8
 
        KERNEL_F8_FINALIZE
 
-asum_kernel_F1:
+.Lcasum_kernel_F1:
 
        ands    I, N, #7
-       ble     asum_kernel_L999
+       ble     .Lcasum_kernel_L999
 
-asum_kernel_F10:
+.Lcasum_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     asum_kernel_F10
+        bne     .Lcasum_kernel_F10
 
-asum_kernel_L999:
+.Lcasum_kernel_L999:
        ret
 
-asum_kernel_S_BEGIN:
+.Lcasum_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     asum_kernel_S1
+       ble     .Lcasum_kernel_S1
 
-asum_kernel_S4:
+.Lcasum_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -151,19 +151,19 @@ asum_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     asum_kernel_S4
+       bne     .Lcasum_kernel_S4
 
-asum_kernel_S1:
+.Lcasum_kernel_S1:
 
        ands    I, N, #3
-       ble     asum_kernel_L999
+       ble     .Lcasum_kernel_L999
 
-asum_kernel_S10:
+.Lcasum_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     asum_kernel_S10
+        bne     .Lcasum_kernel_S10
 
        ret
 
index 7f2ddea..bbf0c75 100644 (file)
@@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     cgemm_kernel_L2_BEGIN
+       ble     .Lcgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-cgemm_kernel_L4_BEGIN:
+.Lcgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
 
@@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
        mov     pA, origPA                      // pA = start of A array
        add     ppA, temp, pA
 
-cgemm_kernel_L4_M8_BEGIN:
+.Lcgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L4_M4_BEGIN
+       ble     .Lcgemm_kernel_L4_M4_BEGIN
 
-cgemm_kernel_L4_M8_20:
+.Lcgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     cgemm_kernel_L4_M8_32
+       blt     .Lcgemm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     cgemm_kernel_L4_M8_22a
+       ble     .Lcgemm_kernel_L4_M8_22a
        .align 5
 
-cgemm_kernel_L4_M8_22:
+.Lcgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M8_22
+       bgt     .Lcgemm_kernel_L4_M8_22
 
 
-cgemm_kernel_L4_M8_22a:
+.Lcgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        cgemm_kernel_L4_M8_44
+       b        .Lcgemm_kernel_L4_M8_44
 
-cgemm_kernel_L4_M8_32:
+.Lcgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     cgemm_kernel_L4_M8_40
+       ble     .Lcgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_E
 
-       b       cgemm_kernel_L4_M8_44
+       b       .Lcgemm_kernel_L4_M8_44
 
 
-cgemm_kernel_L4_M8_40:
+.Lcgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-cgemm_kernel_L4_M8_44:
+.Lcgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #1
-       ble     cgemm_kernel_L4_M8_100
+       ble     .Lcgemm_kernel_L4_M8_100
 
-cgemm_kernel_L4_M8_46:
+.Lcgemm_kernel_L4_M8_46:
        KERNEL8x4_SUB
 
-cgemm_kernel_L4_M8_100:
+.Lcgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-cgemm_kernel_L4_M8_END:
+.Lcgemm_kernel_L4_M8_END:
        lsl     temp, origK, #5                 // k * 4 * 8
        add     pA, pA, temp
        add     ppA, ppA, temp
        subs    counterI, counterI, #1
-       bne     cgemm_kernel_L4_M8_20
+       bne     .Lcgemm_kernel_L4_M8_20
 
 
-cgemm_kernel_L4_M4_BEGIN:
+.Lcgemm_kernel_L4_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     cgemm_kernel_L4_M2_BEGIN
+       ble     .Lcgemm_kernel_L4_M2_BEGIN
 
-cgemm_kernel_L4_M4_20:
+.Lcgemm_kernel_L4_M4_20:
 
        INIT4x4
 
        mov     pB, origPB
        asr     counterL, origK, #3             // counterL = counterL / 8
        cmp     counterL, #0
-       ble     cgemm_kernel_L4_M4_40
+       ble     .Lcgemm_kernel_L4_M4_40
 
-cgemm_kernel_L4_M4_22:
+.Lcgemm_kernel_L4_M4_22:
 
        KERNEL4x4_SUB
        KERNEL4x4_SUB
@@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M4_22
+       bgt     .Lcgemm_kernel_L4_M4_22
 
 
-cgemm_kernel_L4_M4_40:
+.Lcgemm_kernel_L4_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M4_100
+       ble     .Lcgemm_kernel_L4_M4_100
 
-cgemm_kernel_L4_M4_42:
+.Lcgemm_kernel_L4_M4_42:
 
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M4_42
+       bgt     .Lcgemm_kernel_L4_M4_42
 
-cgemm_kernel_L4_M4_100:
+.Lcgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-cgemm_kernel_L4_M4_END:
+.Lcgemm_kernel_L4_M4_END:
 
 
-cgemm_kernel_L4_M2_BEGIN:
+.Lcgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L4_M1_BEGIN
+       ble     .Lcgemm_kernel_L4_M1_BEGIN
 
-cgemm_kernel_L4_M2_20:
+.Lcgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L4_M2_40
+       ble     .Lcgemm_kernel_L4_M2_40
 
-cgemm_kernel_L4_M2_22:
+.Lcgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M2_22
+       bgt     .Lcgemm_kernel_L4_M2_22
 
 
-cgemm_kernel_L4_M2_40:
+.Lcgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M2_100
+       ble     .Lcgemm_kernel_L4_M2_100
 
-cgemm_kernel_L4_M2_42:
+.Lcgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M2_42
+       bgt     .Lcgemm_kernel_L4_M2_42
 
-cgemm_kernel_L4_M2_100:
+.Lcgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-cgemm_kernel_L4_M2_END:
+.Lcgemm_kernel_L4_M2_END:
 
 
-cgemm_kernel_L4_M1_BEGIN:
+.Lcgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
-cgemm_kernel_L4_M1_20:
+.Lcgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L4_M1_40
+       ble     .Lcgemm_kernel_L4_M1_40
 
-cgemm_kernel_L4_M1_22:
+.Lcgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M1_22
+       bgt     .Lcgemm_kernel_L4_M1_22
 
 
-cgemm_kernel_L4_M1_40:
+.Lcgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M1_100
+       ble     .Lcgemm_kernel_L4_M1_100
 
-cgemm_kernel_L4_M1_42:
+.Lcgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M1_42
+       bgt     .Lcgemm_kernel_L4_M1_42
 
-cgemm_kernel_L4_M1_100:
+.Lcgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-cgemm_kernel_L4_END:
+.Lcgemm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     cgemm_kernel_L4_BEGIN
+       bgt     .Lcgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-cgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lcgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     cgemm_kernel_L999   // error, N was less than 4?
+       ble     .Lcgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     cgemm_kernel_L1_BEGIN
+       ble     .Lcgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 
 
-cgemm_kernel_L2_M4_BEGIN:
+.Lcgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     cgemm_kernel_L2_M2_BEGIN
+       ble     .Lcgemm_kernel_L2_M2_BEGIN
 
-cgemm_kernel_L2_M4_20:
+.Lcgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     cgemm_kernel_L2_M4_40
+       ble     .Lcgemm_kernel_L2_M4_40
        .align 5
 
-cgemm_kernel_L2_M4_22:
+.Lcgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M4_22
+       bgt     .Lcgemm_kernel_L2_M4_22
 
 
-cgemm_kernel_L2_M4_40:
+.Lcgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M4_100
+       ble     .Lcgemm_kernel_L2_M4_100
 
-cgemm_kernel_L2_M4_42:
+.Lcgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M4_42
+       bgt     .Lcgemm_kernel_L2_M4_42
 
-cgemm_kernel_L2_M4_100:
+.Lcgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-cgemm_kernel_L2_M4_END:
+.Lcgemm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     cgemm_kernel_L2_M4_20
+       bgt     .Lcgemm_kernel_L2_M4_20
 
 
-cgemm_kernel_L2_M2_BEGIN:
+.Lcgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L2_M1_BEGIN
+       ble     .Lcgemm_kernel_L2_M1_BEGIN
 
-cgemm_kernel_L2_M2_20:
+.Lcgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     cgemm_kernel_L2_M2_40
+       ble     .Lcgemm_kernel_L2_M2_40
 
-cgemm_kernel_L2_M2_22:
+.Lcgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M2_22
+       bgt     .Lcgemm_kernel_L2_M2_22
 
 
-cgemm_kernel_L2_M2_40:
+.Lcgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M2_100
+       ble     .Lcgemm_kernel_L2_M2_100
 
-cgemm_kernel_L2_M2_42:
+.Lcgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M2_42
+       bgt     .Lcgemm_kernel_L2_M2_42
 
-cgemm_kernel_L2_M2_100:
+.Lcgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-cgemm_kernel_L2_M2_END:
+.Lcgemm_kernel_L2_M2_END:
 
 
-cgemm_kernel_L2_M1_BEGIN:
+.Lcgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
-cgemm_kernel_L2_M1_20:
+.Lcgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     cgemm_kernel_L2_M1_40
+       ble     .Lcgemm_kernel_L2_M1_40
 
-cgemm_kernel_L2_M1_22:
+.Lcgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M1_22
+       bgt     .Lcgemm_kernel_L2_M1_22
 
 
-cgemm_kernel_L2_M1_40:
+.Lcgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M1_100
+       ble     .Lcgemm_kernel_L2_M1_100
 
-cgemm_kernel_L2_M1_42:
+.Lcgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M1_42
+       bgt     .Lcgemm_kernel_L2_M1_42
 
-cgemm_kernel_L2_M1_100:
+.Lcgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-cgemm_kernel_L2_END:
+.Lcgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-cgemm_kernel_L1_BEGIN:
+.Lcgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     cgemm_kernel_L999 // done
+       ble     .Lcgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:
 
 
 
-cgemm_kernel_L1_M4_BEGIN:
+.Lcgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     cgemm_kernel_L1_M2_BEGIN
+       ble     .Lcgemm_kernel_L1_M2_BEGIN
 
-cgemm_kernel_L1_M4_20:
+.Lcgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M4_40
+       ble     .Lcgemm_kernel_L1_M4_40
        .align 5
 
-cgemm_kernel_L1_M4_22:
+.Lcgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M4_22
+       bgt     .Lcgemm_kernel_L1_M4_22
 
 
-cgemm_kernel_L1_M4_40:
+.Lcgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M4_100
+       ble     .Lcgemm_kernel_L1_M4_100
 
-cgemm_kernel_L1_M4_42:
+.Lcgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M4_42
+       bgt     .Lcgemm_kernel_L1_M4_42
 
-cgemm_kernel_L1_M4_100:
+.Lcgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-cgemm_kernel_L1_M4_END:
+.Lcgemm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     cgemm_kernel_L1_M4_20
+       bgt     .Lcgemm_kernel_L1_M4_20
 
 
-cgemm_kernel_L1_M2_BEGIN:
+.Lcgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L1_M1_BEGIN
+       ble     .Lcgemm_kernel_L1_M1_BEGIN
 
-cgemm_kernel_L1_M2_20:
+.Lcgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M2_40
+       ble     .Lcgemm_kernel_L1_M2_40
 
-cgemm_kernel_L1_M2_22:
+.Lcgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M2_22
+       bgt     .Lcgemm_kernel_L1_M2_22
 
 
-cgemm_kernel_L1_M2_40:
+.Lcgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M2_100
+       ble     .Lcgemm_kernel_L1_M2_100
 
-cgemm_kernel_L1_M2_42:
+.Lcgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M2_42
+       bgt     .Lcgemm_kernel_L1_M2_42
 
-cgemm_kernel_L1_M2_100:
+.Lcgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-cgemm_kernel_L1_M2_END:
+.Lcgemm_kernel_L1_M2_END:
 
 
-cgemm_kernel_L1_M1_BEGIN:
+.Lcgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
-cgemm_kernel_L1_M1_20:
+.Lcgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M1_40
+       ble     .Lcgemm_kernel_L1_M1_40
 
-cgemm_kernel_L1_M1_22:
+.Lcgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M1_22
+       bgt     .Lcgemm_kernel_L1_M1_22
 
 
-cgemm_kernel_L1_M1_40:
+.Lcgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M1_100
+       ble     .Lcgemm_kernel_L1_M1_100
 
-cgemm_kernel_L1_M1_42:
+.Lcgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M1_42
+       bgt     .Lcgemm_kernel_L1_M1_42
 
-cgemm_kernel_L1_M1_100:
+.Lcgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-cgemm_kernel_L1_END:
+.Lcgemm_kernel_L1_END:
 
 
-cgemm_kernel_L999:
+.Lcgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 5d14628..24e08a6 100644 (file)
@@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     cgemm_kernel_L2_BEGIN
+       ble     .Lcgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-cgemm_kernel_L4_BEGIN:
+.Lcgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-cgemm_kernel_L4_M8_BEGIN:
+.Lcgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L4_M4_BEGIN
+       ble     .Lcgemm_kernel_L4_M4_BEGIN
 
        .align 5
-cgemm_kernel_L4_M8_20:
+.Lcgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #3
        cmp     counterL , #2
-       blt     cgemm_kernel_L4_M8_32
+       blt     .Lcgemm_kernel_L4_M8_32
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     cgemm_kernel_L4_M8_22a
+       ble     .Lcgemm_kernel_L4_M8_22a
 
        .align 5
-cgemm_kernel_L4_M8_22:
+.Lcgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M8_22
+       bgt     .Lcgemm_kernel_L4_M8_22
 
        .align 5
-cgemm_kernel_L4_M8_22a:
+.Lcgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        cgemm_kernel_L4_M8_44
+       b        .Lcgemm_kernel_L4_M8_44
 
        .align 5
-cgemm_kernel_L4_M8_32:
+.Lcgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     cgemm_kernel_L4_M8_40
+       ble     .Lcgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       cgemm_kernel_L4_M8_44
+       b       .Lcgemm_kernel_L4_M8_44
 
-cgemm_kernel_L4_M8_40:
+.Lcgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-cgemm_kernel_L4_M8_44:
+.Lcgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #7
-       ble     cgemm_kernel_L4_M8_100
+       ble     .Lcgemm_kernel_L4_M8_100
 
        .align 5
-cgemm_kernel_L4_M8_46:
+.Lcgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     cgemm_kernel_L4_M8_46
+       bne     .Lcgemm_kernel_L4_M8_46
 
-cgemm_kernel_L4_M8_100:
+.Lcgemm_kernel_L4_M8_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE8x4
 
-cgemm_kernel_L4_M8_END:
+.Lcgemm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     cgemm_kernel_L4_M8_20
+       bne     .Lcgemm_kernel_L4_M8_20
 
-cgemm_kernel_L4_M4_BEGIN:
+.Lcgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     cgemm_kernel_L4_M2_BEGIN
+       ble     .Lcgemm_kernel_L4_M2_BEGIN
 
 
-cgemm_kernel_L4_M4_20:
+.Lcgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
        
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     cgemm_kernel_L4_M4_32
+       blt     .Lcgemm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     cgemm_kernel_L4_M4_22a
+       ble     .Lcgemm_kernel_L4_M4_22a
        .align 5
 
 
-cgemm_kernel_L4_M4_22:
+.Lcgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M4_22
+       bgt     .Lcgemm_kernel_L4_M4_22
 
-cgemm_kernel_L4_M4_22a:
+.Lcgemm_kernel_L4_M4_22a:
        KERNEL4x4_M1
        KERNEL4x4_E
-       b        cgemm_kernel_L4_M4_44
-cgemm_kernel_L4_M4_32:
+       b        .Lcgemm_kernel_L4_M4_44
+.Lcgemm_kernel_L4_M4_32:
        tst     counterL, #1
-       ble     cgemm_kernel_L4_M4_40
+       ble     .Lcgemm_kernel_L4_M4_40
        KERNEL4x4_I
        KERNEL4x4_E
-       b       cgemm_kernel_L4_M4_44
-cgemm_kernel_L4_M4_40:
+       b       .Lcgemm_kernel_L4_M4_44
+.Lcgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-cgemm_kernel_L4_M4_44:
+.Lcgemm_kernel_L4_M4_44:
        ands    counterL , origK, #1
-       ble     cgemm_kernel_L4_M4_100
+       ble     .Lcgemm_kernel_L4_M4_100
 
-cgemm_kernel_L4_M4_46:
+.Lcgemm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
-cgemm_kernel_L4_M4_100:
+.Lcgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-cgemm_kernel_L4_M4_END:
+.Lcgemm_kernel_L4_M4_END:
 
-cgemm_kernel_L4_M2_BEGIN:
+.Lcgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L4_M1_BEGIN
+       ble     .Lcgemm_kernel_L4_M1_BEGIN
 
-cgemm_kernel_L4_M2_20:
+.Lcgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L4_M2_40
+       ble     .Lcgemm_kernel_L4_M2_40
 
-cgemm_kernel_L4_M2_22:
+.Lcgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M2_22
+       bgt     .Lcgemm_kernel_L4_M2_22
 
 
-cgemm_kernel_L4_M2_40:
+.Lcgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M2_100
+       ble     .Lcgemm_kernel_L4_M2_100
 
-cgemm_kernel_L4_M2_42:
+.Lcgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M2_42
+       bgt     .Lcgemm_kernel_L4_M2_42
 
-cgemm_kernel_L4_M2_100:
+.Lcgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-cgemm_kernel_L4_M2_END:
+.Lcgemm_kernel_L4_M2_END:
 
 
-cgemm_kernel_L4_M1_BEGIN:
+.Lcgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
-cgemm_kernel_L4_M1_20:
+.Lcgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L4_M1_40
+       ble     .Lcgemm_kernel_L4_M1_40
 
-cgemm_kernel_L4_M1_22:
+.Lcgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M1_22
+       bgt     .Lcgemm_kernel_L4_M1_22
 
 
-cgemm_kernel_L4_M1_40:
+.Lcgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M1_100
+       ble     .Lcgemm_kernel_L4_M1_100
 
-cgemm_kernel_L4_M1_42:
+.Lcgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M1_42
+       bgt     .Lcgemm_kernel_L4_M1_42
 
-cgemm_kernel_L4_M1_100:
+.Lcgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-cgemm_kernel_L4_END:
+.Lcgemm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     cgemm_kernel_L4_BEGIN
+       bgt     .Lcgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-cgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lcgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     cgemm_kernel_L999   // error, N was less than 4?
+       ble     .Lcgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     cgemm_kernel_L1_BEGIN
+       ble     .Lcgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
        mov     pA, origPA                      // pA = A
 
 
-cgemm_kernel_L2_M8_BEGIN:
+.Lcgemm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L2_M4_BEGIN
+       ble     .Lcgemm_kernel_L2_M4_BEGIN
 
-cgemm_kernel_L2_M8_20:
+.Lcgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     cgemm_kernel_L2_M8_40
+       ble     .Lcgemm_kernel_L2_M8_40
        .align 5
 
-cgemm_kernel_L2_M8_22:
+.Lcgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M8_22
+       bgt     .Lcgemm_kernel_L2_M8_22
 
 
-cgemm_kernel_L2_M8_40:
+.Lcgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M8_100
+       ble     .Lcgemm_kernel_L2_M8_100
 
-cgemm_kernel_L2_M8_42:
+.Lcgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M8_42
+       bgt     .Lcgemm_kernel_L2_M8_42
 
-cgemm_kernel_L2_M8_100:
+.Lcgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-cgemm_kernel_L2_M8_END:
+.Lcgemm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     cgemm_kernel_L2_M8_20
+       bgt     .Lcgemm_kernel_L2_M8_20
 
-cgemm_kernel_L2_M4_BEGIN:
+.Lcgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     cgemm_kernel_L2_M2_BEGIN
+       ble     .Lcgemm_kernel_L2_M2_BEGIN
 
-cgemm_kernel_L2_M4_20:
+.Lcgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     cgemm_kernel_L2_M4_40
+       ble     .Lcgemm_kernel_L2_M4_40
        .align 5
 
-cgemm_kernel_L2_M4_22:
+.Lcgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M4_22
+       bgt     .Lcgemm_kernel_L2_M4_22
 
 
-cgemm_kernel_L2_M4_40:
+.Lcgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M4_100
+       ble     .Lcgemm_kernel_L2_M4_100
 
-cgemm_kernel_L2_M4_42:
+.Lcgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M4_42
+       bgt     .Lcgemm_kernel_L2_M4_42
 
-cgemm_kernel_L2_M4_100:
+.Lcgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-cgemm_kernel_L2_M4_END:
+.Lcgemm_kernel_L2_M4_END:
 
-cgemm_kernel_L2_M2_BEGIN:
+.Lcgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L2_M1_BEGIN
+       ble     .Lcgemm_kernel_L2_M1_BEGIN
 
-cgemm_kernel_L2_M2_20:
+.Lcgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     cgemm_kernel_L2_M2_40
+       ble     .Lcgemm_kernel_L2_M2_40
 
-cgemm_kernel_L2_M2_22:
+.Lcgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M2_22
+       bgt     .Lcgemm_kernel_L2_M2_22
 
 
-cgemm_kernel_L2_M2_40:
+.Lcgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M2_100
+       ble     .Lcgemm_kernel_L2_M2_100
 
-cgemm_kernel_L2_M2_42:
+.Lcgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M2_42
+       bgt     .Lcgemm_kernel_L2_M2_42
 
-cgemm_kernel_L2_M2_100:
+.Lcgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-cgemm_kernel_L2_M2_END:
+.Lcgemm_kernel_L2_M2_END:
 
 
-cgemm_kernel_L2_M1_BEGIN:
+.Lcgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
-cgemm_kernel_L2_M1_20:
+.Lcgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     cgemm_kernel_L2_M1_40
+       ble     .Lcgemm_kernel_L2_M1_40
 
-cgemm_kernel_L2_M1_22:
+.Lcgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M1_22
+       bgt     .Lcgemm_kernel_L2_M1_22
 
 
-cgemm_kernel_L2_M1_40:
+.Lcgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M1_100
+       ble     .Lcgemm_kernel_L2_M1_100
 
-cgemm_kernel_L2_M1_42:
+.Lcgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M1_42
+       bgt     .Lcgemm_kernel_L2_M1_42
 
-cgemm_kernel_L2_M1_100:
+.Lcgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-cgemm_kernel_L2_END:
+.Lcgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-cgemm_kernel_L1_BEGIN:
+.Lcgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     cgemm_kernel_L999 // done
+       ble     .Lcgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
        mov     pA, origPA                      // pA = A
 
 
-cgemm_kernel_L1_M8_BEGIN:
+.Lcgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L1_M4_BEGIN
+       ble     .Lcgemm_kernel_L1_M4_BEGIN
 
-cgemm_kernel_L1_M8_20:
+.Lcgemm_kernel_L1_M8_20:
 
        INIT8x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M8_40
+       ble     .Lcgemm_kernel_L1_M8_40
        .align 5
 
-cgemm_kernel_L1_M8_22:
+.Lcgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M8_22
+       bgt     .Lcgemm_kernel_L1_M8_22
 
 
-cgemm_kernel_L1_M8_40:
+.Lcgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M8_100
+       ble     .Lcgemm_kernel_L1_M8_100
 
-cgemm_kernel_L1_M8_42:
+.Lcgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M8_42
+       bgt     .Lcgemm_kernel_L1_M8_42
 
-cgemm_kernel_L1_M8_100:
+.Lcgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-cgemm_kernel_L1_M8_END:
+.Lcgemm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     cgemm_kernel_L1_M8_20
+       bgt     .Lcgemm_kernel_L1_M8_20
 
-cgemm_kernel_L1_M4_BEGIN:
+.Lcgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     cgemm_kernel_L1_M2_BEGIN
+       ble     .Lcgemm_kernel_L1_M2_BEGIN
 
 
-cgemm_kernel_L1_M4_20:
+.Lcgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M4_40
+       ble     .Lcgemm_kernel_L1_M4_40
        .align 5
 
-cgemm_kernel_L1_M4_22:
+.Lcgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M4_22
+       bgt     .Lcgemm_kernel_L1_M4_22
 
 
-cgemm_kernel_L1_M4_40:
+.Lcgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M4_100
+       ble     .Lcgemm_kernel_L1_M4_100
 
-cgemm_kernel_L1_M4_42:
+.Lcgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M4_42
+       bgt     .Lcgemm_kernel_L1_M4_42
 
-cgemm_kernel_L1_M4_100:
+.Lcgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-cgemm_kernel_L1_M4_END:
+.Lcgemm_kernel_L1_M4_END:
 
 
-cgemm_kernel_L1_M2_BEGIN:
+.Lcgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L1_M1_BEGIN
+       ble     .Lcgemm_kernel_L1_M1_BEGIN
 
-cgemm_kernel_L1_M2_20:
+.Lcgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M2_40
+       ble     .Lcgemm_kernel_L1_M2_40
 
-cgemm_kernel_L1_M2_22:
+.Lcgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M2_22
+       bgt     .Lcgemm_kernel_L1_M2_22
 
 
-cgemm_kernel_L1_M2_40:
+.Lcgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M2_100
+       ble     .Lcgemm_kernel_L1_M2_100
 
-cgemm_kernel_L1_M2_42:
+.Lcgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M2_42
+       bgt     .Lcgemm_kernel_L1_M2_42
 
-cgemm_kernel_L1_M2_100:
+.Lcgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-cgemm_kernel_L1_M2_END:
+.Lcgemm_kernel_L1_M2_END:
 
 
-cgemm_kernel_L1_M1_BEGIN:
+.Lcgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
-cgemm_kernel_L1_M1_20:
+.Lcgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M1_40
+       ble     .Lcgemm_kernel_L1_M1_40
 
-cgemm_kernel_L1_M1_22:
+.Lcgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M1_22
+       bgt     .Lcgemm_kernel_L1_M1_22
 
 
-cgemm_kernel_L1_M1_40:
+.Lcgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M1_100
+       ble     .Lcgemm_kernel_L1_M1_100
 
-cgemm_kernel_L1_M1_42:
+.Lcgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M1_42
+       bgt     .Lcgemm_kernel_L1_M1_42
 
-cgemm_kernel_L1_M1_100:
+.Lcgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-cgemm_kernel_L1_END:
+.Lcgemm_kernel_L1_END:
 
 
-cgemm_kernel_L999:
+.Lcgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 367cd02..29a68ff 100644 (file)
@@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     cgemm_kernel_L2_BEGIN
+       ble     .Lcgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-cgemm_kernel_L4_BEGIN:
+.Lcgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-cgemm_kernel_L4_M8_BEGIN:
+.Lcgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L4_M4_BEGIN
+       ble     .Lcgemm_kernel_L4_M4_BEGIN
 
        .align 5
-cgemm_kernel_L4_M8_20:
+.Lcgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #5            // origK / 32
        cmp     counterL , #2
-       blt     cgemm_kernel_L4_M8_32
+       blt     .Lcgemm_kernel_L4_M8_32
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
        KERNEL8x4_M1_M2_x8
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     cgemm_kernel_L4_M8_22a
+       ble     .Lcgemm_kernel_L4_M8_22a
 
        .align 5
-cgemm_kernel_L4_M8_22:
+.Lcgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1_M2_x16
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M8_22
+       bgt     .Lcgemm_kernel_L4_M8_22
 
        .align 5
-cgemm_kernel_L4_M8_22a:
+.Lcgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1_M2_x8
        KERNEL8x4_M1_M2_x4
@@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        cgemm_kernel_L4_M8_44
+       b        .Lcgemm_kernel_L4_M8_44
 
        .align 5
-cgemm_kernel_L4_M8_32:
+.Lcgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     cgemm_kernel_L4_M8_40
+       ble     .Lcgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       cgemm_kernel_L4_M8_44
+       b       .Lcgemm_kernel_L4_M8_44
 
-cgemm_kernel_L4_M8_40:
+.Lcgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-cgemm_kernel_L4_M8_44:
+.Lcgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #31
-       ble     cgemm_kernel_L4_M8_100
+       ble     .Lcgemm_kernel_L4_M8_100
 
        .align 5
-cgemm_kernel_L4_M8_46:
+.Lcgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     cgemm_kernel_L4_M8_46
+       bne     .Lcgemm_kernel_L4_M8_46
 
-cgemm_kernel_L4_M8_100:
+.Lcgemm_kernel_L4_M8_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE8x4
 
-cgemm_kernel_L4_M8_END:
+.Lcgemm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     cgemm_kernel_L4_M8_20
+       bne     .Lcgemm_kernel_L4_M8_20
 
-cgemm_kernel_L4_M4_BEGIN:
+.Lcgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     cgemm_kernel_L4_M2_BEGIN
+       ble     .Lcgemm_kernel_L4_M2_BEGIN
 
 
-cgemm_kernel_L4_M4_20:
+.Lcgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
        
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     cgemm_kernel_L4_M4_32
+       blt     .Lcgemm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     cgemm_kernel_L4_M4_22a
+       ble     .Lcgemm_kernel_L4_M4_22a
        .align 5
 
 
-cgemm_kernel_L4_M4_22:
+.Lcgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M4_22
+       bgt     .Lcgemm_kernel_L4_M4_22
 
-cgemm_kernel_L4_M4_22a:
+.Lcgemm_kernel_L4_M4_22a:
        KERNEL4x4_M1
        KERNEL4x4_E
-       b        cgemm_kernel_L4_M4_44
-cgemm_kernel_L4_M4_32:
+       b        .Lcgemm_kernel_L4_M4_44
+.Lcgemm_kernel_L4_M4_32:
        tst     counterL, #1
-       ble     cgemm_kernel_L4_M4_40
+       ble     .Lcgemm_kernel_L4_M4_40
        KERNEL4x4_I
        KERNEL4x4_E
-       b       cgemm_kernel_L4_M4_44
-cgemm_kernel_L4_M4_40:
+       b       .Lcgemm_kernel_L4_M4_44
+.Lcgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-cgemm_kernel_L4_M4_44:
+.Lcgemm_kernel_L4_M4_44:
        ands    counterL , origK, #1
-       ble     cgemm_kernel_L4_M4_100
+       ble     .Lcgemm_kernel_L4_M4_100
 
-cgemm_kernel_L4_M4_46:
+.Lcgemm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
-cgemm_kernel_L4_M4_100:
+.Lcgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-cgemm_kernel_L4_M4_END:
+.Lcgemm_kernel_L4_M4_END:
 
-cgemm_kernel_L4_M2_BEGIN:
+.Lcgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L4_M1_BEGIN
+       ble     .Lcgemm_kernel_L4_M1_BEGIN
 
-cgemm_kernel_L4_M2_20:
+.Lcgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L4_M2_40
+       ble     .Lcgemm_kernel_L4_M2_40
 
-cgemm_kernel_L4_M2_22:
+.Lcgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M2_22
+       bgt     .Lcgemm_kernel_L4_M2_22
 
 
-cgemm_kernel_L4_M2_40:
+.Lcgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M2_100
+       ble     .Lcgemm_kernel_L4_M2_100
 
-cgemm_kernel_L4_M2_42:
+.Lcgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M2_42
+       bgt     .Lcgemm_kernel_L4_M2_42
 
-cgemm_kernel_L4_M2_100:
+.Lcgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-cgemm_kernel_L4_M2_END:
+.Lcgemm_kernel_L4_M2_END:
 
 
-cgemm_kernel_L4_M1_BEGIN:
+.Lcgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L4_END
+       ble     .Lcgemm_kernel_L4_END
 
-cgemm_kernel_L4_M1_20:
+.Lcgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L4_M1_40
+       ble     .Lcgemm_kernel_L4_M1_40
 
-cgemm_kernel_L4_M1_22:
+.Lcgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M1_22
+       bgt     .Lcgemm_kernel_L4_M1_22
 
 
-cgemm_kernel_L4_M1_40:
+.Lcgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L4_M1_100
+       ble     .Lcgemm_kernel_L4_M1_100
 
-cgemm_kernel_L4_M1_42:
+.Lcgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L4_M1_42
+       bgt     .Lcgemm_kernel_L4_M1_42
 
-cgemm_kernel_L4_M1_100:
+.Lcgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-cgemm_kernel_L4_END:
+.Lcgemm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     cgemm_kernel_L4_BEGIN
+       bgt     .Lcgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-cgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lcgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     cgemm_kernel_L999   // error, N was less than 4?
+       ble     .Lcgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     cgemm_kernel_L1_BEGIN
+       ble     .Lcgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
        mov     pA, origPA                      // pA = A
 
 
-cgemm_kernel_L2_M8_BEGIN:
+.Lcgemm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L2_M4_BEGIN
+       ble     .Lcgemm_kernel_L2_M4_BEGIN
 
-cgemm_kernel_L2_M8_20:
+.Lcgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     cgemm_kernel_L2_M8_40
+       ble     .Lcgemm_kernel_L2_M8_40
        .align 5
 
-cgemm_kernel_L2_M8_22:
+.Lcgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M8_22
+       bgt     .Lcgemm_kernel_L2_M8_22
 
 
-cgemm_kernel_L2_M8_40:
+.Lcgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M8_100
+       ble     .Lcgemm_kernel_L2_M8_100
 
-cgemm_kernel_L2_M8_42:
+.Lcgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M8_42
+       bgt     .Lcgemm_kernel_L2_M8_42
 
-cgemm_kernel_L2_M8_100:
+.Lcgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-cgemm_kernel_L2_M8_END:
+.Lcgemm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     cgemm_kernel_L2_M8_20
+       bgt     .Lcgemm_kernel_L2_M8_20
 
-cgemm_kernel_L2_M4_BEGIN:
+.Lcgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     cgemm_kernel_L2_M2_BEGIN
+       ble     .Lcgemm_kernel_L2_M2_BEGIN
 
-cgemm_kernel_L2_M4_20:
+.Lcgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     cgemm_kernel_L2_M4_40
+       ble     .Lcgemm_kernel_L2_M4_40
        .align 5
 
-cgemm_kernel_L2_M4_22:
+.Lcgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M4_22
+       bgt     .Lcgemm_kernel_L2_M4_22
 
 
-cgemm_kernel_L2_M4_40:
+.Lcgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M4_100
+       ble     .Lcgemm_kernel_L2_M4_100
 
-cgemm_kernel_L2_M4_42:
+.Lcgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M4_42
+       bgt     .Lcgemm_kernel_L2_M4_42
 
-cgemm_kernel_L2_M4_100:
+.Lcgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-cgemm_kernel_L2_M4_END:
+.Lcgemm_kernel_L2_M4_END:
 
-cgemm_kernel_L2_M2_BEGIN:
+.Lcgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L2_M1_BEGIN
+       ble     .Lcgemm_kernel_L2_M1_BEGIN
 
-cgemm_kernel_L2_M2_20:
+.Lcgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     cgemm_kernel_L2_M2_40
+       ble     .Lcgemm_kernel_L2_M2_40
 
-cgemm_kernel_L2_M2_22:
+.Lcgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M2_22
+       bgt     .Lcgemm_kernel_L2_M2_22
 
 
-cgemm_kernel_L2_M2_40:
+.Lcgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M2_100
+       ble     .Lcgemm_kernel_L2_M2_100
 
-cgemm_kernel_L2_M2_42:
+.Lcgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M2_42
+       bgt     .Lcgemm_kernel_L2_M2_42
 
-cgemm_kernel_L2_M2_100:
+.Lcgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-cgemm_kernel_L2_M2_END:
+.Lcgemm_kernel_L2_M2_END:
 
 
-cgemm_kernel_L2_M1_BEGIN:
+.Lcgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L2_END
+       ble     .Lcgemm_kernel_L2_END
 
-cgemm_kernel_L2_M1_20:
+.Lcgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     cgemm_kernel_L2_M1_40
+       ble     .Lcgemm_kernel_L2_M1_40
 
-cgemm_kernel_L2_M1_22:
+.Lcgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M1_22
+       bgt     .Lcgemm_kernel_L2_M1_22
 
 
-cgemm_kernel_L2_M1_40:
+.Lcgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L2_M1_100
+       ble     .Lcgemm_kernel_L2_M1_100
 
-cgemm_kernel_L2_M1_42:
+.Lcgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L2_M1_42
+       bgt     .Lcgemm_kernel_L2_M1_42
 
-cgemm_kernel_L2_M1_100:
+.Lcgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-cgemm_kernel_L2_END:
+.Lcgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-cgemm_kernel_L1_BEGIN:
+.Lcgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     cgemm_kernel_L999 // done
+       ble     .Lcgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
        mov     pA, origPA                      // pA = A
 
 
-cgemm_kernel_L1_M8_BEGIN:
+.Lcgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     cgemm_kernel_L1_M4_BEGIN
+       ble     .Lcgemm_kernel_L1_M4_BEGIN
 
-cgemm_kernel_L1_M8_20:
+.Lcgemm_kernel_L1_M8_20:
 
        INIT8x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M8_40
+       ble     .Lcgemm_kernel_L1_M8_40
        .align 5
 
-cgemm_kernel_L1_M8_22:
+.Lcgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M8_22
+       bgt     .Lcgemm_kernel_L1_M8_22
 
 
-cgemm_kernel_L1_M8_40:
+.Lcgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M8_100
+       ble     .Lcgemm_kernel_L1_M8_100
 
-cgemm_kernel_L1_M8_42:
+.Lcgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M8_42
+       bgt     .Lcgemm_kernel_L1_M8_42
 
-cgemm_kernel_L1_M8_100:
+.Lcgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-cgemm_kernel_L1_M8_END:
+.Lcgemm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     cgemm_kernel_L1_M8_20
+       bgt     .Lcgemm_kernel_L1_M8_20
 
-cgemm_kernel_L1_M4_BEGIN:
+.Lcgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     cgemm_kernel_L1_M2_BEGIN
+       ble     .Lcgemm_kernel_L1_M2_BEGIN
 
 
-cgemm_kernel_L1_M4_20:
+.Lcgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M4_40
+       ble     .Lcgemm_kernel_L1_M4_40
        .align 5
 
-cgemm_kernel_L1_M4_22:
+.Lcgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M4_22
+       bgt     .Lcgemm_kernel_L1_M4_22
 
 
-cgemm_kernel_L1_M4_40:
+.Lcgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M4_100
+       ble     .Lcgemm_kernel_L1_M4_100
 
-cgemm_kernel_L1_M4_42:
+.Lcgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M4_42
+       bgt     .Lcgemm_kernel_L1_M4_42
 
-cgemm_kernel_L1_M4_100:
+.Lcgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-cgemm_kernel_L1_M4_END:
+.Lcgemm_kernel_L1_M4_END:
 
 
-cgemm_kernel_L1_M2_BEGIN:
+.Lcgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     cgemm_kernel_L1_M1_BEGIN
+       ble     .Lcgemm_kernel_L1_M1_BEGIN
 
-cgemm_kernel_L1_M2_20:
+.Lcgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M2_40
+       ble     .Lcgemm_kernel_L1_M2_40
 
-cgemm_kernel_L1_M2_22:
+.Lcgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M2_22
+       bgt     .Lcgemm_kernel_L1_M2_22
 
 
-cgemm_kernel_L1_M2_40:
+.Lcgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M2_100
+       ble     .Lcgemm_kernel_L1_M2_100
 
-cgemm_kernel_L1_M2_42:
+.Lcgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M2_42
+       bgt     .Lcgemm_kernel_L1_M2_42
 
-cgemm_kernel_L1_M2_100:
+.Lcgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-cgemm_kernel_L1_M2_END:
+.Lcgemm_kernel_L1_M2_END:
 
 
-cgemm_kernel_L1_M1_BEGIN:
+.Lcgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     cgemm_kernel_L1_END
+       ble     .Lcgemm_kernel_L1_END
 
-cgemm_kernel_L1_M1_20:
+.Lcgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     cgemm_kernel_L1_M1_40
+       ble     .Lcgemm_kernel_L1_M1_40
 
-cgemm_kernel_L1_M1_22:
+.Lcgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M1_22
+       bgt     .Lcgemm_kernel_L1_M1_22
 
 
-cgemm_kernel_L1_M1_40:
+.Lcgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     cgemm_kernel_L1_M1_100
+       ble     .Lcgemm_kernel_L1_M1_100
 
-cgemm_kernel_L1_M1_42:
+.Lcgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     cgemm_kernel_L1_M1_42
+       bgt     .Lcgemm_kernel_L1_M1_42
 
-cgemm_kernel_L1_M1_100:
+.Lcgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-cgemm_kernel_L1_END:
+.Lcgemm_kernel_L1_END:
 
 
-cgemm_kernel_L999:
+.Lcgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 70eab96..b8c6bfc 100644 (file)
@@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     copy_kernel_L999
+       ble     .Lcopy_kernel_L999
 
        cmp     INC_X, #1
-       bne     copy_kernel_S_BEGIN
+       bne     .Lcopy_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     copy_kernel_S_BEGIN
+       bne     .Lcopy_kernel_S_BEGIN
 
-copy_kernel_F_BEGIN:
+.Lcopy_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     copy_kernel_F1
+       beq     .Lcopy_kernel_F1
 
-copy_kernel_F4:
+.Lcopy_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     copy_kernel_F4
+       bne     .Lcopy_kernel_F4
 
-copy_kernel_F1:
+.Lcopy_kernel_F1:
 
        ands    I, N, #3
-       ble     copy_kernel_L999
+       ble     .Lcopy_kernel_L999
 
-copy_kernel_F10:
+.Lcopy_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     copy_kernel_F10
+        bne     .Lcopy_kernel_F10
 
        mov     w0, wzr
        ret
 
-copy_kernel_S_BEGIN:
+.Lcopy_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     copy_kernel_S1
+       ble     .Lcopy_kernel_S1
 
-copy_kernel_S4:
+.Lcopy_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -210,21 +210,21 @@ copy_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     copy_kernel_S4
+       bne     .Lcopy_kernel_S4
 
-copy_kernel_S1:
+.Lcopy_kernel_S1:
 
        ands    I, N, #3
-       ble     copy_kernel_L999
+       ble     .Lcopy_kernel_L999
 
-copy_kernel_S10:
+.Lcopy_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     copy_kernel_S10
+        bne     .Lcopy_kernel_S10
 
-copy_kernel_L999:
+.Lcopy_kernel_L999:
 
        mov     w0, wzr
        ret
index 3de2725..79d33e9 100644 (file)
@@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     ctrmm_kernel_L2_BEGIN
+       ble     .Lctrmm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-ctrmm_kernel_L4_BEGIN:
+.Lctrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
 
@@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
 #endif
        mov     pA, origPA                      // pA = start of A array
 
-ctrmm_kernel_L4_M4_BEGIN:
+.Lctrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     ctrmm_kernel_L4_M2_BEGIN
+       ble     .Lctrmm_kernel_L4_M2_BEGIN
 
-ctrmm_kernel_L4_M4_20:
+.Lctrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     ctrmm_kernel_L4_M4_32
+       blt     .Lctrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     ctrmm_kernel_L4_M4_22a
+       ble     .Lctrmm_kernel_L4_M4_22a
        .align 5
 
-ctrmm_kernel_L4_M4_22:
+.Lctrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M4_22
+       bgt     .Lctrmm_kernel_L4_M4_22
 
 
-ctrmm_kernel_L4_M4_22a:
+.Lctrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        ctrmm_kernel_L4_M4_44
+       b        .Lctrmm_kernel_L4_M4_44
 
-ctrmm_kernel_L4_M4_32:
+.Lctrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     ctrmm_kernel_L4_M4_40
+       ble     .Lctrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       ctrmm_kernel_L4_M4_44
+       b       .Lctrmm_kernel_L4_M4_44
 
 
-ctrmm_kernel_L4_M4_40:
+.Lctrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-ctrmm_kernel_L4_M4_44:
+.Lctrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #1
-       ble     ctrmm_kernel_L4_M4_100
+       ble     .Lctrmm_kernel_L4_M4_100
 
-ctrmm_kernel_L4_M4_46:
+.Lctrmm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
-ctrmm_kernel_L4_M4_100:
+.Lctrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ctrmm_kernel_L4_M4_END:
+.Lctrmm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     ctrmm_kernel_L4_M4_20
+       bne     .Lctrmm_kernel_L4_M4_20
 
-ctrmm_kernel_L4_M2_BEGIN:
+.Lctrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ctrmm_kernel_L4_END
+       ble     .Lctrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L4_M1_BEGIN
+       ble     .Lctrmm_kernel_L4_M1_BEGIN
 
-ctrmm_kernel_L4_M2_20:
+.Lctrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L4_M2_40
+       ble     .Lctrmm_kernel_L4_M2_40
 
-ctrmm_kernel_L4_M2_22:
+.Lctrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M2_22
+       bgt     .Lctrmm_kernel_L4_M2_22
 
 
-ctrmm_kernel_L4_M2_40:
+.Lctrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L4_M2_100
+       ble     .Lctrmm_kernel_L4_M2_100
 
-ctrmm_kernel_L4_M2_42:
+.Lctrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M2_42
+       bgt     .Lctrmm_kernel_L4_M2_42
 
-ctrmm_kernel_L4_M2_100:
+.Lctrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ctrmm_kernel_L4_M2_END:
+.Lctrmm_kernel_L4_M2_END:
 
 
-ctrmm_kernel_L4_M1_BEGIN:
+.Lctrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ctrmm_kernel_L4_END
+       ble     .Lctrmm_kernel_L4_END
 
-ctrmm_kernel_L4_M1_20:
+.Lctrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L4_M1_40
+       ble     .Lctrmm_kernel_L4_M1_40
 
-ctrmm_kernel_L4_M1_22:
+.Lctrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M1_22
+       bgt     .Lctrmm_kernel_L4_M1_22
 
 
-ctrmm_kernel_L4_M1_40:
+.Lctrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L4_M1_100
+       ble     .Lctrmm_kernel_L4_M1_100
 
-ctrmm_kernel_L4_M1_42:
+.Lctrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M1_42
+       bgt     .Lctrmm_kernel_L4_M1_42
 
-ctrmm_kernel_L4_M1_100:
+.Lctrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-ctrmm_kernel_L4_END:
+.Lctrmm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
@@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     ctrmm_kernel_L4_BEGIN
+       bgt     .Lctrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-ctrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lctrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     ctrmm_kernel_L999   // error, N was less than 4?
+       ble     .Lctrmm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     ctrmm_kernel_L1_BEGIN
+       ble     .Lctrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-ctrmm_kernel_L2_M4_BEGIN:
+.Lctrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     ctrmm_kernel_L2_M2_BEGIN
+       ble     .Lctrmm_kernel_L2_M2_BEGIN
 
-ctrmm_kernel_L2_M4_20:
+.Lctrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     ctrmm_kernel_L2_M4_40
+       ble     .Lctrmm_kernel_L2_M4_40
        .align 5
 
-ctrmm_kernel_L2_M4_22:
+.Lctrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M4_22
+       bgt     .Lctrmm_kernel_L2_M4_22
 
 
-ctrmm_kernel_L2_M4_40:
+.Lctrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M4_100
+       ble     .Lctrmm_kernel_L2_M4_100
 
-ctrmm_kernel_L2_M4_42:
+.Lctrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M4_42
+       bgt     .Lctrmm_kernel_L2_M4_42
 
-ctrmm_kernel_L2_M4_100:
+.Lctrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ctrmm_kernel_L2_M4_END:
+.Lctrmm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     ctrmm_kernel_L2_M4_20
+       bgt     .Lctrmm_kernel_L2_M4_20
 
 
-ctrmm_kernel_L2_M2_BEGIN:
+.Lctrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ctrmm_kernel_L2_END
+       ble     .Lctrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L2_M1_BEGIN
+       ble     .Lctrmm_kernel_L2_M1_BEGIN
 
-ctrmm_kernel_L2_M2_20:
+.Lctrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     ctrmm_kernel_L2_M2_40
+       ble     .Lctrmm_kernel_L2_M2_40
 
-ctrmm_kernel_L2_M2_22:
+.Lctrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M2_22
+       bgt     .Lctrmm_kernel_L2_M2_22
 
 
-ctrmm_kernel_L2_M2_40:
+.Lctrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M2_100
+       ble     .Lctrmm_kernel_L2_M2_100
 
-ctrmm_kernel_L2_M2_42:
+.Lctrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M2_42
+       bgt     .Lctrmm_kernel_L2_M2_42
 
-ctrmm_kernel_L2_M2_100:
+.Lctrmm_kernel_L2_M2_100:
 
        SAVE2x2
 
@@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ctrmm_kernel_L2_M2_END:
+.Lctrmm_kernel_L2_M2_END:
 
 
-ctrmm_kernel_L2_M1_BEGIN:
+.Lctrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ctrmm_kernel_L2_END
+       ble     .Lctrmm_kernel_L2_END
 
-ctrmm_kernel_L2_M1_20:
+.Lctrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     ctrmm_kernel_L2_M1_40
+       ble     .Lctrmm_kernel_L2_M1_40
 
-ctrmm_kernel_L2_M1_22:
+.Lctrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M1_22
+       bgt     .Lctrmm_kernel_L2_M1_22
 
 
-ctrmm_kernel_L2_M1_40:
+.Lctrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M1_100
+       ble     .Lctrmm_kernel_L2_M1_100
 
-ctrmm_kernel_L2_M1_42:
+.Lctrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M1_42
+       bgt     .Lctrmm_kernel_L2_M1_42
 
-ctrmm_kernel_L2_M1_100:
+.Lctrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-ctrmm_kernel_L2_END:
+.Lctrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:
 
 /******************************************************************************/
 
-ctrmm_kernel_L1_BEGIN:
+.Lctrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     ctrmm_kernel_L999 // done
+       ble     .Lctrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:
 
        mov     pA, origPA                      // pA = A
 
-ctrmm_kernel_L1_M4_BEGIN:
+.Lctrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     ctrmm_kernel_L1_M2_BEGIN
+       ble     .Lctrmm_kernel_L1_M2_BEGIN
 
-ctrmm_kernel_L1_M4_20:
+.Lctrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M4_40
+       ble     .Lctrmm_kernel_L1_M4_40
        .align 5
 
-ctrmm_kernel_L1_M4_22:
+.Lctrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M4_22
+       bgt     .Lctrmm_kernel_L1_M4_22
 
 
-ctrmm_kernel_L1_M4_40:
+.Lctrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M4_100
+       ble     .Lctrmm_kernel_L1_M4_100
 
-ctrmm_kernel_L1_M4_42:
+.Lctrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M4_42
+       bgt     .Lctrmm_kernel_L1_M4_42
 
-ctrmm_kernel_L1_M4_100:
+.Lctrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ctrmm_kernel_L1_M4_END:
+.Lctrmm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     ctrmm_kernel_L1_M4_20
+       bgt     .Lctrmm_kernel_L1_M4_20
 
 
-ctrmm_kernel_L1_M2_BEGIN:
+.Lctrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ctrmm_kernel_L1_END
+       ble     .Lctrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L1_M1_BEGIN
+       ble     .Lctrmm_kernel_L1_M1_BEGIN
 
-ctrmm_kernel_L1_M2_20:
+.Lctrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M2_40
+       ble     .Lctrmm_kernel_L1_M2_40
 
-ctrmm_kernel_L1_M2_22:
+.Lctrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M2_22
+       bgt     .Lctrmm_kernel_L1_M2_22
 
 
-ctrmm_kernel_L1_M2_40:
+.Lctrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M2_100
+       ble     .Lctrmm_kernel_L1_M2_100
 
-ctrmm_kernel_L1_M2_42:
+.Lctrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M2_42
+       bgt     .Lctrmm_kernel_L1_M2_42
 
-ctrmm_kernel_L1_M2_100:
+.Lctrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ctrmm_kernel_L1_M2_END:
+.Lctrmm_kernel_L1_M2_END:
 
 
-ctrmm_kernel_L1_M1_BEGIN:
+.Lctrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ctrmm_kernel_L1_END
+       ble     .Lctrmm_kernel_L1_END
 
-ctrmm_kernel_L1_M1_20:
+.Lctrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M1_40
+       ble     .Lctrmm_kernel_L1_M1_40
 
-ctrmm_kernel_L1_M1_22:
+.Lctrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M1_22
+       bgt     .Lctrmm_kernel_L1_M1_22
 
 
-ctrmm_kernel_L1_M1_40:
+.Lctrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M1_100
+       ble     .Lctrmm_kernel_L1_M1_100
 
-ctrmm_kernel_L1_M1_42:
+.Lctrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M1_42
+       bgt     .Lctrmm_kernel_L1_M1_42
 
-ctrmm_kernel_L1_M1_100:
+.Lctrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-ctrmm_kernel_L1_END:
+.Lctrmm_kernel_L1_END:
 
 
-ctrmm_kernel_L999:
+.Lctrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 680fb56..5c08273 100644 (file)
@@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     ctrmm_kernel_L2_BEGIN
+       ble     .Lctrmm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-ctrmm_kernel_L4_BEGIN:
+.Lctrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
 #endif
        mov     pA, origPA                      // pA = start of A array
 
-ctrmm_kernel_L4_M8_BEGIN:
+.Lctrmm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     ctrmm_kernel_L4_M4_BEGIN
+       ble     .Lctrmm_kernel_L4_M4_BEGIN
 
-ctrmm_kernel_L4_M8_20:
+.Lctrmm_kernel_L4_M8_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:
 
        asr     counterL , tempK, #3
        cmp     counterL , #2
-       blt     ctrmm_kernel_L4_M8_32
+       blt     .Lctrmm_kernel_L4_M8_32
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     ctrmm_kernel_L4_M8_22a
+       ble     .Lctrmm_kernel_L4_M8_22a
 
        .align 5
-ctrmm_kernel_L4_M8_22:
+.Lctrmm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M8_22
+       bgt     .Lctrmm_kernel_L4_M8_22
 
        .align 5
-ctrmm_kernel_L4_M8_22a:
+.Lctrmm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        ctrmm_kernel_L4_M8_44
+       b        .Lctrmm_kernel_L4_M8_44
 
        .align 5
-ctrmm_kernel_L4_M8_32:
+.Lctrmm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     ctrmm_kernel_L4_M8_40
+       ble     .Lctrmm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       ctrmm_kernel_L4_M8_44
+       b       .Lctrmm_kernel_L4_M8_44
 
-ctrmm_kernel_L4_M8_40:
+.Lctrmm_kernel_L4_M8_40:
 
        INIT8x4
 
-ctrmm_kernel_L4_M8_44:
+.Lctrmm_kernel_L4_M8_44:
 
        ands    counterL , tempK, #7
-       ble     ctrmm_kernel_L4_M8_100
+       ble     .Lctrmm_kernel_L4_M8_100
 
        .align 5
-ctrmm_kernel_L4_M8_46:
+.Lctrmm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     ctrmm_kernel_L4_M8_46
+       bne     .Lctrmm_kernel_L4_M8_46
 
-ctrmm_kernel_L4_M8_100:
+.Lctrmm_kernel_L4_M8_100:
 
        SAVE8x4
 
@@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
-ctrmm_kernel_L4_M8_END:
+.Lctrmm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     ctrmm_kernel_L4_M8_20
+       bne     .Lctrmm_kernel_L4_M8_20
 
-ctrmm_kernel_L4_M4_BEGIN:
+.Lctrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     ctrmm_kernel_L4_END
+       ble     .Lctrmm_kernel_L4_END
 
        tst     counterI, #4
-       ble     ctrmm_kernel_L4_M2_BEGIN
+       ble     .Lctrmm_kernel_L4_M2_BEGIN
 
 
-ctrmm_kernel_L4_M4_20:
+.Lctrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     ctrmm_kernel_L4_M4_32
+       blt     .Lctrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     ctrmm_kernel_L4_M4_22a
+       ble     .Lctrmm_kernel_L4_M4_22a
        .align 5
 
 
-ctrmm_kernel_L4_M4_22:
+.Lctrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M4_22
+       bgt     .Lctrmm_kernel_L4_M4_22
 
-ctrmm_kernel_L4_M4_22a:
+.Lctrmm_kernel_L4_M4_22a:
        KERNEL4x4_M1
        KERNEL4x4_E
-       b        ctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_32:
+       b        .Lctrmm_kernel_L4_M4_44
+.Lctrmm_kernel_L4_M4_32:
        tst     counterL, #1
-       ble     ctrmm_kernel_L4_M4_40
+       ble     .Lctrmm_kernel_L4_M4_40
        KERNEL4x4_I
        KERNEL4x4_E
-       b       ctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_40:
+       b       .Lctrmm_kernel_L4_M4_44
+.Lctrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-ctrmm_kernel_L4_M4_44:
+.Lctrmm_kernel_L4_M4_44:
        ands    counterL , tempK, #1
-       ble     ctrmm_kernel_L4_M4_100
+       ble     .Lctrmm_kernel_L4_M4_100
 
-ctrmm_kernel_L4_M4_46:
+.Lctrmm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
-ctrmm_kernel_L4_M4_100:
+.Lctrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ctrmm_kernel_L4_M4_END:
+.Lctrmm_kernel_L4_M4_END:
 
-ctrmm_kernel_L4_M2_BEGIN:
+.Lctrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ctrmm_kernel_L4_END
+       ble     .Lctrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L4_M1_BEGIN
+       ble     .Lctrmm_kernel_L4_M1_BEGIN
 
-ctrmm_kernel_L4_M2_20:
+.Lctrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L4_M2_40
+       ble     .Lctrmm_kernel_L4_M2_40
 
-ctrmm_kernel_L4_M2_22:
+.Lctrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M2_22
+       bgt     .Lctrmm_kernel_L4_M2_22
 
 
-ctrmm_kernel_L4_M2_40:
+.Lctrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L4_M2_100
+       ble     .Lctrmm_kernel_L4_M2_100
 
-ctrmm_kernel_L4_M2_42:
+.Lctrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M2_42
+       bgt     .Lctrmm_kernel_L4_M2_42
 
-ctrmm_kernel_L4_M2_100:
+.Lctrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ctrmm_kernel_L4_M2_END:
+.Lctrmm_kernel_L4_M2_END:
 
 
-ctrmm_kernel_L4_M1_BEGIN:
+.Lctrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ctrmm_kernel_L4_END
+       ble     .Lctrmm_kernel_L4_END
 
-ctrmm_kernel_L4_M1_20:
+.Lctrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L4_M1_40
+       ble     .Lctrmm_kernel_L4_M1_40
 
-ctrmm_kernel_L4_M1_22:
+.Lctrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M1_22
+       bgt     .Lctrmm_kernel_L4_M1_22
 
 
-ctrmm_kernel_L4_M1_40:
+.Lctrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L4_M1_100
+       ble     .Lctrmm_kernel_L4_M1_100
 
-ctrmm_kernel_L4_M1_42:
+.Lctrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L4_M1_42
+       bgt     .Lctrmm_kernel_L4_M1_42
 
-ctrmm_kernel_L4_M1_100:
+.Lctrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-ctrmm_kernel_L4_END:
+.Lctrmm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
@@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     ctrmm_kernel_L4_BEGIN
+       bgt     .Lctrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-ctrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lctrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     ctrmm_kernel_L999   // error, N was less than 4?
+       ble     .Lctrmm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     ctrmm_kernel_L1_BEGIN
+       ble     .Lctrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 #endif
        mov     pA, origPA                      // pA = A
 
-ctrmm_kernel_L2_M8_BEGIN:
+.Lctrmm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     ctrmm_kernel_L2_M4_BEGIN
+       ble     .Lctrmm_kernel_L2_M4_BEGIN
 
-ctrmm_kernel_L2_M8_20:
+.Lctrmm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     ctrmm_kernel_L2_M8_40
+       ble     .Lctrmm_kernel_L2_M8_40
        .align 5
 
-ctrmm_kernel_L2_M8_22:
+.Lctrmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M8_22
+       bgt     .Lctrmm_kernel_L2_M8_22
 
 
-ctrmm_kernel_L2_M8_40:
+.Lctrmm_kernel_L2_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M8_100
+       ble     .Lctrmm_kernel_L2_M8_100
 
-ctrmm_kernel_L2_M8_42:
+.Lctrmm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M8_42
+       bgt     .Lctrmm_kernel_L2_M8_42
 
-ctrmm_kernel_L2_M8_100:
+.Lctrmm_kernel_L2_M8_100:
 
        SAVE8x2
 
@@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-ctrmm_kernel_L2_M8_END:
+.Lctrmm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     ctrmm_kernel_L2_M8_20
+       bgt     .Lctrmm_kernel_L2_M8_20
 
-ctrmm_kernel_L2_M4_BEGIN:
+.Lctrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     ctrmm_kernel_L2_END
+       ble     .Lctrmm_kernel_L2_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L2_M2_BEGIN
+       ble     .Lctrmm_kernel_L2_M2_BEGIN
 
-ctrmm_kernel_L2_M4_20:
+.Lctrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     ctrmm_kernel_L2_M4_40
+       ble     .Lctrmm_kernel_L2_M4_40
        .align 5
 
-ctrmm_kernel_L2_M4_22:
+.Lctrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M4_22
+       bgt     .Lctrmm_kernel_L2_M4_22
 
 
-ctrmm_kernel_L2_M4_40:
+.Lctrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M4_100
+       ble     .Lctrmm_kernel_L2_M4_100
 
-ctrmm_kernel_L2_M4_42:
+.Lctrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M4_42
+       bgt     .Lctrmm_kernel_L2_M4_42
 
-ctrmm_kernel_L2_M4_100:
+.Lctrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ctrmm_kernel_L2_M4_END:
+.Lctrmm_kernel_L2_M4_END:
 
 
-ctrmm_kernel_L2_M2_BEGIN:
+.Lctrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ctrmm_kernel_L2_END
+       ble     .Lctrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L2_M1_BEGIN
+       ble     .Lctrmm_kernel_L2_M1_BEGIN
 
-ctrmm_kernel_L2_M2_20:
+.Lctrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     ctrmm_kernel_L2_M2_40
+       ble     .Lctrmm_kernel_L2_M2_40
 
-ctrmm_kernel_L2_M2_22:
+.Lctrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M2_22
+       bgt     .Lctrmm_kernel_L2_M2_22
 
 
-ctrmm_kernel_L2_M2_40:
+.Lctrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M2_100
+       ble     .Lctrmm_kernel_L2_M2_100
 
-ctrmm_kernel_L2_M2_42:
+.Lctrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M2_42
+       bgt     .Lctrmm_kernel_L2_M2_42
 
-ctrmm_kernel_L2_M2_100:
+.Lctrmm_kernel_L2_M2_100:
 
        SAVE2x2
 
@@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ctrmm_kernel_L2_M2_END:
+.Lctrmm_kernel_L2_M2_END:
 
 
-ctrmm_kernel_L2_M1_BEGIN:
+.Lctrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ctrmm_kernel_L2_END
+       ble     .Lctrmm_kernel_L2_END
 
-ctrmm_kernel_L2_M1_20:
+.Lctrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     ctrmm_kernel_L2_M1_40
+       ble     .Lctrmm_kernel_L2_M1_40
 
-ctrmm_kernel_L2_M1_22:
+.Lctrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M1_22
+       bgt     .Lctrmm_kernel_L2_M1_22
 
 
-ctrmm_kernel_L2_M1_40:
+.Lctrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L2_M1_100
+       ble     .Lctrmm_kernel_L2_M1_100
 
-ctrmm_kernel_L2_M1_42:
+.Lctrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L2_M1_42
+       bgt     .Lctrmm_kernel_L2_M1_42
 
-ctrmm_kernel_L2_M1_100:
+.Lctrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-ctrmm_kernel_L2_END:
+.Lctrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:
 
 /******************************************************************************/
 
-ctrmm_kernel_L1_BEGIN:
+.Lctrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     ctrmm_kernel_L999 // done
+       ble     .Lctrmm_kernel_L999 // done
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC , pC , LDC                   // Update pC to point to next
@@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
 #endif
        mov     pA, origPA                      // pA = A
 
-ctrmm_kernel_L1_M8_BEGIN:
+.Lctrmm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     ctrmm_kernel_L1_M4_BEGIN
+       ble     .Lctrmm_kernel_L1_M4_BEGIN
 
-ctrmm_kernel_L1_M8_20:
+.Lctrmm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M8_40
+       ble     .Lctrmm_kernel_L1_M8_40
        .align 5
 
-ctrmm_kernel_L1_M8_22:
+.Lctrmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M8_22
+       bgt     .Lctrmm_kernel_L1_M8_22
 
 
-ctrmm_kernel_L1_M8_40:
+.Lctrmm_kernel_L1_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M8_100
+       ble     .Lctrmm_kernel_L1_M8_100
 
-ctrmm_kernel_L1_M8_42:
+.Lctrmm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M8_42
+       bgt     .Lctrmm_kernel_L1_M8_42
 
-ctrmm_kernel_L1_M8_100:
+.Lctrmm_kernel_L1_M8_100:
 
        SAVE8x1
 
@@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-ctrmm_kernel_L1_M8_END:
+.Lctrmm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     ctrmm_kernel_L1_M8_20
+       bgt     .Lctrmm_kernel_L1_M8_20
 
-ctrmm_kernel_L1_M4_BEGIN:
+.Lctrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     ctrmm_kernel_L1_END
+       ble     .Lctrmm_kernel_L1_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L1_M2_BEGIN
+       ble     .Lctrmm_kernel_L1_M2_BEGIN
 
-ctrmm_kernel_L1_M4_20:
+.Lctrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M4_40
+       ble     .Lctrmm_kernel_L1_M4_40
        .align 5
 
-ctrmm_kernel_L1_M4_22:
+.Lctrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M4_22
+       bgt     .Lctrmm_kernel_L1_M4_22
 
 
-ctrmm_kernel_L1_M4_40:
+.Lctrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M4_100
+       ble     .Lctrmm_kernel_L1_M4_100
 
-ctrmm_kernel_L1_M4_42:
+.Lctrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M4_42
+       bgt     .Lctrmm_kernel_L1_M4_42
 
-ctrmm_kernel_L1_M4_100:
+.Lctrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ctrmm_kernel_L1_M4_END:
+.Lctrmm_kernel_L1_M4_END:
 
-ctrmm_kernel_L1_M2_BEGIN:
+.Lctrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ctrmm_kernel_L1_END
+       ble     .Lctrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ctrmm_kernel_L1_M1_BEGIN
+       ble     .Lctrmm_kernel_L1_M1_BEGIN
 
-ctrmm_kernel_L1_M2_20:
+.Lctrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M2_40
+       ble     .Lctrmm_kernel_L1_M2_40
 
-ctrmm_kernel_L1_M2_22:
+.Lctrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M2_22
+       bgt     .Lctrmm_kernel_L1_M2_22
 
 
-ctrmm_kernel_L1_M2_40:
+.Lctrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M2_100
+       ble     .Lctrmm_kernel_L1_M2_100
 
-ctrmm_kernel_L1_M2_42:
+.Lctrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M2_42
+       bgt     .Lctrmm_kernel_L1_M2_42
 
-ctrmm_kernel_L1_M2_100:
+.Lctrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ctrmm_kernel_L1_M2_END:
+.Lctrmm_kernel_L1_M2_END:
 
 
-ctrmm_kernel_L1_M1_BEGIN:
+.Lctrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ctrmm_kernel_L1_END
+       ble     .Lctrmm_kernel_L1_END
 
-ctrmm_kernel_L1_M1_20:
+.Lctrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ctrmm_kernel_L1_M1_40
+       ble     .Lctrmm_kernel_L1_M1_40
 
-ctrmm_kernel_L1_M1_22:
+.Lctrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M1_22
+       bgt     .Lctrmm_kernel_L1_M1_22
 
 
-ctrmm_kernel_L1_M1_40:
+.Lctrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ctrmm_kernel_L1_M1_100
+       ble     .Lctrmm_kernel_L1_M1_100
 
-ctrmm_kernel_L1_M1_42:
+.Lctrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ctrmm_kernel_L1_M1_42
+       bgt     .Lctrmm_kernel_L1_M1_42
 
-ctrmm_kernel_L1_M1_100:
+.Lctrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-ctrmm_kernel_L1_END:
+.Lctrmm_kernel_L1_END:
 
 
-ctrmm_kernel_L999:
+.Lctrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 5eb2ec0..b8d0af5 100644 (file)
@@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     axpy_kernel_L999
+       ble     .Ldaxpy_kernel_L999
 
        fcmp    DA, #0.0
-       beq     axpy_kernel_L999
+       beq     .Ldaxpy_kernel_L999
 
        cmp     INC_X, #1
-       bne     axpy_kernel_S_BEGIN
+       bne     .Ldaxpy_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     axpy_kernel_S_BEGIN
+       bne     .Ldaxpy_kernel_S_BEGIN
 
-axpy_kernel_F_BEGIN:
+.Ldaxpy_kernel_F_BEGIN:
 
        asr     I, N, #5
        cmp     I, xzr
-       beq     axpy_kernel_F1
+       beq     .Ldaxpy_kernel_F1
 
        .align 5
-axpy_kernel_F32:
+.Ldaxpy_kernel_F32:
 
        KERNEL_F32
 
        subs    I, I, #1
-       bne     axpy_kernel_F32
+       bne     .Ldaxpy_kernel_F32
 
-axpy_kernel_F1:
+.Ldaxpy_kernel_F1:
 
        ands    I, N, #31
-       ble     axpy_kernel_L999
+       ble     .Ldaxpy_kernel_L999
 
-axpy_kernel_F10:
+.Ldaxpy_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     axpy_kernel_F10
+        bne     .Ldaxpy_kernel_F10
 
-       b       axpy_kernel_L999
+       b       .Ldaxpy_kernel_L999
 
-axpy_kernel_S_BEGIN:
+.Ldaxpy_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     axpy_kernel_S1
+       ble     .Ldaxpy_kernel_S1
 
-axpy_kernel_S4:
+.Ldaxpy_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -176,21 +176,21 @@ axpy_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     axpy_kernel_S4
+       bne     .Ldaxpy_kernel_S4
 
-axpy_kernel_S1:
+.Ldaxpy_kernel_S1:
 
        ands    I, N, #3
-       ble     axpy_kernel_L999
+       ble     .Ldaxpy_kernel_L999
 
-axpy_kernel_S10:
+.Ldaxpy_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     axpy_kernel_S10
+        bne     .Ldaxpy_kernel_S10
 
-axpy_kernel_L999:
+.Ldaxpy_kernel_L999:
 
        mov     w0, wzr
        ret
index 44b0f7f..3491670 100644 (file)
@@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     dgemm_kernel_L2_BEGIN
+       ble     .Ldgemm_kernel_L2_BEGIN
 
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:
 
 //------------------------------------------------------------------------------
 
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L4_M4_BEGIN
+       ble     .Ldgemm_kernel_L4_M4_BEGIN
 
        .align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
        asr     counterL , origK, #2            // L = K / 4
        cmp     counterL , #2
-       blt     dgemm_kernel_L4_M8_32
+       blt     .Ldgemm_kernel_L4_M8_32
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     dgemm_kernel_L4_M8_22a
+       ble     .Ldgemm_kernel_L4_M8_22a
 
        .align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
        KERNEL8x4_M1
        KERNEL8x4_M2
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M8_22
+       bgt     .Ldgemm_kernel_L4_M8_22
 
        .align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        dgemm_kernel_L4_M8_44
+       b        .Ldgemm_kernel_L4_M8_44
 
        .align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     dgemm_kernel_L4_M8_40
+       ble     .Ldgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       dgemm_kernel_L4_M8_44
+       b       .Ldgemm_kernel_L4_M8_44
 
 
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #3
-       ble     dgemm_kernel_L4_M8_100
+       ble     .Ldgemm_kernel_L4_M8_100
 
        .align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     dgemm_kernel_L4_M8_46
+       bne     .Ldgemm_kernel_L4_M8_46
 
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
        lsl     temp, origK, #5
        prfm    PLDL1KEEP, [pA, temp]
        prfm    PLDL1KEEP, [ppA, temp]
@@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
        lsl     temp, origK, #5                 // k * 4 * 8
        add     pA, pA, temp
        add     ppA, ppA, temp
        subs    counterI, counterI, #1
-       bne     dgemm_kernel_L4_M8_20
+       bne     .Ldgemm_kernel_L4_M8_20
 
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     dgemm_kernel_L4_M2_BEGIN
+       ble     .Ldgemm_kernel_L4_M2_BEGIN
 
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
 
        INIT4x4
 
        mov     pB, origPB
        asr     counterL, origK, #3             // counterL = counterL / 8
        cmp     counterL, #0
-       ble     dgemm_kernel_L4_M4_40
+       ble     .Ldgemm_kernel_L4_M4_40
 
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
 
        KERNEL4x4_SUB
        KERNEL4x4_SUB
@@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_22
+       bgt     .Ldgemm_kernel_L4_M4_22
 
 
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M4_100
+       ble     .Ldgemm_kernel_L4_M4_100
 
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
 
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_42
+       bgt     .Ldgemm_kernel_L4_M4_42
 
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
 
 
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L4_M1_BEGIN
+       ble     .Ldgemm_kernel_L4_M1_BEGIN
 
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M2_40
+       ble     .Ldgemm_kernel_L4_M2_40
 
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_22
+       bgt     .Ldgemm_kernel_L4_M2_22
 
 
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M2_100
+       ble     .Ldgemm_kernel_L4_M2_100
 
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_42
+       bgt     .Ldgemm_kernel_L4_M2_42
 
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
 
 
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M1_40
+       ble     .Ldgemm_kernel_L4_M1_40
 
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_22
+       bgt     .Ldgemm_kernel_L4_M1_22
 
 
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M1_100
+       ble     .Ldgemm_kernel_L4_M1_100
 
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_42
+       bgt     .Ldgemm_kernel_L4_M1_42
 
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dgemm_kernel_L4_BEGIN
+       bgt     .Ldgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dgemm_kernel_L999   // error, N was less than 4?
+       ble     .Ldgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dgemm_kernel_L1_BEGIN
+       ble     .Ldgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 
 
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     dgemm_kernel_L2_M2_BEGIN
+       ble     .Ldgemm_kernel_L2_M2_BEGIN
 
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dgemm_kernel_L2_M4_40
+       ble     .Ldgemm_kernel_L2_M4_40
        .align 5
 
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_22
+       bgt     .Ldgemm_kernel_L2_M4_22
 
 
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M4_100
+       ble     .Ldgemm_kernel_L2_M4_100
 
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_42
+       bgt     .Ldgemm_kernel_L2_M4_42
 
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L2_M4_20
+       bgt     .Ldgemm_kernel_L2_M4_20
 
 
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L2_M1_BEGIN
+       ble     .Ldgemm_kernel_L2_M1_BEGIN
 
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dgemm_kernel_L2_M2_40
+       ble     .Ldgemm_kernel_L2_M2_40
 
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_22
+       bgt     .Ldgemm_kernel_L2_M2_22
 
 
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M2_100
+       ble     .Ldgemm_kernel_L2_M2_100
 
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_42
+       bgt     .Ldgemm_kernel_L2_M2_42
 
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
 
 
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dgemm_kernel_L2_M1_40
+       ble     .Ldgemm_kernel_L2_M1_40
 
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_22
+       bgt     .Ldgemm_kernel_L2_M1_22
 
 
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M1_100
+       ble     .Ldgemm_kernel_L2_M1_100
 
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_42
+       bgt     .Ldgemm_kernel_L2_M1_42
 
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dgemm_kernel_L999 // done
+       ble     .Ldgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:
 
 
 
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dgemm_kernel_L1_M2_BEGIN
+       ble     .Ldgemm_kernel_L1_M2_BEGIN
 
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M4_40
+       ble     .Ldgemm_kernel_L1_M4_40
        .align 5
 
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_22
+       bgt     .Ldgemm_kernel_L1_M4_22
 
 
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M4_100
+       ble     .Ldgemm_kernel_L1_M4_100
 
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_42
+       bgt     .Ldgemm_kernel_L1_M4_42
 
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L1_M4_20
+       bgt     .Ldgemm_kernel_L1_M4_20
 
 
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L1_M1_BEGIN
+       ble     .Ldgemm_kernel_L1_M1_BEGIN
 
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M2_40
+       ble     .Ldgemm_kernel_L1_M2_40
 
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_22
+       bgt     .Ldgemm_kernel_L1_M2_22
 
 
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M2_100
+       ble     .Ldgemm_kernel_L1_M2_100
 
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_42
+       bgt     .Ldgemm_kernel_L1_M2_42
 
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
 
 
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M1_40
+       ble     .Ldgemm_kernel_L1_M1_40
 
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_22
+       bgt     .Ldgemm_kernel_L1_M1_22
 
 
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M1_100
+       ble     .Ldgemm_kernel_L1_M1_100
 
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_42
+       bgt     .Ldgemm_kernel_L1_M1_42
 
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
 
 
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index b04dbb5..ced26b4 100644 (file)
@@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #3          // J = J / 8
        cmp     counterJ, #0
-       ble     dgemm_kernel_L4_BEGIN
+       ble     .Ldgemm_kernel_L4_BEGIN
 
 /******************************************************************************/
 
-dgemm_kernel_L8_BEGIN:
+.Ldgemm_kernel_L8_BEGIN:
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #3
 
        mov     pA, origPA                      // pA = start of A array
 
-dgemm_kernel_L8_M4_BEGIN:
+.Ldgemm_kernel_L8_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dgemm_kernel_L8_M2_BEGIN
+       ble     .Ldgemm_kernel_L8_M2_BEGIN
 
-dgemm_kernel_L8_M4_20:
+.Ldgemm_kernel_L8_M4_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dgemm_kernel_L8_M4_32
+       blt     .Ldgemm_kernel_L8_M4_32
 
        KERNEL4x8_I                             // do one in the K
        KERNEL4x8_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     dgemm_kernel_L8_M4_22a
+       ble     .Ldgemm_kernel_L8_M4_22a
        .align 5
 
-dgemm_kernel_L8_M4_22:
+.Ldgemm_kernel_L8_M4_22:
 
        KERNEL4x8_M1
        KERNEL4x8_M2
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L8_M4_22
+       bgt     .Ldgemm_kernel_L8_M4_22
 
 
-dgemm_kernel_L8_M4_22a:
+.Ldgemm_kernel_L8_M4_22a:
 
        KERNEL4x8_M1
        KERNEL4x8_E
 
-       b        dgemm_kernel_L8_M4_44
+       b        .Ldgemm_kernel_L8_M4_44
 
-dgemm_kernel_L8_M4_32:
+.Ldgemm_kernel_L8_M4_32:
 
        tst     counterL, #1
-       ble     dgemm_kernel_L8_M4_40
+       ble     .Ldgemm_kernel_L8_M4_40
 
        KERNEL4x8_I
 
        KERNEL4x8_E
 
-       b       dgemm_kernel_L8_M4_44
+       b       .Ldgemm_kernel_L8_M4_44
 
 
-dgemm_kernel_L8_M4_40:
+.Ldgemm_kernel_L8_M4_40:
 
        INIT4x8
 
-dgemm_kernel_L8_M4_44:
+.Ldgemm_kernel_L8_M4_44:
 
        ands    counterL , origK, #1
-       ble     dgemm_kernel_L8_M4_100
+       ble     .Ldgemm_kernel_L8_M4_100
 
-dgemm_kernel_L8_M4_46:
+.Ldgemm_kernel_L8_M4_46:
 
        KERNEL4x8_SUB
 
-dgemm_kernel_L8_M4_100:
+.Ldgemm_kernel_L8_M4_100:
 
        SAVE4x8
 
-dgemm_kernel_L8_M4_END:
+.Ldgemm_kernel_L8_M4_END:
        subs    counterI, counterI, #1
-       bne     dgemm_kernel_L8_M4_20
+       bne     .Ldgemm_kernel_L8_M4_20
 
-dgemm_kernel_L8_M2_BEGIN:
+.Ldgemm_kernel_L8_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L8_END
+       ble     .Ldgemm_kernel_L8_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L8_M1_BEGIN
+       ble     .Ldgemm_kernel_L8_M1_BEGIN
 
-dgemm_kernel_L8_M2_20:
+.Ldgemm_kernel_L8_M2_20:
 
        INIT2x8
 
@@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L8_M2_40
+       ble     .Ldgemm_kernel_L8_M2_40
 
-dgemm_kernel_L8_M2_22:
+.Ldgemm_kernel_L8_M2_22:
 
        KERNEL2x8_SUB
        KERNEL2x8_SUB
@@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L8_M2_22
+       bgt     .Ldgemm_kernel_L8_M2_22
 
 
-dgemm_kernel_L8_M2_40:
+.Ldgemm_kernel_L8_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L8_M2_100
+       ble     .Ldgemm_kernel_L8_M2_100
 
-dgemm_kernel_L8_M2_42:
+.Ldgemm_kernel_L8_M2_42:
 
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L8_M2_42
+       bgt     .Ldgemm_kernel_L8_M2_42
 
-dgemm_kernel_L8_M2_100:
+.Ldgemm_kernel_L8_M2_100:
 
        SAVE2x8
 
-dgemm_kernel_L8_M2_END:
+.Ldgemm_kernel_L8_M2_END:
 
 
-dgemm_kernel_L8_M1_BEGIN:
+.Ldgemm_kernel_L8_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L8_END
+       ble     .Ldgemm_kernel_L8_END
 
-dgemm_kernel_L8_M1_20:
+.Ldgemm_kernel_L8_M1_20:
 
        INIT1x8
 
@@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L8_M1_40
+       ble     .Ldgemm_kernel_L8_M1_40
 
-dgemm_kernel_L8_M1_22:
+.Ldgemm_kernel_L8_M1_22:
        KERNEL1x8_SUB
        KERNEL1x8_SUB
        KERNEL1x8_SUB
@@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L8_M1_22
+       bgt     .Ldgemm_kernel_L8_M1_22
 
 
-dgemm_kernel_L8_M1_40:
+.Ldgemm_kernel_L8_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L8_M1_100
+       ble     .Ldgemm_kernel_L8_M1_100
 
-dgemm_kernel_L8_M1_42:
+.Ldgemm_kernel_L8_M1_42:
 
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L8_M1_42
+       bgt     .Ldgemm_kernel_L8_M1_42
 
-dgemm_kernel_L8_M1_100:
+.Ldgemm_kernel_L8_M1_100:
 
        SAVE1x8
 
-dgemm_kernel_L8_END:
+.Ldgemm_kernel_L8_END:
 
        lsl     temp, origK, #6
        add     origPB, origPB, temp            // B = B + K * 8 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dgemm_kernel_L8_BEGIN
+       bgt     .Ldgemm_kernel_L8_BEGIN
 
 
 /******************************************************************************/
 
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #7
-       ble     dgemm_kernel_L999
+       ble     .Ldgemm_kernel_L999
 
        tst     counterJ , #4
-       ble     dgemm_kernel_L2_BEGIN
+       ble     .Ldgemm_kernel_L2_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
 
        mov     pA, origPA                      // pA = start of A array
 
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dgemm_kernel_L4_M2_BEGIN
+       ble     .Ldgemm_kernel_L4_M2_BEGIN
 
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dgemm_kernel_L4_M4_32
+       blt     .Ldgemm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     dgemm_kernel_L4_M4_22a
+       ble     .Ldgemm_kernel_L4_M4_22a
        .align 5
 
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_22
+       bgt     .Ldgemm_kernel_L4_M4_22
 
 
-dgemm_kernel_L4_M4_22a:
+.Ldgemm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        dgemm_kernel_L4_M4_44
+       b        .Ldgemm_kernel_L4_M4_44
 
-dgemm_kernel_L4_M4_32:
+.Ldgemm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     dgemm_kernel_L4_M4_40
+       ble     .Ldgemm_kernel_L4_M4_40
 
        KERNEL4x4_I
 
        KERNEL4x4_E
 
-       b       dgemm_kernel_L4_M4_44
+       b       .Ldgemm_kernel_L4_M4_44
 
 
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-dgemm_kernel_L4_M4_44:
+.Ldgemm_kernel_L4_M4_44:
 
        ands    counterL , origK, #1
-       ble     dgemm_kernel_L4_M4_100
+       ble     .Ldgemm_kernel_L4_M4_100
 
-dgemm_kernel_L4_M4_46:
+.Ldgemm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     dgemm_kernel_L4_M4_20
+       bne     .Ldgemm_kernel_L4_M4_20
 
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L4_M1_BEGIN
+       ble     .Ldgemm_kernel_L4_M1_BEGIN
 
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M2_40
+       ble     .Ldgemm_kernel_L4_M2_40
 
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_22
+       bgt     .Ldgemm_kernel_L4_M2_22
 
 
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M2_100
+       ble     .Ldgemm_kernel_L4_M2_100
 
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_42
+       bgt     .Ldgemm_kernel_L4_M2_42
 
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
 
 
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M1_40
+       ble     .Ldgemm_kernel_L4_M1_40
 
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_22
+       bgt     .Ldgemm_kernel_L4_M1_22
 
 
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M1_100
+       ble     .Ldgemm_kernel_L4_M1_100
 
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_42
+       bgt     .Ldgemm_kernel_L4_M1_42
 
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
 /******************************************************************************/
 
-dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dgemm_kernel_L999   // error, N was less than 4?
+       ble     .Ldgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dgemm_kernel_L1_BEGIN
+       ble     .Ldgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
        mov     pA, origPA                      // pA = A
 
 
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     dgemm_kernel_L2_M2_BEGIN
+       ble     .Ldgemm_kernel_L2_M2_BEGIN
 
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dgemm_kernel_L2_M4_40
+       ble     .Ldgemm_kernel_L2_M4_40
        .align 5
 
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_22
+       bgt     .Ldgemm_kernel_L2_M4_22
 
 
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M4_100
+       ble     .Ldgemm_kernel_L2_M4_100
 
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_42
+       bgt     .Ldgemm_kernel_L2_M4_42
 
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L2_M4_20
+       bgt     .Ldgemm_kernel_L2_M4_20
 
 
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L2_M1_BEGIN
+       ble     .Ldgemm_kernel_L2_M1_BEGIN
 
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dgemm_kernel_L2_M2_40
+       ble     .Ldgemm_kernel_L2_M2_40
 
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_22
+       bgt     .Ldgemm_kernel_L2_M2_22
 
 
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M2_100
+       ble     .Ldgemm_kernel_L2_M2_100
 
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_42
+       bgt     .Ldgemm_kernel_L2_M2_42
 
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
 
 
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dgemm_kernel_L2_M1_40
+       ble     .Ldgemm_kernel_L2_M1_40
 
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_22
+       bgt     .Ldgemm_kernel_L2_M1_22
 
 
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M1_100
+       ble     .Ldgemm_kernel_L2_M1_100
 
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_42
+       bgt     .Ldgemm_kernel_L2_M1_42
 
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dgemm_kernel_L999 // done
+       ble     .Ldgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:
 
        mov     pA, origPA                      // pA = A
 
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dgemm_kernel_L1_M2_BEGIN
+       ble     .Ldgemm_kernel_L1_M2_BEGIN
 
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M4_40
+       ble     .Ldgemm_kernel_L1_M4_40
        .align 5
 
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_22
+       bgt     .Ldgemm_kernel_L1_M4_22
 
 
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M4_100
+       ble     .Ldgemm_kernel_L1_M4_100
 
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_42
+       bgt     .Ldgemm_kernel_L1_M4_42
 
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L1_M4_20
+       bgt     .Ldgemm_kernel_L1_M4_20
 
 
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L1_M1_BEGIN
+       ble     .Ldgemm_kernel_L1_M1_BEGIN
 
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M2_40
+       ble     .Ldgemm_kernel_L1_M2_40
 
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_22
+       bgt     .Ldgemm_kernel_L1_M2_22
 
 
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M2_100
+       ble     .Ldgemm_kernel_L1_M2_100
 
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_42
+       bgt     .Ldgemm_kernel_L1_M2_42
 
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
 
 
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M1_40
+       ble     .Ldgemm_kernel_L1_M1_40
 
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_22
+       bgt     .Ldgemm_kernel_L1_M1_22
 
 
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M1_100
+       ble     .Ldgemm_kernel_L1_M1_100
 
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_42
+       bgt     .Ldgemm_kernel_L1_M1_42
 
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
 
 
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 3fd74fc..af3aa02 100644 (file)
@@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     dgemm_kernel_L2_BEGIN
+       ble     .Ldgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
        .align 5
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L4_M4_BEGIN
+       ble     .Ldgemm_kernel_L4_M4_BEGIN
 
        .align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #3            // L = K / 8
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dgemm_kernel_L4_M8_32
+       blt     .Ldgemm_kernel_L4_M8_32
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     dgemm_kernel_L4_M8_22a
+       ble     .Ldgemm_kernel_L4_M8_22a
 
        .align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M8_22
+       bgt     .Ldgemm_kernel_L4_M8_22
 
        .align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        dgemm_kernel_L4_M8_44
+       b        .Ldgemm_kernel_L4_M8_44
 
        .align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     dgemm_kernel_L4_M8_40
+       ble     .Ldgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       dgemm_kernel_L4_M8_44
+       b       .Ldgemm_kernel_L4_M8_44
 
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #7
-       ble     dgemm_kernel_L4_M8_100
+       ble     .Ldgemm_kernel_L4_M8_100
 
        .align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     dgemm_kernel_L4_M8_46
+       bne     .Ldgemm_kernel_L4_M8_46
 
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE8x4
 
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     dgemm_kernel_L4_M8_20
+       bne     .Ldgemm_kernel_L4_M8_20
 
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     dgemm_kernel_L4_M2_BEGIN
+       ble     .Ldgemm_kernel_L4_M2_BEGIN
 
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
 
        INIT4x4
 
@@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M4_40
+       ble     .Ldgemm_kernel_L4_M4_40
 
        .align 5
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
 
        KERNEL4x4_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_22
+       bgt     .Ldgemm_kernel_L4_M4_22
 
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M4_100
+       ble     .Ldgemm_kernel_L4_M4_100
 
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
 
        KERNEL4x4_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_42
+       bgt     .Ldgemm_kernel_L4_M4_42
 
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
 
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L4_M1_BEGIN
+       ble     .Ldgemm_kernel_L4_M1_BEGIN
 
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M2_40
+       ble     .Ldgemm_kernel_L4_M2_40
 
        .align 5
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_22
+       bgt     .Ldgemm_kernel_L4_M2_22
 
 
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M2_100
+       ble     .Ldgemm_kernel_L4_M2_100
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_42
+       bgt     .Ldgemm_kernel_L4_M2_42
 
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
 
 
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M1_40
+       ble     .Ldgemm_kernel_L4_M1_40
 
        .align 5
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
        KERNEL1x4_SUB
@@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_22
+       bgt     .Ldgemm_kernel_L4_M1_22
 
 
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M1_100
+       ble     .Ldgemm_kernel_L4_M1_100
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_42
+       bgt     .Ldgemm_kernel_L4_M1_42
 
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dgemm_kernel_L4_BEGIN
+       bgt     .Ldgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dgemm_kernel_L999   // error, N was less than 4?
+       ble     .Ldgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dgemm_kernel_L1_BEGIN
+       ble     .Ldgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
@@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-dgemm_kernel_L2_M8_BEGIN:
+.Ldgemm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L2_M4_BEGIN
+       ble     .Ldgemm_kernel_L2_M4_BEGIN
 
        .align 5
-dgemm_kernel_L2_M8_20:
+.Ldgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dgemm_kernel_L2_M8_40
+       ble     .Ldgemm_kernel_L2_M8_40
 
        .align 5
-dgemm_kernel_L2_M8_22:
+.Ldgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M8_22
+       bgt     .Ldgemm_kernel_L2_M8_22
 
-dgemm_kernel_L2_M8_40:
+.Ldgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M8_100
+       ble     .Ldgemm_kernel_L2_M8_100
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M8_42:
+.Ldgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M8_42
+       bgt     .Ldgemm_kernel_L2_M8_42
 
-dgemm_kernel_L2_M8_100:
+.Ldgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-dgemm_kernel_L2_M8_END:
+.Ldgemm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L2_M8_20
+       bgt     .Ldgemm_kernel_L2_M8_20
 
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     dgemm_kernel_L2_M2_BEGIN
+       ble     .Ldgemm_kernel_L2_M2_BEGIN
 
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dgemm_kernel_L2_M4_40
+       ble     .Ldgemm_kernel_L2_M4_40
 
        .align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        KERNEL4x2_SUB
@@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_22
+       bgt     .Ldgemm_kernel_L2_M4_22
 
 
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M4_100
+       ble     .Ldgemm_kernel_L2_M4_100
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_42
+       bgt     .Ldgemm_kernel_L2_M4_42
 
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
 
 
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L2_M1_BEGIN
+       ble     .Ldgemm_kernel_L2_M1_BEGIN
 
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dgemm_kernel_L2_M2_40
+       ble     .Ldgemm_kernel_L2_M2_40
 
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_22
+       bgt     .Ldgemm_kernel_L2_M2_22
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M2_100
+       ble     .Ldgemm_kernel_L2_M2_100
 
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_42
+       bgt     .Ldgemm_kernel_L2_M2_42
 
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
 
 
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dgemm_kernel_L2_M1_40
+       ble     .Ldgemm_kernel_L2_M1_40
 
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_22
+       bgt     .Ldgemm_kernel_L2_M1_22
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M1_100
+       ble     .Ldgemm_kernel_L2_M1_100
 
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_42
+       bgt     .Ldgemm_kernel_L2_M1_42
 
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dgemm_kernel_L999 // done
+       ble     .Ldgemm_kernel_L999 // done
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC , pC , LDC                   // Update pC to point to next
 
        mov     pA, origPA                      // pA = A
 
-dgemm_kernel_L1_M8_BEGIN:
+.Ldgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L1_M4_BEGIN
+       ble     .Ldgemm_kernel_L1_M4_BEGIN
 
        .align 5
-dgemm_kernel_L1_M8_20:
+.Ldgemm_kernel_L1_M8_20:
 
        INIT8x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M8_40
+       ble     .Ldgemm_kernel_L1_M8_40
 
        .align 5
-dgemm_kernel_L1_M8_22:
+.Ldgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M8_22
+       bgt     .Ldgemm_kernel_L1_M8_22
 
 
-dgemm_kernel_L1_M8_40:
+.Ldgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M8_100
+       ble     .Ldgemm_kernel_L1_M8_100
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M8_42:
+.Ldgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M8_42
+       bgt     .Ldgemm_kernel_L1_M8_42
 
-dgemm_kernel_L1_M8_100:
+.Ldgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-dgemm_kernel_L1_M8_END:
+.Ldgemm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L1_M8_20
+       bgt     .Ldgemm_kernel_L1_M8_20
 
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     dgemm_kernel_L1_M2_BEGIN
+       ble     .Ldgemm_kernel_L1_M2_BEGIN
 
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M4_40
+       ble     .Ldgemm_kernel_L1_M4_40
 
        .align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        KERNEL4x1_SUB
@@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_22
+       bgt     .Ldgemm_kernel_L1_M4_22
 
 
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M4_100
+       ble     .Ldgemm_kernel_L1_M4_100
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_42
+       bgt     .Ldgemm_kernel_L1_M4_42
 
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
 
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L1_M1_BEGIN
+       ble     .Ldgemm_kernel_L1_M1_BEGIN
 
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M2_40
+       ble     .Ldgemm_kernel_L1_M2_40
 
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_22
+       bgt     .Ldgemm_kernel_L1_M2_22
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M2_100
+       ble     .Ldgemm_kernel_L1_M2_100
 
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_42
+       bgt     .Ldgemm_kernel_L1_M2_42
 
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
 
 
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M1_40
+       ble     .Ldgemm_kernel_L1_M1_40
 
 
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
@@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_22
+       bgt     .Ldgemm_kernel_L1_M1_22
 
 
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M1_100
+       ble     .Ldgemm_kernel_L1_M1_100
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_42
+       bgt     .Ldgemm_kernel_L1_M1_42
 
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
 
 
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 86865d8..598db6e 100644 (file)
@@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     dgemm_kernel_L2_BEGIN
+       ble     .Ldgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
        .align 5
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L4_M4_BEGIN
+       ble     .Ldgemm_kernel_L4_M4_BEGIN
 
        .align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #7            // L = K / 128
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dgemm_kernel_L4_M8_32
+       blt     .Ldgemm_kernel_L4_M8_32
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
        KERNEL8x4_M1_M2_x1
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     dgemm_kernel_L4_M8_22a
+       ble     .Ldgemm_kernel_L4_M8_22a
 
        .align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1_M2_x64
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M8_22
+       bgt     .Ldgemm_kernel_L4_M8_22
 
        .align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1_M2_x32
        KERNEL8x4_M1_M2_x16
@@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        dgemm_kernel_L4_M8_44
+       b        .Ldgemm_kernel_L4_M8_44
 
        .align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     dgemm_kernel_L4_M8_40
+       ble     .Ldgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       dgemm_kernel_L4_M8_44
+       b       .Ldgemm_kernel_L4_M8_44
 
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #127
-       ble     dgemm_kernel_L4_M8_100
+       ble     .Ldgemm_kernel_L4_M8_100
 
        .align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     dgemm_kernel_L4_M8_46
+       bne     .Ldgemm_kernel_L4_M8_46
 
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
        prfm    PLDL2KEEP, [pCRow0, C_PRE_SIZE]
        prfm    PLDL2KEEP, [pCRow1, C_PRE_SIZE]
        prfm    PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     dgemm_kernel_L4_M8_20
+       bne     .Ldgemm_kernel_L4_M8_20
 
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     dgemm_kernel_L4_M2_BEGIN
+       ble     .Ldgemm_kernel_L4_M2_BEGIN
 
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
 
        INIT4x4
 
@@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M4_40
+       ble     .Ldgemm_kernel_L4_M4_40
 
        .align 5
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
 
        KERNEL4x4_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_22
+       bgt     .Ldgemm_kernel_L4_M4_22
 
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M4_100
+       ble     .Ldgemm_kernel_L4_M4_100
 
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
 
        KERNEL4x4_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M4_42
+       bgt     .Ldgemm_kernel_L4_M4_42
 
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
 
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L4_M1_BEGIN
+       ble     .Ldgemm_kernel_L4_M1_BEGIN
 
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M2_40
+       ble     .Ldgemm_kernel_L4_M2_40
 
        .align 5
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_22
+       bgt     .Ldgemm_kernel_L4_M2_22
 
 
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M2_100
+       ble     .Ldgemm_kernel_L4_M2_100
 
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE_64]
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M2_42
+       bgt     .Ldgemm_kernel_L4_M2_42
 
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
 
 
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L4_END
+       ble     .Ldgemm_kernel_L4_END
 
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L4_M1_40
+       ble     .Ldgemm_kernel_L4_M1_40
 
        .align 5
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
        KERNEL1x4_SUB
@@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_22
+       bgt     .Ldgemm_kernel_L4_M1_22
 
 
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L4_M1_100
+       ble     .Ldgemm_kernel_L4_M1_100
 
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L4_M1_42
+       bgt     .Ldgemm_kernel_L4_M1_42
 
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
 
        lsl     temp, origK, #5
        add     origPB, origPB, temp            // B = B + K * 4 * 8
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dgemm_kernel_L4_BEGIN
+       bgt     .Ldgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dgemm_kernel_L999   // error, N was less than 4?
+       ble     .Ldgemm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dgemm_kernel_L1_BEGIN
+       ble     .Ldgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
@@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-dgemm_kernel_L2_M8_BEGIN:
+.Ldgemm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L2_M4_BEGIN
+       ble     .Ldgemm_kernel_L2_M4_BEGIN
 
        .align 5
-dgemm_kernel_L2_M8_20:
+.Ldgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dgemm_kernel_L2_M8_40
+       ble     .Ldgemm_kernel_L2_M8_40
 
        .align 5
-dgemm_kernel_L2_M8_22:
+.Ldgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M8_22
+       bgt     .Ldgemm_kernel_L2_M8_22
 
-dgemm_kernel_L2_M8_40:
+.Ldgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M8_100
+       ble     .Ldgemm_kernel_L2_M8_100
 
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M8_42:
+.Ldgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M8_42
+       bgt     .Ldgemm_kernel_L2_M8_42
 
-dgemm_kernel_L2_M8_100:
+.Ldgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-dgemm_kernel_L2_M8_END:
+.Ldgemm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L2_M8_20
+       bgt     .Ldgemm_kernel_L2_M8_20
 
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     dgemm_kernel_L2_M2_BEGIN
+       ble     .Ldgemm_kernel_L2_M2_BEGIN
 
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dgemm_kernel_L2_M4_40
+       ble     .Ldgemm_kernel_L2_M4_40
 
        .align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        KERNEL4x2_SUB
@@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_22
+       bgt     .Ldgemm_kernel_L2_M4_22
 
 
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M4_100
+       ble     .Ldgemm_kernel_L2_M4_100
 
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M4_42
+       bgt     .Ldgemm_kernel_L2_M4_42
 
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
 
 
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L2_M1_BEGIN
+       ble     .Ldgemm_kernel_L2_M1_BEGIN
 
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dgemm_kernel_L2_M2_40
+       ble     .Ldgemm_kernel_L2_M2_40
 
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_22
+       bgt     .Ldgemm_kernel_L2_M2_22
 
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE_64]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M2_100
+       ble     .Ldgemm_kernel_L2_M2_100
 
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M2_42
+       bgt     .Ldgemm_kernel_L2_M2_42
 
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
 
 
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L2_END
+       ble     .Ldgemm_kernel_L2_END
 
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dgemm_kernel_L2_M1_40
+       ble     .Ldgemm_kernel_L2_M1_40
 
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_22
+       bgt     .Ldgemm_kernel_L2_M1_22
 
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L2_M1_100
+       ble     .Ldgemm_kernel_L2_M1_100
 
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L2_M1_42
+       bgt     .Ldgemm_kernel_L2_M1_42
 
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
 
 /******************************************************************************/
 
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dgemm_kernel_L999 // done
+       ble     .Ldgemm_kernel_L999 // done
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC , pC , LDC                   // Update pC to point to next
 
        mov     pA, origPA                      // pA = A
 
-dgemm_kernel_L1_M8_BEGIN:
+.Ldgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dgemm_kernel_L1_M4_BEGIN
+       ble     .Ldgemm_kernel_L1_M4_BEGIN
 
        .align 5
-dgemm_kernel_L1_M8_20:
+.Ldgemm_kernel_L1_M8_20:
 
        INIT8x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M8_40
+       ble     .Ldgemm_kernel_L1_M8_40
 
        .align 5
-dgemm_kernel_L1_M8_22:
+.Ldgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M8_22
+       bgt     .Ldgemm_kernel_L1_M8_22
 
 
-dgemm_kernel_L1_M8_40:
+.Ldgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M8_100
+       ble     .Ldgemm_kernel_L1_M8_100
 
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M8_42:
+.Ldgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M8_42
+       bgt     .Ldgemm_kernel_L1_M8_42
 
-dgemm_kernel_L1_M8_100:
+.Ldgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-dgemm_kernel_L1_M8_END:
+.Ldgemm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     dgemm_kernel_L1_M8_20
+       bgt     .Ldgemm_kernel_L1_M8_20
 
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     dgemm_kernel_L1_M2_BEGIN
+       ble     .Ldgemm_kernel_L1_M2_BEGIN
 
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M4_40
+       ble     .Ldgemm_kernel_L1_M4_40
 
        .align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        KERNEL4x1_SUB
@@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_22
+       bgt     .Ldgemm_kernel_L1_M4_22
 
 
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M4_100
+       ble     .Ldgemm_kernel_L1_M4_100
 
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M4_42
+       bgt     .Ldgemm_kernel_L1_M4_42
 
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
 
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dgemm_kernel_L1_M1_BEGIN
+       ble     .Ldgemm_kernel_L1_M1_BEGIN
 
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M2_40
+       ble     .Ldgemm_kernel_L1_M2_40
 
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_22
+       bgt     .Ldgemm_kernel_L1_M2_22
 
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE_64]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M2_100
+       ble     .Ldgemm_kernel_L1_M2_100
 
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M2_42
+       bgt     .Ldgemm_kernel_L1_M2_42
 
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
 
 
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dgemm_kernel_L1_END
+       ble     .Ldgemm_kernel_L1_END
 
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dgemm_kernel_L1_M1_40
+       ble     .Ldgemm_kernel_L1_M1_40
 
 
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
@@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_22
+       bgt     .Ldgemm_kernel_L1_M1_22
 
 
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     dgemm_kernel_L1_M1_100
+       ble     .Ldgemm_kernel_L1_M1_100
 
        prfm    PLDL1KEEP, [pA, A_PRE_SIZE]
        prfm    PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dgemm_kernel_L1_M1_42
+       bgt     .Ldgemm_kernel_L1_M1_42
 
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
 
 
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index c98a732..29d274d 100644 (file)
@@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lsl     LDA, LDA, #3                                    // LDA = LDA * SIZE
 
-dgemm_ncopy_L4_BEGIN:
+.Ldgemm_ncopy_L4_BEGIN:
 
        asr     J, N, #2                                        // J = N / 4
        cmp     J, #0
-       ble     dgemm_ncopy_L2_BEGIN
+       ble     .Ldgemm_ncopy_L2_BEGIN
 
        .align  5
-dgemm_ncopy_L4_M4_BEGIN:
+.Ldgemm_ncopy_L4_M4_BEGIN:
 
        mov     A01, A00
        add     A02, A01, LDA
@@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:
 
        asr     I, M, #2                                        // I = M / 4
        cmp     I, #0
-       ble     dgemm_ncopy_L4_M4_40
+       ble     .Ldgemm_ncopy_L4_M4_40
 
        .align  5
-dgemm_ncopy_L4_M4_20:
+.Ldgemm_ncopy_L4_M4_20:
 
        COPY4x4
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L4_M4_20
+       bne     .Ldgemm_ncopy_L4_M4_20
 
 
-dgemm_ncopy_L4_M4_40:
+.Ldgemm_ncopy_L4_M4_40:
 
        and     I, M , #3
        cmp     I, #0
-       ble     dgemm_ncopy_L4_M4_END
+       ble     .Ldgemm_ncopy_L4_M4_END
 
        .align  5
-dgemm_ncopy_L4_M4_60:
+.Ldgemm_ncopy_L4_M4_60:
 
        COPY1x4
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L4_M4_60
+       bne     .Ldgemm_ncopy_L4_M4_60
 
 
-dgemm_ncopy_L4_M4_END:
+.Ldgemm_ncopy_L4_M4_END:
 
        subs    J , J, #1                                               // j--
-       bne     dgemm_ncopy_L4_M4_BEGIN
+       bne     .Ldgemm_ncopy_L4_M4_BEGIN
 
 
 
 /*********************************************************************************************/
 
-dgemm_ncopy_L2_BEGIN:
+.Ldgemm_ncopy_L2_BEGIN:
 
        tst     N, #3
-       ble     dgemm_ncopy_L999
+       ble     .Ldgemm_ncopy_L999
 
        tst     N, #2
-       ble     dgemm_ncopy_L1_BEGIN
+       ble     .Ldgemm_ncopy_L1_BEGIN
 
-dgemm_ncopy_L2_M4_BEGIN:
+.Ldgemm_ncopy_L2_M4_BEGIN:
        mov     A01, A00
        add     A02, A01, LDA
        add     A00, A02, LDA
 
        asr     I, M, #2                                        // I = M / 4
        cmp     I, #0
-       ble     dgemm_ncopy_L2_M4_40
+       ble     .Ldgemm_ncopy_L2_M4_40
 
        .align  5
-dgemm_ncopy_L2_M4_20:
+.Ldgemm_ncopy_L2_M4_20:
 
        COPY4x2
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L2_M4_20
+       bne     .Ldgemm_ncopy_L2_M4_20
 
 
-dgemm_ncopy_L2_M4_40:
+.Ldgemm_ncopy_L2_M4_40:
 
        and     I, M , #3
        cmp     I, #0
-       ble     dgemm_ncopy_L2_M4_END
+       ble     .Ldgemm_ncopy_L2_M4_END
 
        .align  5
-dgemm_ncopy_L2_M4_60:
+.Ldgemm_ncopy_L2_M4_60:
 
        COPY1x2
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L2_M4_60
+       bne     .Ldgemm_ncopy_L2_M4_60
 
 
-dgemm_ncopy_L2_M4_END:
+.Ldgemm_ncopy_L2_M4_END:
 
 
 /*********************************************************************************************/
 
-dgemm_ncopy_L1_BEGIN:
+.Ldgemm_ncopy_L1_BEGIN:
 
        tst     N, #1
-       ble     dgemm_ncopy_L999
+       ble     .Ldgemm_ncopy_L999
 
 
-dgemm_ncopy_L1_M4_BEGIN:
+.Ldgemm_ncopy_L1_M4_BEGIN:
 
        mov     A01, A00
 
        asr     I, M, #2                                        // I = M / 4
        cmp     I, #0
-       ble     dgemm_ncopy_L1_M4_40
+       ble     .Ldgemm_ncopy_L1_M4_40
 
        .align  5
-dgemm_ncopy_L1_M4_20:
+.Ldgemm_ncopy_L1_M4_20:
 
        COPY4x1
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L1_M4_20
+       bne     .Ldgemm_ncopy_L1_M4_20
 
 
-dgemm_ncopy_L1_M4_40:
+.Ldgemm_ncopy_L1_M4_40:
 
        and     I, M , #3
        cmp     I, #0
-       ble     dgemm_ncopy_L1_M4_END
+       ble     .Ldgemm_ncopy_L1_M4_END
 
        .align  5
-dgemm_ncopy_L1_M4_60:
+.Ldgemm_ncopy_L1_M4_60:
 
        COPY1x1
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L1_M4_60
+       bne     .Ldgemm_ncopy_L1_M4_60
 
 
-dgemm_ncopy_L1_M4_END:
+.Ldgemm_ncopy_L1_M4_END:
 
-dgemm_ncopy_L999:
+.Ldgemm_ncopy_L999:
 
        mov     x0, #0
        RESTORE_REGS
index 1f237b4..3664248 100644 (file)
@@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lsl     LDA, LDA, #3                                    // LDA = LDA * SIZE
 
-dgemm_ncopy_L8_BEGIN:
+.Ldgemm_ncopy_L8_BEGIN:
 
        asr     J, N, #3                                        // J = N / 8
        cmp     J, #0
-       ble     dgemm_ncopy_L4_BEGIN
+       ble     .Ldgemm_ncopy_L4_BEGIN
 
-dgemm_ncopy_L8_M8_BEGIN:
+.Ldgemm_ncopy_L8_M8_BEGIN:
 
        mov     A01, A00
        add     A02, A01, LDA
@@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:
 
        asr     I, M, #3                                        // I = M / 8
        cmp     I, #0
-       ble     dgemm_ncopy_L8_M8_40
+       ble     .Ldgemm_ncopy_L8_M8_40
 
-dgemm_ncopy_L8_M8_20:
+.Ldgemm_ncopy_L8_M8_20:
 
        COPY8x8
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L8_M8_20
+       bne     .Ldgemm_ncopy_L8_M8_20
 
 
-dgemm_ncopy_L8_M8_40:
+.Ldgemm_ncopy_L8_M8_40:
 
        and     I, M , #7
        cmp     I, #0
-       ble     dgemm_ncopy_L8_M8_END
+       ble     .Ldgemm_ncopy_L8_M8_END
 
-dgemm_ncopy_L8_M8_60:
+.Ldgemm_ncopy_L8_M8_60:
 
        COPY1x8
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L8_M8_60
+       bne     .Ldgemm_ncopy_L8_M8_60
 
 
-dgemm_ncopy_L8_M8_END:
+.Ldgemm_ncopy_L8_M8_END:
 
        subs    J , J, #1                                               // j--
-       bne     dgemm_ncopy_L8_M8_BEGIN
+       bne     .Ldgemm_ncopy_L8_M8_BEGIN
 
 /*********************************************************************************************/
 
-dgemm_ncopy_L4_BEGIN:
+.Ldgemm_ncopy_L4_BEGIN:
 
        tst     N, #7
-       ble     dgemm_ncopy_L999
+       ble     .Ldgemm_ncopy_L999
 
        tst     N, #4
-       ble     dgemm_ncopy_L2_BEGIN
+       ble     .Ldgemm_ncopy_L2_BEGIN
 
-dgemm_ncopy_L4_M8_BEGIN:
+.Ldgemm_ncopy_L4_M8_BEGIN:
 
        mov     A01, A00
        add     A02, A01, LDA
@@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:
 
        asr     I, M, #3                                        // I = M / 8
        cmp     I, #0
-       ble     dgemm_ncopy_L4_M8_40
+       ble     .Ldgemm_ncopy_L4_M8_40
 
-dgemm_ncopy_L4_M8_20:
+.Ldgemm_ncopy_L4_M8_20:
 
        COPY8x4
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L4_M8_20
+       bne     .Ldgemm_ncopy_L4_M8_20
 
 
-dgemm_ncopy_L4_M8_40:
+.Ldgemm_ncopy_L4_M8_40:
 
        and     I, M , #7
        cmp     I, #0
-       ble     dgemm_ncopy_L4_M8_END
+       ble     .Ldgemm_ncopy_L4_M8_END
 
-dgemm_ncopy_L4_M8_60:
+.Ldgemm_ncopy_L4_M8_60:
 
        COPY1x4
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L4_M8_60
+       bne     .Ldgemm_ncopy_L4_M8_60
 
 
-dgemm_ncopy_L4_M8_END:
+.Ldgemm_ncopy_L4_M8_END:
 
 
 /*********************************************************************************************/
 
-dgemm_ncopy_L2_BEGIN:
+.Ldgemm_ncopy_L2_BEGIN:
 
        tst     N, #3
-       ble     dgemm_ncopy_L999
+       ble     .Ldgemm_ncopy_L999
 
        tst     N, #2
-       ble     dgemm_ncopy_L1_BEGIN
+       ble     .Ldgemm_ncopy_L1_BEGIN
 
-dgemm_ncopy_L2_M8_BEGIN:
+.Ldgemm_ncopy_L2_M8_BEGIN:
        mov     A01, A00
        add     A02, A01, LDA
        add     A00, A02, LDA
 
        asr     I, M, #3                                        // I = M / 8
        cmp     I, #0
-       ble     dgemm_ncopy_L2_M8_40
+       ble     .Ldgemm_ncopy_L2_M8_40
 
-dgemm_ncopy_L2_M8_20:
+.Ldgemm_ncopy_L2_M8_20:
 
        COPY8x2
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L2_M8_20
+       bne     .Ldgemm_ncopy_L2_M8_20
 
 
-dgemm_ncopy_L2_M8_40:
+.Ldgemm_ncopy_L2_M8_40:
 
        and     I, M , #7
        cmp     I, #0
-       ble     dgemm_ncopy_L2_M8_END
+       ble     .Ldgemm_ncopy_L2_M8_END
 
-dgemm_ncopy_L2_M8_60:
+.Ldgemm_ncopy_L2_M8_60:
 
        COPY1x2
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L2_M8_60
+       bne     .Ldgemm_ncopy_L2_M8_60
 
 
-dgemm_ncopy_L2_M8_END:
+.Ldgemm_ncopy_L2_M8_END:
 
 
 /*********************************************************************************************/
 
-dgemm_ncopy_L1_BEGIN:
+.Ldgemm_ncopy_L1_BEGIN:
 
        tst     N, #1
-       ble     dgemm_ncopy_L999
+       ble     .Ldgemm_ncopy_L999
 
 
-dgemm_ncopy_L1_M8_BEGIN:
+.Ldgemm_ncopy_L1_M8_BEGIN:
 
        mov     A01, A00
 
        asr     I, M, #3                                        // I = M / 8
        cmp     I, #0
-       ble     dgemm_ncopy_L1_M8_40
+       ble     .Ldgemm_ncopy_L1_M8_40
 
-dgemm_ncopy_L1_M8_20:
+.Ldgemm_ncopy_L1_M8_20:
 
        COPY8x1
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L1_M8_20
+       bne     .Ldgemm_ncopy_L1_M8_20
 
 
-dgemm_ncopy_L1_M8_40:
+.Ldgemm_ncopy_L1_M8_40:
 
        and     I, M , #7
        cmp     I, #0
-       ble     dgemm_ncopy_L1_M8_END
+       ble     .Ldgemm_ncopy_L1_M8_END
 
-dgemm_ncopy_L1_M8_60:
+.Ldgemm_ncopy_L1_M8_60:
 
        COPY1x1
 
        subs    I , I , #1
-       bne     dgemm_ncopy_L1_M8_60
+       bne     .Ldgemm_ncopy_L1_M8_60
 
 
-dgemm_ncopy_L1_M8_END:
+.Ldgemm_ncopy_L1_M8_END:
 
-dgemm_ncopy_L999:
+.Ldgemm_ncopy_L999:
 
        mov     x0, #0
        RESTORE_REGS
index 5b2ed43..7c91352 100644 (file)
@@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lsl     M4, M, #5                                       // M4 = M * 4 * SIZE
 
-dgemm_tcopy_L4_BEGIN:
+.Ldgemm_tcopy_L4_BEGIN:
        asr     J, M, #2                                        // J = M / 4
        cmp     J, #0
-       ble     dgemm_tcopy_L2_BEGIN
+       ble     .Ldgemm_tcopy_L2_BEGIN
 
        .align  5
-dgemm_tcopy_L4_M4_BEGIN:
+.Ldgemm_tcopy_L4_M4_BEGIN:
 
        mov     A01, A
        add     A02, A01, LDA
@@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:
 
        asr     I, N, #2                                        // I = N / 4
        cmp     I, #0
-       ble     dgemm_tcopy_L4_M4_40
+       ble     .Ldgemm_tcopy_L4_M4_40
 
        .align  5
-dgemm_tcopy_L4_M4_20:
+.Ldgemm_tcopy_L4_M4_20:
 
        COPY4x4
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L4_M4_20
+       bne     .Ldgemm_tcopy_L4_M4_20
 
 
-dgemm_tcopy_L4_M4_40:
+.Ldgemm_tcopy_L4_M4_40:
 
        tst     N , #2
-       ble     dgemm_tcopy_L4_M4_60
+       ble     .Ldgemm_tcopy_L4_M4_60
 
        COPY2x4
 
 
-dgemm_tcopy_L4_M4_60:
+.Ldgemm_tcopy_L4_M4_60:
 
        tst     N, #1
-       ble     dgemm_tcopy_L4_M4_END
+       ble     .Ldgemm_tcopy_L4_M4_END
 
        COPY1x4
 
 
-dgemm_tcopy_L4_M4_END:
+.Ldgemm_tcopy_L4_M4_END:
 
        subs    J , J, #1                                               // j--
-       bne     dgemm_tcopy_L4_M4_BEGIN
+       bne     .Ldgemm_tcopy_L4_M4_BEGIN
 
 
 
 /*********************************************************************************************/
 
-dgemm_tcopy_L2_BEGIN:
+.Ldgemm_tcopy_L2_BEGIN:
 
        tst     M, #3
-       ble     dgemm_tcopy_L999
+       ble     .Ldgemm_tcopy_L999
 
        tst     M, #2
-       ble     dgemm_tcopy_L1_BEGIN
+       ble     .Ldgemm_tcopy_L1_BEGIN
 
-dgemm_tcopy_L2_M4_BEGIN:
+.Ldgemm_tcopy_L2_M4_BEGIN:
        mov     A01, A
        add     A02, A01, LDA
        add     A, A02, LDA
@@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:
 
        asr     I, N, #2                                        // I = N / 4
        cmp     I, #0
-       ble     dgemm_tcopy_L2_M4_40
+       ble     .Ldgemm_tcopy_L2_M4_40
 
        .align  5
-dgemm_tcopy_L2_M4_20:
+.Ldgemm_tcopy_L2_M4_20:
 
        COPY4x2
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L2_M4_20
+       bne     .Ldgemm_tcopy_L2_M4_20
 
 
-dgemm_tcopy_L2_M4_40:
+.Ldgemm_tcopy_L2_M4_40:
 
        tst     N , #2
-       ble     dgemm_tcopy_L2_M4_60
+       ble     .Ldgemm_tcopy_L2_M4_60
 
        COPY2x2
 
-dgemm_tcopy_L2_M4_60:
+.Ldgemm_tcopy_L2_M4_60:
 
        tst     N , #1
-       ble     dgemm_tcopy_L2_M4_END
+       ble     .Ldgemm_tcopy_L2_M4_END
 
        COPY1x2
 
 
-dgemm_tcopy_L2_M4_END:
+.Ldgemm_tcopy_L2_M4_END:
 
 
 /*********************************************************************************************/
 
-dgemm_tcopy_L1_BEGIN:
+.Ldgemm_tcopy_L1_BEGIN:
 
        tst     M, #1
-       ble     dgemm_tcopy_L999
+       ble     .Ldgemm_tcopy_L999
 
 
-dgemm_tcopy_L1_M4_BEGIN:
+.Ldgemm_tcopy_L1_M4_BEGIN:
 
        mov     A01, A                                          // A01 = A
        mov     B01, B
 
        asr     I, N, #2                                        // I = M / 4
        cmp     I, #0
-       ble     dgemm_tcopy_L1_M4_40
+       ble     .Ldgemm_tcopy_L1_M4_40
 
        .align  5
-dgemm_tcopy_L1_M4_20:
+.Ldgemm_tcopy_L1_M4_20:
 
        COPY4x1
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L1_M4_20
+       bne     .Ldgemm_tcopy_L1_M4_20
 
 
-dgemm_tcopy_L1_M4_40:
+.Ldgemm_tcopy_L1_M4_40:
 
        tst     N , #2
-       ble     dgemm_tcopy_L1_M4_60
+       ble     .Ldgemm_tcopy_L1_M4_60
 
        COPY2x1
 
-dgemm_tcopy_L1_M4_60:
+.Ldgemm_tcopy_L1_M4_60:
 
        tst     N , #1
-       ble     dgemm_tcopy_L1_M4_END
+       ble     .Ldgemm_tcopy_L1_M4_END
 
        COPY1x1
 
 
-dgemm_tcopy_L1_M4_END:
+.Ldgemm_tcopy_L1_M4_END:
 
 
-dgemm_tcopy_L999:
+.Ldgemm_tcopy_L999:
        mov     x0, #0                                          // set return value
        RESTORE_REGS
        ret
index 1c57e30..9ab51ff 100644 (file)
@@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lsl     M8, M, #6                                       // M8 = M * 8 * SIZE
 
-dgemm_tcopy_L8_BEGIN:
+.Ldgemm_tcopy_L8_BEGIN:
        asr     J, M, #3                                        // J = M / 4
        cmp     J, #0
-       ble     dgemm_tcopy_L4_BEGIN
+       ble     .Ldgemm_tcopy_L4_BEGIN
 
        .align  5
-dgemm_tcopy_L8_M8_BEGIN:
+.Ldgemm_tcopy_L8_M8_BEGIN:
 
        mov     A01, A
        add     A02, A01, LDA
@@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:
 
        asr     I, N, #3                                        // I = N / 8
        cmp     I, #0
-       ble     dgemm_tcopy_L8_M8_40
+       ble     .Ldgemm_tcopy_L8_M8_40
 
        .align  5
-dgemm_tcopy_L8_M8_20:
+.Ldgemm_tcopy_L8_M8_20:
 
        COPY8x8
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L8_M8_20
+       bne     .Ldgemm_tcopy_L8_M8_20
 
-dgemm_tcopy_L8_M8_40:
+.Ldgemm_tcopy_L8_M8_40:
        tst     N , #4
-       ble     dgemm_tcopy_L8_M8_60
+       ble     .Ldgemm_tcopy_L8_M8_60
 
        COPY4x8
 
-dgemm_tcopy_L8_M8_60:
+.Ldgemm_tcopy_L8_M8_60:
 
        tst     N , #2
-       ble     dgemm_tcopy_L8_M8_80
+       ble     .Ldgemm_tcopy_L8_M8_80
 
        COPY2x8
 
 
-dgemm_tcopy_L8_M8_80:
+.Ldgemm_tcopy_L8_M8_80:
 
        tst     N, #1
-       ble     dgemm_tcopy_L8_M8_END
+       ble     .Ldgemm_tcopy_L8_M8_END
 
        COPY1x8
 
 
-dgemm_tcopy_L8_M8_END:
+.Ldgemm_tcopy_L8_M8_END:
 
        subs    J , J, #1                                               // j--
-       bne     dgemm_tcopy_L8_M8_BEGIN
+       bne     .Ldgemm_tcopy_L8_M8_BEGIN
 
 /*********************************************************************************************/
 
-dgemm_tcopy_L4_BEGIN:
+.Ldgemm_tcopy_L4_BEGIN:
        tst     M, #7
-       ble     dgemm_tcopy_L999
+       ble     .Ldgemm_tcopy_L999
 
        tst     M, #4
-       ble     dgemm_tcopy_L2_BEGIN
+       ble     .Ldgemm_tcopy_L2_BEGIN
 
-dgemm_tcopy_L4_M8_BEGIN:
+.Ldgemm_tcopy_L4_M8_BEGIN:
 
        mov     A01, A
        add     A02, A01, LDA
@@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:
 
        asr     I, N, #3                                        // I = N / 8
        cmp     I, #0
-       ble     dgemm_tcopy_L4_M8_40
+       ble     .Ldgemm_tcopy_L4_M8_40
 
        .align  5
-dgemm_tcopy_L4_M8_20:
+.Ldgemm_tcopy_L4_M8_20:
 
        COPY8x4
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L4_M8_20
+       bne     .Ldgemm_tcopy_L4_M8_20
 
-dgemm_tcopy_L4_M8_40:
+.Ldgemm_tcopy_L4_M8_40:
        tst     N , #4
-       ble     dgemm_tcopy_L4_M8_60
+       ble     .Ldgemm_tcopy_L4_M8_60
 
        COPY4x4
 
-dgemm_tcopy_L4_M8_60:
+.Ldgemm_tcopy_L4_M8_60:
 
        tst     N , #2
-       ble     dgemm_tcopy_L4_M8_80
+       ble     .Ldgemm_tcopy_L4_M8_80
 
        COPY2x4
 
 
-dgemm_tcopy_L4_M8_80:
+.Ldgemm_tcopy_L4_M8_80:
 
        tst     N, #1
-       ble     dgemm_tcopy_L4_M8_END
+       ble     .Ldgemm_tcopy_L4_M8_END
 
        COPY1x4
 
 
-dgemm_tcopy_L4_M8_END:
+.Ldgemm_tcopy_L4_M8_END:
 
 /*********************************************************************************************/
 
-dgemm_tcopy_L2_BEGIN:
+.Ldgemm_tcopy_L2_BEGIN:
 
        tst     M, #3
-       ble     dgemm_tcopy_L999
+       ble     .Ldgemm_tcopy_L999
 
        tst     M, #2
-       ble     dgemm_tcopy_L1_BEGIN
+       ble     .Ldgemm_tcopy_L1_BEGIN
 
-dgemm_tcopy_L2_M8_BEGIN:
+.Ldgemm_tcopy_L2_M8_BEGIN:
        mov     A01, A
        add     A02, A01, LDA
        add     A, A02, LDA
@@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:
 
        asr     I, N, #3                                        // I = N / 8
        cmp     I, #0
-       ble     dgemm_tcopy_L2_M8_40
+       ble     .Ldgemm_tcopy_L2_M8_40
 
        .align  5
-dgemm_tcopy_L2_M8_20:
+.Ldgemm_tcopy_L2_M8_20:
 
        COPY8x2
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L2_M8_20
+       bne     .Ldgemm_tcopy_L2_M8_20
 
-dgemm_tcopy_L2_M8_40:
+.Ldgemm_tcopy_L2_M8_40:
        tst     N , #4
-       ble     dgemm_tcopy_L2_M8_60
+       ble     .Ldgemm_tcopy_L2_M8_60
 
        COPY4x2
 
-dgemm_tcopy_L2_M8_60:
+.Ldgemm_tcopy_L2_M8_60:
 
        tst     N , #2
-       ble     dgemm_tcopy_L2_M8_80
+       ble     .Ldgemm_tcopy_L2_M8_80
 
        COPY2x2
 
-dgemm_tcopy_L2_M8_80:
+.Ldgemm_tcopy_L2_M8_80:
 
        tst     N , #1
-       ble     dgemm_tcopy_L2_M8_END
+       ble     .Ldgemm_tcopy_L2_M8_END
 
        COPY1x2
 
 
-dgemm_tcopy_L2_M8_END:
+.Ldgemm_tcopy_L2_M8_END:
 
 
 /*********************************************************************************************/
 
-dgemm_tcopy_L1_BEGIN:
+.Ldgemm_tcopy_L1_BEGIN:
 
        tst     M, #1
-       ble     dgemm_tcopy_L999
+       ble     .Ldgemm_tcopy_L999
 
 
-dgemm_tcopy_L1_M8_BEGIN:
+.Ldgemm_tcopy_L1_M8_BEGIN:
 
        mov     A01, A                                          // A01 = A
        mov     B01, B
 
        asr     I, N, #3                                        // I = M / 8
        cmp     I, #0
-       ble     dgemm_tcopy_L1_M8_40
+       ble     .Ldgemm_tcopy_L1_M8_40
 
        .align  5
-dgemm_tcopy_L1_M8_20:
+.Ldgemm_tcopy_L1_M8_20:
 
        COPY8x1
 
        subs    I , I , #1
-       bne     dgemm_tcopy_L1_M8_20
+       bne     .Ldgemm_tcopy_L1_M8_20
 
-dgemm_tcopy_L1_M8_40:
+.Ldgemm_tcopy_L1_M8_40:
        tst     N , #4
-       ble     dgemm_tcopy_L1_M8_60
+       ble     .Ldgemm_tcopy_L1_M8_60
 
        COPY4x1
 
-dgemm_tcopy_L1_M8_60:
+.Ldgemm_tcopy_L1_M8_60:
 
        tst     N , #2
-       ble     dgemm_tcopy_L1_M8_80
+       ble     .Ldgemm_tcopy_L1_M8_80
 
        COPY2x1
 
-dgemm_tcopy_L1_M8_80:
+.Ldgemm_tcopy_L1_M8_80:
 
        tst     N , #1
-       ble     dgemm_tcopy_L1_M8_END
+       ble     .Ldgemm_tcopy_L1_M8_END
 
        COPY1x1
 
 
-dgemm_tcopy_L1_M8_END:
+.Ldgemm_tcopy_L1_M8_END:
 
 
-dgemm_tcopy_L999:
+.Ldgemm_tcopy_L999:
        mov     x0, #0                                          // set return value
        RESTORE_REGS
        ret
index 35d4779..a1a5bf2 100644 (file)
@@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        cmp     N, xzr
-       ble     dot_kernel_L999
+       ble     .Ldot_kernel_L999
 
        cmp     INC_X, #1
-       bne     dot_kernel_S_BEGIN
+       bne     .Ldot_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     dot_kernel_S_BEGIN
+       bne     .Ldot_kernel_S_BEGIN
 
-dot_kernel_F_BEGIN:
+.Ldot_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     dot_kernel_F1
+       beq     .Ldot_kernel_F1
 
-dot_kernel_F4:
+.Ldot_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     dot_kernel_F4
+       bne     .Ldot_kernel_F4
 
        KERNEL_F4_FINALIZE
 
-dot_kernel_F1:
+.Ldot_kernel_F1:
 
        ands    I, N, #3
-       ble     dot_kernel_L999
+       ble     .Ldot_kernel_L999
 
-dot_kernel_F10:
+.Ldot_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     dot_kernel_F10
+        bne     .Ldot_kernel_F10
 
        ret
 
-dot_kernel_S_BEGIN:
+.Ldot_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     dot_kernel_S1
+       ble     .Ldot_kernel_S1
 
-dot_kernel_S4:
+.Ldot_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -206,21 +206,21 @@ dot_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     dot_kernel_S4
+       bne     .Ldot_kernel_S4
 
-dot_kernel_S1:
+.Ldot_kernel_S1:
 
        ands    I, N, #3
-       ble     dot_kernel_L999
+       ble     .Ldot_kernel_L999
 
-dot_kernel_S10:
+.Ldot_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     dot_kernel_S10
+        bne     .Ldot_kernel_S10
 
-dot_kernel_L999:
+.Ldot_kernel_L999:
 
        ret
 
index 34fb8c2..b528aeb 100644 (file)
@@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     dtrmm_kernel_L2_BEGIN
+       ble     .Ldtrmm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-dtrmm_kernel_L4_BEGIN:
+.Ldtrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
 
@@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-dtrmm_kernel_L4_M4_BEGIN:
+.Ldtrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dtrmm_kernel_L4_M2_BEGIN
+       ble     .Ldtrmm_kernel_L4_M2_BEGIN
 
-dtrmm_kernel_L4_M4_20:
+.Ldtrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dtrmm_kernel_L4_M4_32
+       blt     .Ldtrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     dtrmm_kernel_L4_M4_22a
+       ble     .Ldtrmm_kernel_L4_M4_22a
        .align 5
 
-dtrmm_kernel_L4_M4_22:
+.Ldtrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M4_22
+       bgt     .Ldtrmm_kernel_L4_M4_22
 
 
-dtrmm_kernel_L4_M4_22a:
+.Ldtrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        dtrmm_kernel_L4_M4_44
+       b        .Ldtrmm_kernel_L4_M4_44
 
-dtrmm_kernel_L4_M4_32:
+.Ldtrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     dtrmm_kernel_L4_M4_40
+       ble     .Ldtrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
 
        KERNEL4x4_E
 
-       b       dtrmm_kernel_L4_M4_44
+       b       .Ldtrmm_kernel_L4_M4_44
 
 
-dtrmm_kernel_L4_M4_40:
+.Ldtrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-dtrmm_kernel_L4_M4_44:
+.Ldtrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #1
-       ble     dtrmm_kernel_L4_M4_100
+       ble     .Ldtrmm_kernel_L4_M4_100
 
-dtrmm_kernel_L4_M4_46:
+.Ldtrmm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-dtrmm_kernel_L4_M4_100:
+.Ldtrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L4_M4_END:
+.Ldtrmm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     dtrmm_kernel_L4_M4_20
+       bne     .Ldtrmm_kernel_L4_M4_20
 
-dtrmm_kernel_L4_M2_BEGIN:
+.Ldtrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L4_M1_BEGIN
+       ble     .Ldtrmm_kernel_L4_M1_BEGIN
 
-dtrmm_kernel_L4_M2_20:
+.Ldtrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M2_40
+       ble     .Ldtrmm_kernel_L4_M2_40
 
-dtrmm_kernel_L4_M2_22:
+.Ldtrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M2_22
+       bgt     .Ldtrmm_kernel_L4_M2_22
 
 
-dtrmm_kernel_L4_M2_40:
+.Ldtrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M2_100
+       ble     .Ldtrmm_kernel_L4_M2_100
 
-dtrmm_kernel_L4_M2_42:
+.Ldtrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M2_42
+       bgt     .Ldtrmm_kernel_L4_M2_42
 
-dtrmm_kernel_L4_M2_100:
+.Ldtrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L4_M2_END:
+.Ldtrmm_kernel_L4_M2_END:
 
 
-dtrmm_kernel_L4_M1_BEGIN:
+.Ldtrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
-dtrmm_kernel_L4_M1_20:
+.Ldtrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M1_40
+       ble     .Ldtrmm_kernel_L4_M1_40
 
-dtrmm_kernel_L4_M1_22:
+.Ldtrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M1_22
+       bgt     .Ldtrmm_kernel_L4_M1_22
 
 
-dtrmm_kernel_L4_M1_40:
+.Ldtrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M1_100
+       ble     .Ldtrmm_kernel_L4_M1_100
 
-dtrmm_kernel_L4_M1_42:
+.Ldtrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M1_42
+       bgt     .Ldtrmm_kernel_L4_M1_42
 
-dtrmm_kernel_L4_M1_100:
+.Ldtrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-dtrmm_kernel_L4_END:
+.Ldtrmm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
@@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dtrmm_kernel_L4_BEGIN
+       bgt     .Ldtrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-dtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dtrmm_kernel_L999   // error, N was less than 4?
+       ble     .Ldtrmm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dtrmm_kernel_L1_BEGIN
+       ble     .Ldtrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
        mov     pA, origPA                      // pA = A
 
 
-dtrmm_kernel_L2_M4_BEGIN:
+.Ldtrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     dtrmm_kernel_L2_M2_BEGIN
+       ble     .Ldtrmm_kernel_L2_M2_BEGIN
 
-dtrmm_kernel_L2_M4_20:
+.Ldtrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dtrmm_kernel_L2_M4_40
+       ble     .Ldtrmm_kernel_L2_M4_40
        .align 5
 
-dtrmm_kernel_L2_M4_22:
+.Ldtrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M4_22
+       bgt     .Ldtrmm_kernel_L2_M4_22
 
 
-dtrmm_kernel_L2_M4_40:
+.Ldtrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M4_100
+       ble     .Ldtrmm_kernel_L2_M4_100
 
-dtrmm_kernel_L2_M4_42:
+.Ldtrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M4_42
+       bgt     .Ldtrmm_kernel_L2_M4_42
 
-dtrmm_kernel_L2_M4_100:
+.Ldtrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L2_M4_END:
+.Ldtrmm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dtrmm_kernel_L2_M4_20
+       bgt     .Ldtrmm_kernel_L2_M4_20
 
 
-dtrmm_kernel_L2_M2_BEGIN:
+.Ldtrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L2_M1_BEGIN
+       ble     .Ldtrmm_kernel_L2_M1_BEGIN
 
-dtrmm_kernel_L2_M2_20:
+.Ldtrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dtrmm_kernel_L2_M2_40
+       ble     .Ldtrmm_kernel_L2_M2_40
 
-dtrmm_kernel_L2_M2_22:
+.Ldtrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M2_22
+       bgt     .Ldtrmm_kernel_L2_M2_22
 
 
-dtrmm_kernel_L2_M2_40:
+.Ldtrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M2_100
+       ble     .Ldtrmm_kernel_L2_M2_100
 
-dtrmm_kernel_L2_M2_42:
+.Ldtrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M2_42
+       bgt     .Ldtrmm_kernel_L2_M2_42
 
-dtrmm_kernel_L2_M2_100:
+.Ldtrmm_kernel_L2_M2_100:
 
        SAVE2x2
 
@@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L2_M2_END:
+.Ldtrmm_kernel_L2_M2_END:
 
 
-dtrmm_kernel_L2_M1_BEGIN:
+.Ldtrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
-dtrmm_kernel_L2_M1_20:
+.Ldtrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dtrmm_kernel_L2_M1_40
+       ble     .Ldtrmm_kernel_L2_M1_40
 
-dtrmm_kernel_L2_M1_22:
+.Ldtrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M1_22
+       bgt     .Ldtrmm_kernel_L2_M1_22
 
 
-dtrmm_kernel_L2_M1_40:
+.Ldtrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M1_100
+       ble     .Ldtrmm_kernel_L2_M1_100
 
-dtrmm_kernel_L2_M1_42:
+.Ldtrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M1_42
+       bgt     .Ldtrmm_kernel_L2_M1_42
 
-dtrmm_kernel_L2_M1_100:
+.Ldtrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-dtrmm_kernel_L2_END:
+.Ldtrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:
 
 /******************************************************************************/
 
-dtrmm_kernel_L1_BEGIN:
+.Ldtrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dtrmm_kernel_L999 // done
+       ble     .Ldtrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:
 
        mov     pA, origPA                      // pA = A
 
-dtrmm_kernel_L1_M4_BEGIN:
+.Ldtrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dtrmm_kernel_L1_M2_BEGIN
+       ble     .Ldtrmm_kernel_L1_M2_BEGIN
 
-dtrmm_kernel_L1_M4_20:
+.Ldtrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M4_40
+       ble     .Ldtrmm_kernel_L1_M4_40
        .align 5
 
-dtrmm_kernel_L1_M4_22:
+.Ldtrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M4_22
+       bgt     .Ldtrmm_kernel_L1_M4_22
 
 
-dtrmm_kernel_L1_M4_40:
+.Ldtrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M4_100
+       ble     .Ldtrmm_kernel_L1_M4_100
 
-dtrmm_kernel_L1_M4_42:
+.Ldtrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M4_42
+       bgt     .Ldtrmm_kernel_L1_M4_42
 
-dtrmm_kernel_L1_M4_100:
+.Ldtrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L1_M4_END:
+.Ldtrmm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dtrmm_kernel_L1_M4_20
+       bgt     .Ldtrmm_kernel_L1_M4_20
 
 
-dtrmm_kernel_L1_M2_BEGIN:
+.Ldtrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L1_M1_BEGIN
+       ble     .Ldtrmm_kernel_L1_M1_BEGIN
 
-dtrmm_kernel_L1_M2_20:
+.Ldtrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M2_40
+       ble     .Ldtrmm_kernel_L1_M2_40
 
-dtrmm_kernel_L1_M2_22:
+.Ldtrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M2_22
+       bgt     .Ldtrmm_kernel_L1_M2_22
 
 
-dtrmm_kernel_L1_M2_40:
+.Ldtrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M2_100
+       ble     .Ldtrmm_kernel_L1_M2_100
 
-dtrmm_kernel_L1_M2_42:
+.Ldtrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M2_42
+       bgt     .Ldtrmm_kernel_L1_M2_42
 
-dtrmm_kernel_L1_M2_100:
+.Ldtrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L1_M2_END:
+.Ldtrmm_kernel_L1_M2_END:
 
 
-dtrmm_kernel_L1_M1_BEGIN:
+.Ldtrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
-dtrmm_kernel_L1_M1_20:
+.Ldtrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M1_40
+       ble     .Ldtrmm_kernel_L1_M1_40
 
-dtrmm_kernel_L1_M1_22:
+.Ldtrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M1_22
+       bgt     .Ldtrmm_kernel_L1_M1_22
 
 
-dtrmm_kernel_L1_M1_40:
+.Ldtrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M1_100
+       ble     .Ldtrmm_kernel_L1_M1_100
 
-dtrmm_kernel_L1_M1_42:
+.Ldtrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M1_42
+       bgt     .Ldtrmm_kernel_L1_M1_42
 
-dtrmm_kernel_L1_M1_100:
+.Ldtrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dtrmm_kernel_L1_END:
+.Ldtrmm_kernel_L1_END:
 
 
-dtrmm_kernel_L999:
+.Ldtrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 4aecf28..47956de 100644 (file)
@@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #3          // J = J / 8
        cmp     counterJ, #0
-       ble     dtrmm_kernel_L4_BEGIN
+       ble     .Ldtrmm_kernel_L4_BEGIN
 
 /******************************************************************************/
 
-dtrmm_kernel_L8_BEGIN:
+.Ldtrmm_kernel_L8_BEGIN:
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #3
@@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-dtrmm_kernel_L8_M4_BEGIN:
+.Ldtrmm_kernel_L8_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dtrmm_kernel_L8_M2_BEGIN
+       ble     .Ldtrmm_kernel_L8_M2_BEGIN
 
-dtrmm_kernel_L8_M4_20:
+.Ldtrmm_kernel_L8_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:
 
        asr     counterL, tempK, #1             // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dtrmm_kernel_L8_M4_32
+       blt     .Ldtrmm_kernel_L8_M4_32
 
        KERNEL4x8_I                             // do one in the K
        KERNEL4x8_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     dtrmm_kernel_L8_M4_22a
+       ble     .Ldtrmm_kernel_L8_M4_22a
        .align 5
 
-dtrmm_kernel_L8_M4_22:
+.Ldtrmm_kernel_L8_M4_22:
 
        KERNEL4x8_M1
        KERNEL4x8_M2
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L8_M4_22
+       bgt     .Ldtrmm_kernel_L8_M4_22
 
 
-dtrmm_kernel_L8_M4_22a:
+.Ldtrmm_kernel_L8_M4_22a:
 
        KERNEL4x8_M1
        KERNEL4x8_E
 
-       b        dtrmm_kernel_L8_M4_44
+       b        .Ldtrmm_kernel_L8_M4_44
 
-dtrmm_kernel_L8_M4_32:
+.Ldtrmm_kernel_L8_M4_32:
 
        tst     counterL, #1
-       ble     dtrmm_kernel_L8_M4_40
+       ble     .Ldtrmm_kernel_L8_M4_40
 
        KERNEL4x8_I
 
        KERNEL4x8_E
 
-       b       dtrmm_kernel_L8_M4_44
+       b       .Ldtrmm_kernel_L8_M4_44
 
 
-dtrmm_kernel_L8_M4_40:
+.Ldtrmm_kernel_L8_M4_40:
 
        INIT4x8
 
-dtrmm_kernel_L8_M4_44:
+.Ldtrmm_kernel_L8_M4_44:
 
        ands    counterL, tempK, #1
-       ble     dtrmm_kernel_L8_M4_100
+       ble     .Ldtrmm_kernel_L8_M4_100
 
-dtrmm_kernel_L8_M4_46:
+.Ldtrmm_kernel_L8_M4_46:
 
        KERNEL4x8_SUB
 
-dtrmm_kernel_L8_M4_100:
+.Ldtrmm_kernel_L8_M4_100:
 
        SAVE4x8
 
@@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L8_M4_END:
+.Ldtrmm_kernel_L8_M4_END:
        subs    counterI, counterI, #1
-       bne     dtrmm_kernel_L8_M4_20
+       bne     .Ldtrmm_kernel_L8_M4_20
 
-dtrmm_kernel_L8_M2_BEGIN:
+.Ldtrmm_kernel_L8_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L8_END
+       ble     .Ldtrmm_kernel_L8_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L8_M1_BEGIN
+       ble     .Ldtrmm_kernel_L8_M1_BEGIN
 
-dtrmm_kernel_L8_M2_20:
+.Ldtrmm_kernel_L8_M2_20:
 
        INIT2x8
 
@@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:
 
        asr     counterL, tempK, #3             // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L8_M2_40
+       ble     .Ldtrmm_kernel_L8_M2_40
 
-dtrmm_kernel_L8_M2_22:
+.Ldtrmm_kernel_L8_M2_22:
 
        KERNEL2x8_SUB
        KERNEL2x8_SUB
@@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L8_M2_22
+       bgt     .Ldtrmm_kernel_L8_M2_22
 
 
-dtrmm_kernel_L8_M2_40:
+.Ldtrmm_kernel_L8_M2_40:
 
        ands    counterL, tempK, #7             // counterL = counterL % 8
-       ble     dtrmm_kernel_L8_M2_100
+       ble     .Ldtrmm_kernel_L8_M2_100
 
-dtrmm_kernel_L8_M2_42:
+.Ldtrmm_kernel_L8_M2_42:
 
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L8_M2_42
+       bgt     .Ldtrmm_kernel_L8_M2_42
 
-dtrmm_kernel_L8_M2_100:
+.Ldtrmm_kernel_L8_M2_100:
 
        SAVE2x8
 
@@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L8_M2_END:
+.Ldtrmm_kernel_L8_M2_END:
 
 
-dtrmm_kernel_L8_M1_BEGIN:
+.Ldtrmm_kernel_L8_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L8_END
+       ble     .Ldtrmm_kernel_L8_END
 
-dtrmm_kernel_L8_M1_20:
+.Ldtrmm_kernel_L8_M1_20:
 
        INIT1x8
 
@@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:
 
        asr     counterL, tempK, #3             // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L8_M1_40
+       ble     .Ldtrmm_kernel_L8_M1_40
 
-dtrmm_kernel_L8_M1_22:
+.Ldtrmm_kernel_L8_M1_22:
        KERNEL1x8_SUB
        KERNEL1x8_SUB
        KERNEL1x8_SUB
@@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L8_M1_22
+       bgt     .Ldtrmm_kernel_L8_M1_22
 
 
-dtrmm_kernel_L8_M1_40:
+.Ldtrmm_kernel_L8_M1_40:
 
        ands    counterL, tempK, #7             // counterL = counterL % 8
-       ble     dtrmm_kernel_L8_M1_100
+       ble     .Ldtrmm_kernel_L8_M1_100
 
-dtrmm_kernel_L8_M1_42:
+.Ldtrmm_kernel_L8_M1_42:
 
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L8_M1_42
+       bgt     .Ldtrmm_kernel_L8_M1_42
 
-dtrmm_kernel_L8_M1_100:
+.Ldtrmm_kernel_L8_M1_100:
 
        SAVE1x8
 
@@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-dtrmm_kernel_L8_END:
+.Ldtrmm_kernel_L8_END:
 
        lsl     temp, origK, #6
        add     origPB, origPB, temp            // B = B + K * 8 * 8
@@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dtrmm_kernel_L8_BEGIN
+       bgt     .Ldtrmm_kernel_L8_BEGIN
 
 
 /******************************************************************************/
 
-dtrmm_kernel_L4_BEGIN:
+.Ldtrmm_kernel_L4_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #7
-       ble     dtrmm_kernel_L999
+       ble     .Ldtrmm_kernel_L999
 
        tst     counterJ , #4
-       ble     dtrmm_kernel_L2_BEGIN
+       ble     .Ldtrmm_kernel_L2_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
@@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-dtrmm_kernel_L4_M4_BEGIN:
+.Ldtrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dtrmm_kernel_L4_M2_BEGIN
+       ble     .Ldtrmm_kernel_L4_M2_BEGIN
 
-dtrmm_kernel_L4_M4_20:
+.Ldtrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:
 
        asr     counterL, tempK, #1             // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dtrmm_kernel_L4_M4_32
+       blt     .Ldtrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     dtrmm_kernel_L4_M4_22a
+       ble     .Ldtrmm_kernel_L4_M4_22a
        .align 5
 
-dtrmm_kernel_L4_M4_22:
+.Ldtrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M4_22
+       bgt     .Ldtrmm_kernel_L4_M4_22
 
 
-dtrmm_kernel_L4_M4_22a:
+.Ldtrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        dtrmm_kernel_L4_M4_44
+       b        .Ldtrmm_kernel_L4_M4_44
 
-dtrmm_kernel_L4_M4_32:
+.Ldtrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     dtrmm_kernel_L4_M4_40
+       ble     .Ldtrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
 
        KERNEL4x4_E
 
-       b       dtrmm_kernel_L4_M4_44
+       b       .Ldtrmm_kernel_L4_M4_44
 
 
-dtrmm_kernel_L4_M4_40:
+.Ldtrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-dtrmm_kernel_L4_M4_44:
+.Ldtrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #1
-       ble     dtrmm_kernel_L4_M4_100
+       ble     .Ldtrmm_kernel_L4_M4_100
 
-dtrmm_kernel_L4_M4_46:
+.Ldtrmm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-dtrmm_kernel_L4_M4_100:
+.Ldtrmm_kernel_L4_M4_100:
 
        SAVE4x4
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L4_M4_END:
+.Ldtrmm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     dtrmm_kernel_L4_M4_20
+       bne     .Ldtrmm_kernel_L4_M4_20
 
-dtrmm_kernel_L4_M2_BEGIN:
+.Ldtrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L4_M1_BEGIN
+       ble     .Ldtrmm_kernel_L4_M1_BEGIN
 
-dtrmm_kernel_L4_M2_20:
+.Ldtrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M2_40
+       ble     .Ldtrmm_kernel_L4_M2_40
 
-dtrmm_kernel_L4_M2_22:
+.Ldtrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M2_22
+       bgt     .Ldtrmm_kernel_L4_M2_22
 
 
-dtrmm_kernel_L4_M2_40:
+.Ldtrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M2_100
+       ble     .Ldtrmm_kernel_L4_M2_100
 
-dtrmm_kernel_L4_M2_42:
+.Ldtrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M2_42
+       bgt     .Ldtrmm_kernel_L4_M2_42
 
-dtrmm_kernel_L4_M2_100:
+.Ldtrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-dtrmm_kernel_L4_M2_END:
+.Ldtrmm_kernel_L4_M2_END:
 
 
-dtrmm_kernel_L4_M1_BEGIN:
+.Ldtrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
-dtrmm_kernel_L4_M1_20:
+.Ldtrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M1_40
+       ble     .Ldtrmm_kernel_L4_M1_40
 
-dtrmm_kernel_L4_M1_22:
+.Ldtrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M1_22
+       bgt     .Ldtrmm_kernel_L4_M1_22
 
 
-dtrmm_kernel_L4_M1_40:
+.Ldtrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M1_100
+       ble     .Ldtrmm_kernel_L4_M1_100
 
-dtrmm_kernel_L4_M1_42:
+.Ldtrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M1_42
+       bgt     .Ldtrmm_kernel_L4_M1_42
 
-dtrmm_kernel_L4_M1_100:
+.Ldtrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #1
 #endif
-dtrmm_kernel_L4_END:
+.Ldtrmm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
@@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:
 
 /******************************************************************************/
 
-dtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dtrmm_kernel_L999   // error, N was less than 4?
+       ble     .Ldtrmm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dtrmm_kernel_L1_BEGIN
+       ble     .Ldtrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
        mov     pA, origPA                      // pA = A
 
 
-dtrmm_kernel_L2_M4_BEGIN:
+.Ldtrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     dtrmm_kernel_L2_M2_BEGIN
+       ble     .Ldtrmm_kernel_L2_M2_BEGIN
 
-dtrmm_kernel_L2_M4_20:
+.Ldtrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dtrmm_kernel_L2_M4_40
+       ble     .Ldtrmm_kernel_L2_M4_40
        .align 5
 
-dtrmm_kernel_L2_M4_22:
+.Ldtrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M4_22
+       bgt     .Ldtrmm_kernel_L2_M4_22
 
 
-dtrmm_kernel_L2_M4_40:
+.Ldtrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M4_100
+       ble     .Ldtrmm_kernel_L2_M4_100
 
-dtrmm_kernel_L2_M4_42:
+.Ldtrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M4_42
+       bgt     .Ldtrmm_kernel_L2_M4_42
 
-dtrmm_kernel_L2_M4_100:
+.Ldtrmm_kernel_L2_M4_100:
 
        SAVE4x2
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L2_M4_END:
+.Ldtrmm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dtrmm_kernel_L2_M4_20
+       bgt     .Ldtrmm_kernel_L2_M4_20
 
 
-dtrmm_kernel_L2_M2_BEGIN:
+.Ldtrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L2_M1_BEGIN
+       ble     .Ldtrmm_kernel_L2_M1_BEGIN
 
-dtrmm_kernel_L2_M2_20:
+.Ldtrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dtrmm_kernel_L2_M2_40
+       ble     .Ldtrmm_kernel_L2_M2_40
 
-dtrmm_kernel_L2_M2_22:
+.Ldtrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M2_22
+       bgt     .Ldtrmm_kernel_L2_M2_22
 
 
-dtrmm_kernel_L2_M2_40:
+.Ldtrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M2_100
+       ble     .Ldtrmm_kernel_L2_M2_100
 
-dtrmm_kernel_L2_M2_42:
+.Ldtrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M2_42
+       bgt     .Ldtrmm_kernel_L2_M2_42
 
-dtrmm_kernel_L2_M2_100:
+.Ldtrmm_kernel_L2_M2_100:
 
        SAVE2x2
 
@@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-dtrmm_kernel_L2_M2_END:
+.Ldtrmm_kernel_L2_M2_END:
 
 
-dtrmm_kernel_L2_M1_BEGIN:
+.Ldtrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
-dtrmm_kernel_L2_M1_20:
+.Ldtrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dtrmm_kernel_L2_M1_40
+       ble     .Ldtrmm_kernel_L2_M1_40
 
-dtrmm_kernel_L2_M1_22:
+.Ldtrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M1_22
+       bgt     .Ldtrmm_kernel_L2_M1_22
 
 
-dtrmm_kernel_L2_M1_40:
+.Ldtrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M1_100
+       ble     .Ldtrmm_kernel_L2_M1_100
 
-dtrmm_kernel_L2_M1_42:
+.Ldtrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M1_42
+       bgt     .Ldtrmm_kernel_L2_M1_42
 
-dtrmm_kernel_L2_M1_100:
+.Ldtrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #1
 #endif
-dtrmm_kernel_L2_END:
+.Ldtrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:
 
 /******************************************************************************/
 
-dtrmm_kernel_L1_BEGIN:
+.Ldtrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dtrmm_kernel_L999 // done
+       ble     .Ldtrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
 #endif
        mov     pA, origPA                      // pA = A
 
-dtrmm_kernel_L1_M4_BEGIN:
+.Ldtrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     dtrmm_kernel_L1_M2_BEGIN
+       ble     .Ldtrmm_kernel_L1_M2_BEGIN
 
-dtrmm_kernel_L1_M4_20:
+.Ldtrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M4_40
+       ble     .Ldtrmm_kernel_L1_M4_40
        .align 5
 
-dtrmm_kernel_L1_M4_22:
+.Ldtrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M4_22
+       bgt     .Ldtrmm_kernel_L1_M4_22
 
 
-dtrmm_kernel_L1_M4_40:
+.Ldtrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M4_100
+       ble     .Ldtrmm_kernel_L1_M4_100
 
-dtrmm_kernel_L1_M4_42:
+.Ldtrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M4_42
+       bgt     .Ldtrmm_kernel_L1_M4_42
 
-dtrmm_kernel_L1_M4_100:
+.Ldtrmm_kernel_L1_M4_100:
 
        SAVE4x1
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L1_M4_END:
+.Ldtrmm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     dtrmm_kernel_L1_M4_20
+       bgt     .Ldtrmm_kernel_L1_M4_20
 
 
-dtrmm_kernel_L1_M2_BEGIN:
+.Ldtrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L1_M1_BEGIN
+       ble     .Ldtrmm_kernel_L1_M1_BEGIN
 
-dtrmm_kernel_L1_M2_20:
+.Ldtrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M2_40
+       ble     .Ldtrmm_kernel_L1_M2_40
 
-dtrmm_kernel_L1_M2_22:
+.Ldtrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M2_22
+       bgt     .Ldtrmm_kernel_L1_M2_22
 
 
-dtrmm_kernel_L1_M2_40:
+.Ldtrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M2_100
+       ble     .Ldtrmm_kernel_L1_M2_100
 
-dtrmm_kernel_L1_M2_42:
+.Ldtrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M2_42
+       bgt     .Ldtrmm_kernel_L1_M2_42
 
-dtrmm_kernel_L1_M2_100:
+.Ldtrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-dtrmm_kernel_L1_M2_END:
+.Ldtrmm_kernel_L1_M2_END:
 
 
-dtrmm_kernel_L1_M1_BEGIN:
+.Ldtrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
-dtrmm_kernel_L1_M1_20:
+.Ldtrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M1_40
+       ble     .Ldtrmm_kernel_L1_M1_40
 
-dtrmm_kernel_L1_M1_22:
+.Ldtrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M1_22
+       bgt     .Ldtrmm_kernel_L1_M1_22
 
 
-dtrmm_kernel_L1_M1_40:
+.Ldtrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M1_100
+       ble     .Ldtrmm_kernel_L1_M1_100
 
-dtrmm_kernel_L1_M1_42:
+.Ldtrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M1_42
+       bgt     .Ldtrmm_kernel_L1_M1_42
 
-dtrmm_kernel_L1_M1_100:
+.Ldtrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dtrmm_kernel_L1_END:
+.Ldtrmm_kernel_L1_END:
 
 
-dtrmm_kernel_L999:
+.Ldtrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 2b81737..0ac5a5f 100644 (file)
@@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     dtrmm_kernel_L2_BEGIN
+       ble     .Ldtrmm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-dtrmm_kernel_L4_BEGIN:
+.Ldtrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
 #endif
        mov     pA, origPA                      // pA = start of A array
 
-dtrmm_kernel_L4_M8_BEGIN:
+.Ldtrmm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dtrmm_kernel_L4_M4_BEGIN
+       ble     .Ldtrmm_kernel_L4_M4_BEGIN
 
        .align 5
-dtrmm_kernel_L4_M8_20:
+.Ldtrmm_kernel_L4_M8_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:
 
        asr     counterL , tempK, #3            // L = K / 8
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     dtrmm_kernel_L4_M8_32
+       blt     .Ldtrmm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
@@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     dtrmm_kernel_L4_M8_22a
+       ble     .Ldtrmm_kernel_L4_M8_22a
 
        .align 5
-dtrmm_kernel_L4_M8_22:
+.Ldtrmm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M8_22
+       bgt     .Ldtrmm_kernel_L4_M8_22
 
        .align 5
-dtrmm_kernel_L4_M8_22a:
+.Ldtrmm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
@@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        dtrmm_kernel_L4_M8_44
+       b        .Ldtrmm_kernel_L4_M8_44
 
        .align 5
-dtrmm_kernel_L4_M8_32:
+.Ldtrmm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     dtrmm_kernel_L4_M8_40
+       ble     .Ldtrmm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_M2
@@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b       dtrmm_kernel_L4_M8_44
+       b       .Ldtrmm_kernel_L4_M8_44
 
-dtrmm_kernel_L4_M8_40:
+.Ldtrmm_kernel_L4_M8_40:
 
        INIT8x4
 
-dtrmm_kernel_L4_M8_44:
+.Ldtrmm_kernel_L4_M8_44:
 
        ands    counterL , tempK, #7
-       ble     dtrmm_kernel_L4_M8_100
+       ble     .Ldtrmm_kernel_L4_M8_100
 
        .align 5
-dtrmm_kernel_L4_M8_46:
+.Ldtrmm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bne     dtrmm_kernel_L4_M8_46
+       bne     .Ldtrmm_kernel_L4_M8_46
 
-dtrmm_kernel_L4_M8_100:
+.Ldtrmm_kernel_L4_M8_100:
 
        SAVE8x4
 
@@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
-dtrmm_kernel_L4_M8_END:
+.Ldtrmm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     dtrmm_kernel_L4_M8_20
+       bne     .Ldtrmm_kernel_L4_M8_20
 
-dtrmm_kernel_L4_M4_BEGIN:
+.Ldtrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
        tst     counterI, #4
-       ble     dtrmm_kernel_L4_M2_BEGIN
+       ble     .Ldtrmm_kernel_L4_M2_BEGIN
 
-dtrmm_kernel_L4_M4_20:
+.Ldtrmm_kernel_L4_M4_20:
 
        INIT4x4
 
@@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M4_40
+       ble     .Ldtrmm_kernel_L4_M4_40
 
-dtrmm_kernel_L4_M4_22:
+.Ldtrmm_kernel_L4_M4_22:
 
        KERNEL4x4_SUB
        KERNEL4x4_SUB
@@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M4_22
+       bgt     .Ldtrmm_kernel_L4_M4_22
 
 
-dtrmm_kernel_L4_M4_40:
+.Ldtrmm_kernel_L4_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M4_100
+       ble     .Ldtrmm_kernel_L4_M4_100
 
-dtrmm_kernel_L4_M4_42:
+.Ldtrmm_kernel_L4_M4_42:
 
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M4_42
+       bgt     .Ldtrmm_kernel_L4_M4_42
 
-dtrmm_kernel_L4_M4_100:
+.Ldtrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L4_M4_END:
+.Ldtrmm_kernel_L4_M4_END:
 
 
-dtrmm_kernel_L4_M2_BEGIN:
+.Ldtrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L4_M1_BEGIN
+       ble     .Ldtrmm_kernel_L4_M1_BEGIN
 
-dtrmm_kernel_L4_M2_20:
+.Ldtrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M2_40
+       ble     .Ldtrmm_kernel_L4_M2_40
 
-dtrmm_kernel_L4_M2_22:
+.Ldtrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M2_22
+       bgt     .Ldtrmm_kernel_L4_M2_22
 
 
-dtrmm_kernel_L4_M2_40:
+.Ldtrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M2_100
+       ble     .Ldtrmm_kernel_L4_M2_100
 
-dtrmm_kernel_L4_M2_42:
+.Ldtrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M2_42
+       bgt     .Ldtrmm_kernel_L4_M2_42
 
-dtrmm_kernel_L4_M2_100:
+.Ldtrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L4_M2_END:
+.Ldtrmm_kernel_L4_M2_END:
 
 
-dtrmm_kernel_L4_M1_BEGIN:
+.Ldtrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L4_END
+       ble     .Ldtrmm_kernel_L4_END
 
-dtrmm_kernel_L4_M1_20:
+.Ldtrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L4_M1_40
+       ble     .Ldtrmm_kernel_L4_M1_40
 
-dtrmm_kernel_L4_M1_22:
+.Ldtrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M1_22
+       bgt     .Ldtrmm_kernel_L4_M1_22
 
 
-dtrmm_kernel_L4_M1_40:
+.Ldtrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L4_M1_100
+       ble     .Ldtrmm_kernel_L4_M1_100
 
-dtrmm_kernel_L4_M1_42:
+.Ldtrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L4_M1_42
+       bgt     .Ldtrmm_kernel_L4_M1_42
 
-dtrmm_kernel_L4_M1_100:
+.Ldtrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-dtrmm_kernel_L4_END:
+.Ldtrmm_kernel_L4_END:
 
        lsl     temp, origK, #5 
        add     origPB, origPB, temp            // B = B + K * 4 * 8
@@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     dtrmm_kernel_L4_BEGIN
+       bgt     .Ldtrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-dtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Ldtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     dtrmm_kernel_L999   // error, N was less than 4?
+       ble     .Ldtrmm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     dtrmm_kernel_L1_BEGIN
+       ble     .Ldtrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 #endif
        mov     pA, origPA                      // pA = A
 
-dtrmm_kernel_L2_M8_BEGIN:
+.Ldtrmm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dtrmm_kernel_L2_M4_BEGIN
+       ble     .Ldtrmm_kernel_L2_M4_BEGIN
 
-dtrmm_kernel_L2_M8_20:
+.Ldtrmm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dtrmm_kernel_L2_M8_40
+       ble     .Ldtrmm_kernel_L2_M8_40
        .align 5
 
-dtrmm_kernel_L2_M8_22:
+.Ldtrmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M8_22
+       bgt     .Ldtrmm_kernel_L2_M8_22
 
 
-dtrmm_kernel_L2_M8_40:
+.Ldtrmm_kernel_L2_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M8_100
+       ble     .Ldtrmm_kernel_L2_M8_100
 
-dtrmm_kernel_L2_M8_42:
+.Ldtrmm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M8_42
+       bgt     .Ldtrmm_kernel_L2_M8_42
 
-dtrmm_kernel_L2_M8_100:
+.Ldtrmm_kernel_L2_M8_100:
 
        SAVE8x2
 
@@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-dtrmm_kernel_L2_M8_END:
+.Ldtrmm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     dtrmm_kernel_L2_M8_20
+       bgt     .Ldtrmm_kernel_L2_M8_20
 
-dtrmm_kernel_L2_M4_BEGIN:
+.Ldtrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L2_M2_BEGIN
+       ble     .Ldtrmm_kernel_L2_M2_BEGIN
 
-dtrmm_kernel_L2_M4_20:
+.Ldtrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     dtrmm_kernel_L2_M4_40
+       ble     .Ldtrmm_kernel_L2_M4_40
        .align 5
 
-dtrmm_kernel_L2_M4_22:
+.Ldtrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M4_22
+       bgt     .Ldtrmm_kernel_L2_M4_22
 
 
-dtrmm_kernel_L2_M4_40:
+.Ldtrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M4_100
+       ble     .Ldtrmm_kernel_L2_M4_100
 
-dtrmm_kernel_L2_M4_42:
+.Ldtrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M4_42
+       bgt     .Ldtrmm_kernel_L2_M4_42
 
-dtrmm_kernel_L2_M4_100:
+.Ldtrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L2_M4_END:
+.Ldtrmm_kernel_L2_M4_END:
 
 
-dtrmm_kernel_L2_M2_BEGIN:
+.Ldtrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L2_M1_BEGIN
+       ble     .Ldtrmm_kernel_L2_M1_BEGIN
 
-dtrmm_kernel_L2_M2_20:
+.Ldtrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     dtrmm_kernel_L2_M2_40
+       ble     .Ldtrmm_kernel_L2_M2_40
 
-dtrmm_kernel_L2_M2_22:
+.Ldtrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M2_22
+       bgt     .Ldtrmm_kernel_L2_M2_22
 
 
-dtrmm_kernel_L2_M2_40:
+.Ldtrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M2_100
+       ble     .Ldtrmm_kernel_L2_M2_100
 
-dtrmm_kernel_L2_M2_42:
+.Ldtrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M2_42
+       bgt     .Ldtrmm_kernel_L2_M2_42
 
-dtrmm_kernel_L2_M2_100:
+.Ldtrmm_kernel_L2_M2_100:
 
        SAVE2x2
 
@@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L2_M2_END:
+.Ldtrmm_kernel_L2_M2_END:
 
 
-dtrmm_kernel_L2_M1_BEGIN:
+.Ldtrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L2_END
+       ble     .Ldtrmm_kernel_L2_END
 
-dtrmm_kernel_L2_M1_20:
+.Ldtrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     dtrmm_kernel_L2_M1_40
+       ble     .Ldtrmm_kernel_L2_M1_40
 
-dtrmm_kernel_L2_M1_22:
+.Ldtrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M1_22
+       bgt     .Ldtrmm_kernel_L2_M1_22
 
 
-dtrmm_kernel_L2_M1_40:
+.Ldtrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L2_M1_100
+       ble     .Ldtrmm_kernel_L2_M1_100
 
-dtrmm_kernel_L2_M1_42:
+.Ldtrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L2_M1_42
+       bgt     .Ldtrmm_kernel_L2_M1_42
 
-dtrmm_kernel_L2_M1_100:
+.Ldtrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-dtrmm_kernel_L2_END:
+.Ldtrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:
 
 /******************************************************************************/
 
-dtrmm_kernel_L1_BEGIN:
+.Ldtrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     dtrmm_kernel_L999 // done
+       ble     .Ldtrmm_kernel_L999 // done
 
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC , pC , LDC                   // Update pC to point to next
@@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
 #endif
        mov     pA, origPA                      // pA = A
 
-dtrmm_kernel_L1_M8_BEGIN:
+.Ldtrmm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     dtrmm_kernel_L1_M4_BEGIN
+       ble     .Ldtrmm_kernel_L1_M4_BEGIN
 
-dtrmm_kernel_L1_M8_20:
+.Ldtrmm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M8_40
+       ble     .Ldtrmm_kernel_L1_M8_40
        .align 5
 
-dtrmm_kernel_L1_M8_22:
+.Ldtrmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M8_22
+       bgt     .Ldtrmm_kernel_L1_M8_22
 
 
-dtrmm_kernel_L1_M8_40:
+.Ldtrmm_kernel_L1_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M8_100
+       ble     .Ldtrmm_kernel_L1_M8_100
 
-dtrmm_kernel_L1_M8_42:
+.Ldtrmm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M8_42
+       bgt     .Ldtrmm_kernel_L1_M8_42
 
-dtrmm_kernel_L1_M8_100:
+.Ldtrmm_kernel_L1_M8_100:
 
        SAVE8x1
 
@@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-dtrmm_kernel_L1_M8_END:
+.Ldtrmm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     dtrmm_kernel_L1_M8_20
+       bgt     .Ldtrmm_kernel_L1_M8_20
 
-dtrmm_kernel_L1_M4_BEGIN:
+.Ldtrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
        tst     counterI, #4                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L1_M2_BEGIN
+       ble     .Ldtrmm_kernel_L1_M2_BEGIN
 
-dtrmm_kernel_L1_M4_20:
+.Ldtrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M4_40
+       ble     .Ldtrmm_kernel_L1_M4_40
        .align 5
 
-dtrmm_kernel_L1_M4_22:
+.Ldtrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M4_22
+       bgt     .Ldtrmm_kernel_L1_M4_22
 
 
-dtrmm_kernel_L1_M4_40:
+.Ldtrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M4_100
+       ble     .Ldtrmm_kernel_L1_M4_100
 
-dtrmm_kernel_L1_M4_42:
+.Ldtrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M4_42
+       bgt     .Ldtrmm_kernel_L1_M4_42
 
-dtrmm_kernel_L1_M4_100:
+.Ldtrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-dtrmm_kernel_L1_M4_END:
+.Ldtrmm_kernel_L1_M4_END:
 
-dtrmm_kernel_L1_M2_BEGIN:
+.Ldtrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     dtrmm_kernel_L1_M1_BEGIN
+       ble     .Ldtrmm_kernel_L1_M1_BEGIN
 
-dtrmm_kernel_L1_M2_20:
+.Ldtrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M2_40
+       ble     .Ldtrmm_kernel_L1_M2_40
 
-dtrmm_kernel_L1_M2_22:
+.Ldtrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M2_22
+       bgt     .Ldtrmm_kernel_L1_M2_22
 
 
-dtrmm_kernel_L1_M2_40:
+.Ldtrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M2_100
+       ble     .Ldtrmm_kernel_L1_M2_100
 
-dtrmm_kernel_L1_M2_42:
+.Ldtrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M2_42
+       bgt     .Ldtrmm_kernel_L1_M2_42
 
-dtrmm_kernel_L1_M2_100:
+.Ldtrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-dtrmm_kernel_L1_M2_END:
+.Ldtrmm_kernel_L1_M2_END:
 
 
-dtrmm_kernel_L1_M1_BEGIN:
+.Ldtrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     dtrmm_kernel_L1_END
+       ble     .Ldtrmm_kernel_L1_END
 
-dtrmm_kernel_L1_M1_20:
+.Ldtrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     dtrmm_kernel_L1_M1_40
+       ble     .Ldtrmm_kernel_L1_M1_40
 
-dtrmm_kernel_L1_M1_22:
+.Ldtrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M1_22
+       bgt     .Ldtrmm_kernel_L1_M1_22
 
 
-dtrmm_kernel_L1_M1_40:
+.Ldtrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     dtrmm_kernel_L1_M1_100
+       ble     .Ldtrmm_kernel_L1_M1_100
 
-dtrmm_kernel_L1_M1_42:
+.Ldtrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     dtrmm_kernel_L1_M1_42
+       bgt     .Ldtrmm_kernel_L1_M1_42
 
-dtrmm_kernel_L1_M1_100:
+.Ldtrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-dtrmm_kernel_L1_END:
+.Ldtrmm_kernel_L1_END:
 
 
-dtrmm_kernel_L999:
+.Ldtrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 162f721..658551f 100644 (file)
@@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        SAVE_REGS
 
        cmp     N, xzr
-       ble     gemv_n_kernel_L999
+       ble     .Lgemv_n_kernel_L999
        cmp     M, xzr
-       ble     gemv_n_kernel_L999
+       ble     .Lgemv_n_kernel_L999
 
        lsl     LDA, LDA, #SHZ
        lsl     INC_X, INC_X, #SHZ
        mov     J, N
 
        cmp     INC_Y, #1
-       bne     gemv_n_kernel_S_BEGIN
+       bne     .Lgemv_n_kernel_S_BEGIN
 
-gemv_n_kernel_F_LOOP:
+.Lgemv_n_kernel_F_LOOP:
 
        ld1     TEMPV, [X], INC_X
        fmul    TEMP, ALPHA, TEMP
@@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
        mov     Y_IPTR, Y
        mov     Y_OPTR, Y
 
-gemv_n_kernel_F32:
+.Lgemv_n_kernel_F32:
 
        asr     I, M, #5
        cmp     I, xzr
-       beq     gemv_n_kernel_F4
+       beq     .Lgemv_n_kernel_F4
 
-gemv_n_kernel_F320:
+.Lgemv_n_kernel_F320:
 
        KERNEL_F16
        KERNEL_F16
 
        subs    I, I, #1
-       bne     gemv_n_kernel_F320
+       bne     .Lgemv_n_kernel_F320
 
-gemv_n_kernel_F4:
+.Lgemv_n_kernel_F4:
        ands    I, M, #31
        asr     I, I, #2
        cmp     I, xzr
-       beq     gemv_n_kernel_F1
+       beq     .Lgemv_n_kernel_F1
 
-gemv_n_kernel_F40:
+.Lgemv_n_kernel_F40:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     gemv_n_kernel_F40
+       bne     .Lgemv_n_kernel_F40
 
-gemv_n_kernel_F1:
+.Lgemv_n_kernel_F1:
        ands    I, M, #3
-       ble     gemv_n_kernel_F_END
+       ble     .Lgemv_n_kernel_F_END
 
-gemv_n_kernel_F10:
+.Lgemv_n_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-       bne     gemv_n_kernel_F10
+       bne     .Lgemv_n_kernel_F10
 
-gemv_n_kernel_F_END:
+.Lgemv_n_kernel_F_END:
 
        add     A, A, LDA
        subs    J, J, #1
-       bne     gemv_n_kernel_F_LOOP
+       bne     .Lgemv_n_kernel_F_LOOP
 
-       b       gemv_n_kernel_L999
+       b       .Lgemv_n_kernel_L999
 
-gemv_n_kernel_S_BEGIN:
+.Lgemv_n_kernel_S_BEGIN:
 
        INIT_S
 
-gemv_n_kernel_S_LOOP:
+.Lgemv_n_kernel_S_LOOP:
 
        ld1     TEMPV, [X], INC_X
        fmul    TEMP, ALPHA, TEMP
@@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:
 
        asr     I, M, #2
        cmp     I, xzr
-       ble     gemv_n_kernel_S1
+       ble     .Lgemv_n_kernel_S1
 
-gemv_n_kernel_S4:
+.Lgemv_n_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -298,27 +298,27 @@ gemv_n_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     gemv_n_kernel_S4
+       bne     .Lgemv_n_kernel_S4
 
-gemv_n_kernel_S1:
+.Lgemv_n_kernel_S1:
 
        ands    I, M, #3
-       ble     gemv_n_kernel_S_END
+       ble     .Lgemv_n_kernel_S_END
 
-gemv_n_kernel_S10:
+.Lgemv_n_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-       bne     gemv_n_kernel_S10
+       bne     .Lgemv_n_kernel_S10
 
-gemv_n_kernel_S_END:
+.Lgemv_n_kernel_S_END:
 
        add     A, A, LDA
        subs    J, J, #1
-       bne     gemv_n_kernel_S_LOOP
+       bne     .Lgemv_n_kernel_S_LOOP
 
-gemv_n_kernel_L999:
+.Lgemv_n_kernel_L999:
 
        mov     w0, wzr
 
index 28325f7..b04367a 100644 (file)
@@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        SAVE_REGS
 
        cmp     N, xzr
-       ble     gemv_t_kernel_L999
+       ble     .Lgemv_t_kernel_L999
        cmp     M, xzr
-       ble     gemv_t_kernel_L999
+       ble     .Lgemv_t_kernel_L999
 
        lsl     LDA, LDA, #SHZ
        lsl     INC_Y, INC_Y, #SHZ
        mov     J, N
 
        cmp     INC_X, #1
-       bne     gemv_t_kernel_S_BEGIN
+       bne     .Lgemv_t_kernel_S_BEGIN
 
-gemv_t_kernel_F_LOOP:
+.Lgemv_t_kernel_F_LOOP:
 
        fmov    TEMP, REG0
        fmov    TEMP1, REG0
@@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
        mov     A_PTR, A
        mov     X_PTR, X
 
-gemv_t_kernel_F32:
+.Lgemv_t_kernel_F32:
 
        asr     I, M, #5
        cmp     I, xzr
-       beq     gemv_t_kernel_F4
+       beq     .Lgemv_t_kernel_F4
 
-gemv_t_kernel_F320:
+.Lgemv_t_kernel_F320:
 
        KERNEL_F32
 
        subs    I, I, #1
-       bne     gemv_t_kernel_F320
+       bne     .Lgemv_t_kernel_F320
 
        KERNEL_F32_FINALIZE
 
-gemv_t_kernel_F4:
+.Lgemv_t_kernel_F4:
        ands    I, M, #31
        asr     I, I, #2
        cmp     I, xzr
-       beq     gemv_t_kernel_F1
+       beq     .Lgemv_t_kernel_F1
 
-gemv_t_kernel_F40:
+.Lgemv_t_kernel_F40:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     gemv_t_kernel_F40
+       bne     .Lgemv_t_kernel_F40
 
-gemv_t_kernel_F1:
+.Lgemv_t_kernel_F1:
 
        KERNEL_F4_FINALIZE
 
        ands    I, M, #3
-       ble     gemv_t_kernel_F_END
+       ble     .Lgemv_t_kernel_F_END
 
-gemv_t_kernel_F10:
+.Lgemv_t_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-       bne     gemv_t_kernel_F10
+       bne     .Lgemv_t_kernel_F10
 
-gemv_t_kernel_F_END:
+.Lgemv_t_kernel_F_END:
 
        ld1     TMPV1, [Y]
        add     A, A, LDA
        subs    J, J, #1
        fmadd   TMP1, ALPHA, TEMP, TMP1
        st1     TMPV1, [Y], INC_Y
-       bne     gemv_t_kernel_F_LOOP
+       bne     .Lgemv_t_kernel_F_LOOP
 
-       b       gemv_t_kernel_L999
+       b       .Lgemv_t_kernel_L999
 
-gemv_t_kernel_S_BEGIN:
+.Lgemv_t_kernel_S_BEGIN:
 
        INIT_S
 
-gemv_t_kernel_S_LOOP:
+.Lgemv_t_kernel_S_LOOP:
 
        fmov    TEMP, REG0
        mov     A_PTR, A
@@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:
 
        asr     I, M, #2
        cmp     I, xzr
-       ble     gemv_t_kernel_S1
+       ble     .Lgemv_t_kernel_S1
 
-gemv_t_kernel_S4:
+.Lgemv_t_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -329,30 +329,30 @@ gemv_t_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     gemv_t_kernel_S4
+       bne     .Lgemv_t_kernel_S4
 
-gemv_t_kernel_S1:
+.Lgemv_t_kernel_S1:
 
        ands    I, M, #3
-       ble     gemv_t_kernel_S_END
+       ble     .Lgemv_t_kernel_S_END
 
-gemv_t_kernel_S10:
+.Lgemv_t_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-       bne     gemv_t_kernel_S10
+       bne     .Lgemv_t_kernel_S10
 
-gemv_t_kernel_S_END:
+.Lgemv_t_kernel_S_END:
 
        ld1     TMPV1, [Y]
        add     A, A, LDA
        subs    J, J, #1
        fmadd   TMP1, ALPHA, TEMP, TMP1
        st1     TMPV1, [Y], INC_Y
-        bne     gemv_t_kernel_S_LOOP
+        bne     .Lgemv_t_kernel_S_LOOP
 
-gemv_t_kernel_L999:
+.Lgemv_t_kernel_L999:
 
        RESTORE_REGS
 
index 6c0d84f..31d0cd6 100644 (file)
@@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     iamax_kernel_zero
+       ble     .Liamax_kernel_zero
        cmp     INC_X, xzr
-       ble     iamax_kernel_zero
+       ble     .Liamax_kernel_zero
 
        cmp     INC_X, #1
-       bne     iamax_kernel_S_BEGIN
+       bne     .Liamax_kernel_S_BEGIN
        mov     x7, X
 
-iamax_kernel_F_BEGIN:
+.Liamax_kernel_F_BEGIN:
 
        INIT_S
 
        subs    N, N, #1
-       ble     iamax_kernel_L999
+       ble     .Liamax_kernel_L999
 
        asr     I, N, #3
        cmp     I, xzr
-       beq     iamax_kernel_F1
+       beq     .Liamax_kernel_F1
 
        add     Z, Z, #1
-iamax_kernel_F8:
+.Liamax_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     iamax_kernel_F8
+       bne     .Liamax_kernel_F8
 
        KERNEL_F8_FINALIZE
 
        sub     Z, Z, #1
-iamax_kernel_F1:
+.Liamax_kernel_F1:
 
        ands    I, N, #7
-       ble     iamax_kernel_L999
+       ble     .Liamax_kernel_L999
 
-iamax_kernel_F10:
+.Liamax_kernel_F10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     iamax_kernel_F10
+        bne     .Liamax_kernel_F10
 
-       b       iamax_kernel_L999
+       b       .Liamax_kernel_L999
 
-iamax_kernel_S_BEGIN:
+.Liamax_kernel_S_BEGIN:
 
        INIT_S
 
        subs    N, N, #1
-       ble     iamax_kernel_L999
+       ble     .Liamax_kernel_L999
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     iamax_kernel_S1
+       ble     .Liamax_kernel_S1
 
-iamax_kernel_S4:
+.Liamax_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -293,25 +293,25 @@ iamax_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     iamax_kernel_S4
+       bne     .Liamax_kernel_S4
 
-iamax_kernel_S1:
+.Liamax_kernel_S1:
 
        ands    I, N, #3
-       ble     iamax_kernel_L999
+       ble     .Liamax_kernel_L999
 
-iamax_kernel_S10:
+.Liamax_kernel_S10:
 
        KERNEL_S1
        subs    I, I, #1
-       bne     iamax_kernel_S10
+       bne     .Liamax_kernel_S10
 
-iamax_kernel_L999:
+.Liamax_kernel_L999:
 
        mov     x0, INDEX
        ret
 
-iamax_kernel_zero:
+.Liamax_kernel_zero:
 
        mov     x0, xzr
        ret
index 9b252ec..42fa4e7 100644 (file)
@@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     iamax_kernel_zero
+       ble     .Lizamax_kernel_zero
        cmp     INC_X, xzr
-       ble     iamax_kernel_zero
+       ble     .Lizamax_kernel_zero
 
        cmp     INC_X, #1
-       bne     iamax_kernel_S_BEGIN
+       bne     .Lizamax_kernel_S_BEGIN
        mov     x7, X
 
 
-iamax_kernel_F_BEGIN:
+.Lizamax_kernel_F_BEGIN:
 
        INIT_S
 
        subs    N, N, #1
-       ble     iamax_kernel_L999
+       ble     .Lizamax_kernel_L999
 
        asr     I, N, #3
        cmp     I, xzr
-       ble     iamax_kernel_F1
+       ble     .Lizamax_kernel_F1
 
        add     Z, Z, #1
 
-iamax_kernel_F8:
+.Lizamax_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     iamax_kernel_F8
+       bne     .Lizamax_kernel_F8
 
        KERNEL_F8_FINALIZE
 
        sub     Z, Z, #1
-iamax_kernel_F1:
+.Lizamax_kernel_F1:
 
        ands    I, N, #7
-       ble     iamax_kernel_L999
+       ble     .Lizamax_kernel_L999
 
-iamax_kernel_F10:
+.Lizamax_kernel_F10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     iamax_kernel_F10
+        bne     .Lizamax_kernel_F10
 
-       b       iamax_kernel_L999
+       b       .Lizamax_kernel_L999
 
-iamax_kernel_S_BEGIN:
+.Lizamax_kernel_S_BEGIN:
 
        INIT_S
 
        subs    N, N, #1
-       ble     iamax_kernel_L999
+       ble     .Lizamax_kernel_L999
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     iamax_kernel_S1
+       ble     .Lizamax_kernel_S1
 
-iamax_kernel_S4:
+.Lizamax_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -341,26 +341,26 @@ iamax_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     iamax_kernel_S4
+       bne     .Lizamax_kernel_S4
 
-iamax_kernel_S1:
+.Lizamax_kernel_S1:
 
        ands    I, N, #3
-       ble     iamax_kernel_L999
+       ble     .Lizamax_kernel_L999
 
-iamax_kernel_S10:
+.Lizamax_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     iamax_kernel_S10
+        bne     .Lizamax_kernel_S10
 
-iamax_kernel_L999:
+.Lizamax_kernel_L999:
 
        mov     x0, INDEX
        ret
 
-iamax_kernel_zero:
+.Lizamax_kernel_zero:
 
        mov     x0, xzr
        ret
index 5d06c13..e2cbd4d 100644 (file)
@@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
        INIT
 
        cmp     N, #0
-       ble     nrm2_kernel_L999
+       ble     .Lnrm2_kernel_L999
 
        cmp     INC_X, #0
-       beq     nrm2_kernel_L999
+       beq     .Lnrm2_kernel_L999
 
 
        cmp     INC_X, #1
-       bne     nrm2_kernel_S_BEGIN
+       bne     .Lnrm2_kernel_S_BEGIN
 
-nrm2_kernel_F_BEGIN:
+.Lnrm2_kernel_F_BEGIN:
 
        asr     I, N, #3                                // I = N / 8
        cmp     I, xzr
-       ble     nrm2_kernel_F1
+       ble     .Lnrm2_kernel_F1
 
-nrm2_kernel_F8:
+.Lnrm2_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     nrm2_kernel_F8
+       bne     .Lnrm2_kernel_F8
 
-nrm2_kernel_F1:
+.Lnrm2_kernel_F1:
 
        ands    I, N, #7
-       ble     nrm2_kernel_L999
+       ble     .Lnrm2_kernel_L999
 
 
-nrm2_kernel_F10:
+.Lnrm2_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-       bne     nrm2_kernel_F10
+       bne     .Lnrm2_kernel_F10
 
-       b       nrm2_kernel_L999
+       b       .Lnrm2_kernel_L999
 
-nrm2_kernel_S_BEGIN:
+.Lnrm2_kernel_S_BEGIN:
 
        INIT_S
 
@@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:
 
        .align 5
 
-nrm2_kernel_S10:
+.Lnrm2_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-       bne     nrm2_kernel_S10
+       bne     .Lnrm2_kernel_S10
 
 
-nrm2_kernel_L999:
+.Lnrm2_kernel_L999:
        fsqrt   SSQ, SSQ
        fmul    SSQ, SCALE, SSQ
 
index 5721252..00c3085 100644 (file)
@@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     rot_kernel_L999
+       ble     .Lrot_kernel_L999
 
        INIT
 
        cmp     INC_X, #1
-       bne     rot_kernel_S_BEGIN
+       bne     .Lrot_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     rot_kernel_S_BEGIN
+       bne     .Lrot_kernel_S_BEGIN
 
-rot_kernel_F_BEGIN:
+.Lrot_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     rot_kernel_F1
+       beq     .Lrot_kernel_F1
 
        KERNEL_INIT_F4
 
-rot_kernel_F4:
+.Lrot_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     rot_kernel_F4
+       bne     .Lrot_kernel_F4
 
-rot_kernel_F1:
+.Lrot_kernel_F1:
 
        ands    I, N, #3
-       ble     rot_kernel_L999
+       ble     .Lrot_kernel_L999
 
        INIT_F1
 
-rot_kernel_F10:
+.Lrot_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     rot_kernel_F10
+        bne     .Lrot_kernel_F10
 
        mov     w0, wzr
        ret
 
-rot_kernel_S_BEGIN:
+.Lrot_kernel_S_BEGIN:
 
        INIT_S
        INIT_F1
@@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     rot_kernel_S1
+       ble     .Lrot_kernel_S1
 
-rot_kernel_S4:
+.Lrot_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -224,22 +224,22 @@ rot_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     rot_kernel_S4
+       bne     .Lrot_kernel_S4
 
-rot_kernel_S1:
+.Lrot_kernel_S1:
 
        ands    I, N, #3
-       ble     rot_kernel_L999
+       ble     .Lrot_kernel_L999
 
 
-rot_kernel_S10:
+.Lrot_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     rot_kernel_S10
+        bne     .Lrot_kernel_S10
 
-rot_kernel_L999:
+.Lrot_kernel_L999:
 
        mov     w0, wzr
        ret
index 91d469d..09c41cd 100644 (file)
@@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     scal_kernel_L999
+       ble     .Lscal_kernel_L999
 
        fcmp    DA, #0.0
-       beq     scal_kernel_zero
+       beq     .Lscal_kernel_zero
 
        cmp     INC_X, #1
-       bne     scal_kernel_S_BEGIN
+       bne     .Lscal_kernel_S_BEGIN
 
-scal_kernel_F_BEGIN:
+.Lscal_kernel_F_BEGIN:
 
        asr     I, N, #3
        cmp     I, xzr
-       beq     scal_kernel_F1
+       beq     .Lscal_kernel_F1
 
        KERNEL_INIT_F8
 
-scal_kernel_F8:
+.Lscal_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     scal_kernel_F8
+       bne     .Lscal_kernel_F8
 
-scal_kernel_F1:
+.Lscal_kernel_F1:
 
        ands    I, N, #7
-       ble     scal_kernel_L999
+       ble     .Lscal_kernel_L999
 
-scal_kernel_F10:
+.Lscal_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     scal_kernel_F10
+        bne     .Lscal_kernel_F10
 
        mov     w0, wzr
        ret
 
-scal_kernel_S_BEGIN:
+.Lscal_kernel_S_BEGIN:
 
        INIT_S
        mov     X_COPY, X
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     scal_kernel_S1
+       ble     .Lscal_kernel_S1
 
-scal_kernel_S4:
+.Lscal_kernel_S4:
 
        KERNEL_S4
 
        subs    I, I, #1
-       bne     scal_kernel_S4
+       bne     .Lscal_kernel_S4
 
-scal_kernel_S1:
+.Lscal_kernel_S1:
 
        ands    I, N, #3
-       ble     scal_kernel_L999
+       ble     .Lscal_kernel_L999
 
-scal_kernel_S10:
+.Lscal_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     scal_kernel_S10
+        bne     .Lscal_kernel_S10
 
-scal_kernel_L999:
+.Lscal_kernel_L999:
 
        mov     w0, wzr
        ret
 
-scal_kernel_zero:
+.Lscal_kernel_zero:
 
        INIT_S
 
-scal_kernel_Z1:
+.Lscal_kernel_Z1:
 
        st1     DAV, [X], INC_X
        subs    N, N, #1
-        bne     scal_kernel_Z1
+        bne     .Lscal_kernel_Z1
 
        mov     w0, wzr
        ret
index 6e3645b..99099ea 100644 (file)
@@ -1070,7 +1070,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        PROLOGUE
 
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
 
        .align 5
        add     sp, sp, #-(11 * 16)
@@ -1098,11 +1098,11 @@ sgemm_kernel_begin:
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     sgemm_kernel_L2_BEGIN
+       ble     .Lsgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1112,21 +1112,21 @@ sgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     sgemm_kernel_L4_M8_BEGIN
+       ble     .Lsgemm_kernel_L4_M8_BEGIN
 
        .align 5
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #3
        cmp     counterL , #2
-       blt     sgemm_kernel_L4_M16_32
+       blt     .Lsgemm_kernel_L4_M16_32
 
        KERNEL16x4_I
        KERNEL16x4_M2
@@ -1138,10 +1138,10 @@ sgemm_kernel_L4_M16_20:
        KERNEL16x4_M2
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M16_22a
+       ble     .Lsgemm_kernel_L4_M16_22a
 
        .align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
 
        KERNEL16x4_M1
        KERNEL16x4_M2
@@ -1153,10 +1153,10 @@ sgemm_kernel_L4_M16_22:
        KERNEL16x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M16_22
+       bgt     .Lsgemm_kernel_L4_M16_22
 
        .align 5
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
 
        KERNEL16x4_M1
        KERNEL16x4_M2
@@ -1167,13 +1167,13 @@ sgemm_kernel_L4_M16_22a:
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b        sgemm_kernel_L4_M16_44
+       b        .Lsgemm_kernel_L4_M16_44
 
        .align 5
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M16_40
+       ble     .Lsgemm_kernel_L4_M16_40
 
        KERNEL16x4_I
        KERNEL16x4_M2
@@ -1184,187 +1184,187 @@ sgemm_kernel_L4_M16_32:
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b       sgemm_kernel_L4_M16_44
+       b       .Lsgemm_kernel_L4_M16_44
 
-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
 
        INIT16x4
 
-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
 
        ands    counterL , origK, #7
-       ble     sgemm_kernel_L4_M16_100
+       ble     .Lsgemm_kernel_L4_M16_100
 
        .align 5
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
 
        KERNEL16x4_SUB
        subs    counterL, counterL, #1
-       bne     sgemm_kernel_L4_M16_46
+       bne     .Lsgemm_kernel_L4_M16_46
 
-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE16x4
 
-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
        subs    counterI, counterI, #1
-       bne     sgemm_kernel_L4_M16_20
+       bne     .Lsgemm_kernel_L4_M16_20
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L4_M4_BEGIN
+       ble     .Lsgemm_kernel_L4_M4_BEGIN
 
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M8_32
+       blt     .Lsgemm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M8_22a
+       ble     .Lsgemm_kernel_L4_M8_22a
        .align 5
 
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M8_22
+       bgt     .Lsgemm_kernel_L4_M8_22
 
-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        sgemm_kernel_L4_M8_44
+       b        .Lsgemm_kernel_L4_M8_44
 
-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M8_40
+       ble     .Lsgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_E
 
-       b       sgemm_kernel_L4_M8_44
+       b       .Lsgemm_kernel_L4_M8_44
 
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M8_100
+       ble     .Lsgemm_kernel_L4_M8_100
 
-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L4_M2_BEGIN
+       ble     .Lsgemm_kernel_L4_M2_BEGIN
 
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M4_32
+       blt     .Lsgemm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M4_22a
+       ble     .Lsgemm_kernel_L4_M4_22a
        .align 5
 
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M4_22
+       bgt     .Lsgemm_kernel_L4_M4_22
 
-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        sgemm_kernel_L4_M4_44
+       b        .Lsgemm_kernel_L4_M4_44
 
-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M4_40
+       ble     .Lsgemm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       sgemm_kernel_L4_M4_44
+       b       .Lsgemm_kernel_L4_M4_44
 
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M4_100
+       ble     .Lsgemm_kernel_L4_M4_100
 
-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L4_M1_BEGIN
+       ble     .Lsgemm_kernel_L4_M1_BEGIN
 
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1372,9 +1372,9 @@ sgemm_kernel_L4_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M2_40
+       ble     .Lsgemm_kernel_L4_M2_40
 
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1387,34 +1387,34 @@ sgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_22
+       bgt     .Lsgemm_kernel_L4_M2_22
 
 
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M2_100
+       ble     .Lsgemm_kernel_L4_M2_100
 
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_42
+       bgt     .Lsgemm_kernel_L4_M2_42
 
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
 
 
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1422,9 +1422,9 @@ sgemm_kernel_L4_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M1_40
+       ble     .Lsgemm_kernel_L4_M1_40
 
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1436,42 +1436,42 @@ sgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_22
+       bgt     .Lsgemm_kernel_L4_M1_22
 
 
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M1_100
+       ble     .Lsgemm_kernel_L4_M1_100
 
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_42
+       bgt     .Lsgemm_kernel_L4_M1_42
 
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 4 * 4
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     sgemm_kernel_L4_BEGIN
+       bgt     .Lsgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     sgemm_kernel_L999
+       ble     .Lsgemm_kernel_L999
 
        tst     counterJ , #2
-       ble     sgemm_kernel_L1_BEGIN
+       ble     .Lsgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1479,14 +1479,14 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-sgemm_kernel_L2_M16_BEGIN:
+.Lsgemm_kernel_L2_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI,#0
-       ble     sgemm_kernel_L2_M8_BEGIN
+       ble     .Lsgemm_kernel_L2_M8_BEGIN
 
-sgemm_kernel_L2_M16_20:
+.Lsgemm_kernel_L2_M16_20:
 
        INIT16x2
 
@@ -1494,10 +1494,10 @@ sgemm_kernel_L2_M16_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M16_40
+       ble     .Lsgemm_kernel_L2_M16_40
        .align 5
 
-sgemm_kernel_L2_M16_22:
+.Lsgemm_kernel_L2_M16_22:
        KERNEL16x2_SUB
        KERNEL16x2_SUB
        KERNEL16x2_SUB
@@ -1509,41 +1509,41 @@ sgemm_kernel_L2_M16_22:
        KERNEL16x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M16_22
+       bgt     .Lsgemm_kernel_L2_M16_22
 
 
-sgemm_kernel_L2_M16_40:
+.Lsgemm_kernel_L2_M16_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M16_100
+       ble     .Lsgemm_kernel_L2_M16_100
 
-sgemm_kernel_L2_M16_42:
+.Lsgemm_kernel_L2_M16_42:
 
        KERNEL16x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M16_42
+       bgt     .Lsgemm_kernel_L2_M16_42
 
-sgemm_kernel_L2_M16_100:
+.Lsgemm_kernel_L2_M16_100:
 
        SAVE16x2
 
-sgemm_kernel_L2_M16_END:
+.Lsgemm_kernel_L2_M16_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L2_M16_20
+       bgt     .Lsgemm_kernel_L2_M16_20
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L2_M4_BEGIN
+       ble     .Lsgemm_kernel_L2_M4_BEGIN
 
-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1551,10 +1551,10 @@ sgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M8_40
+       ble     .Lsgemm_kernel_L2_M8_40
        .align 5
 
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1566,38 +1566,38 @@ sgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M8_22
+       bgt     .Lsgemm_kernel_L2_M8_22
 
 
-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M8_100
+       ble     .Lsgemm_kernel_L2_M8_100
 
-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M8_42
+       bgt     .Lsgemm_kernel_L2_M8_42
 
-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L2_M2_BEGIN
+       ble     .Lsgemm_kernel_L2_M2_BEGIN
 
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1605,10 +1605,10 @@ sgemm_kernel_L2_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M4_40
+       ble     .Lsgemm_kernel_L2_M4_40
        .align 5
 
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1620,40 +1620,40 @@ sgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_22
+       bgt     .Lsgemm_kernel_L2_M4_22
 
 
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M4_100
+       ble     .Lsgemm_kernel_L2_M4_100
 
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_42
+       bgt     .Lsgemm_kernel_L2_M4_42
 
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
 
 //------------------------------------------------------------------------------
 
 
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L2_M1_BEGIN
+       ble     .Lsgemm_kernel_L2_M1_BEGIN
 
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1661,9 +1661,9 @@ sgemm_kernel_L2_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     sgemm_kernel_L2_M2_40
+       ble     .Lsgemm_kernel_L2_M2_40
 
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1676,34 +1676,34 @@ sgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_22
+       bgt     .Lsgemm_kernel_L2_M2_22
 
 
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M2_100
+       ble     .Lsgemm_kernel_L2_M2_100
 
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_42
+       bgt     .Lsgemm_kernel_L2_M2_42
 
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
 
 
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1711,9 +1711,9 @@ sgemm_kernel_L2_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     sgemm_kernel_L2_M1_40
+       ble     .Lsgemm_kernel_L2_M1_40
 
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1725,36 +1725,36 @@ sgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_22
+       bgt     .Lsgemm_kernel_L2_M1_22
 
 
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M1_100
+       ble     .Lsgemm_kernel_L2_M1_100
 
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_42
+       bgt     .Lsgemm_kernel_L2_M1_42
 
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
 
        add     origPB, origPB, origK, lsl #3   // B = B + K * 2 * 4
 
 /******************************************************************************/
 
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     sgemm_kernel_L999 // done
+       ble     .Lsgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1762,14 +1762,14 @@ sgemm_kernel_L1_BEGIN:
 
        mov     pA, origPA                      // pA = A
 
-sgemm_kernel_L1_M16_BEGIN:
+.Lsgemm_kernel_L1_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     sgemm_kernel_L1_M8_BEGIN
+       ble     .Lsgemm_kernel_L1_M8_BEGIN
 
-sgemm_kernel_L1_M16_20:
+.Lsgemm_kernel_L1_M16_20:
 
        INIT16x1
 
@@ -1777,10 +1777,10 @@ sgemm_kernel_L1_M16_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M16_40
+       ble     .Lsgemm_kernel_L1_M16_40
        .align 5
 
-sgemm_kernel_L1_M16_22:
+.Lsgemm_kernel_L1_M16_22:
        KERNEL16x1_SUB
        KERNEL16x1_SUB
        KERNEL16x1_SUB
@@ -1792,42 +1792,42 @@ sgemm_kernel_L1_M16_22:
        KERNEL16x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M16_22
+       bgt     .Lsgemm_kernel_L1_M16_22
 
 
-sgemm_kernel_L1_M16_40:
+.Lsgemm_kernel_L1_M16_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M16_100
+       ble     .Lsgemm_kernel_L1_M16_100
 
-sgemm_kernel_L1_M16_42:
+.Lsgemm_kernel_L1_M16_42:
 
        KERNEL16x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M16_42
+       bgt     .Lsgemm_kernel_L1_M16_42
 
-sgemm_kernel_L1_M16_100:
+.Lsgemm_kernel_L1_M16_100:
 
        SAVE16x1
 
-sgemm_kernel_L1_M16_END:
+.Lsgemm_kernel_L1_M16_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L1_M16_20
+       bgt     .Lsgemm_kernel_L1_M16_20
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L1_M4_BEGIN
+       ble     .Lsgemm_kernel_L1_M4_BEGIN
 
-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -1835,10 +1835,10 @@ sgemm_kernel_L1_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M8_40
+       ble     .Lsgemm_kernel_L1_M8_40
        .align 5
 
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1850,38 +1850,38 @@ sgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M8_22
+       bgt     .Lsgemm_kernel_L1_M8_22
 
 
-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M8_100
+       ble     .Lsgemm_kernel_L1_M8_100
 
-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M8_42
+       bgt     .Lsgemm_kernel_L1_M8_42
 
-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L1_M2_BEGIN
+       ble     .Lsgemm_kernel_L1_M2_BEGIN
 
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1889,10 +1889,10 @@ sgemm_kernel_L1_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M4_40
+       ble     .Lsgemm_kernel_L1_M4_40
        .align 5
 
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1904,39 +1904,39 @@ sgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_22
+       bgt     .Lsgemm_kernel_L1_M4_22
 
 
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M4_100
+       ble     .Lsgemm_kernel_L1_M4_100
 
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_42
+       bgt     .Lsgemm_kernel_L1_M4_42
 
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L1_M1_BEGIN
+       ble     .Lsgemm_kernel_L1_M1_BEGIN
 
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1944,9 +1944,9 @@ sgemm_kernel_L1_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M2_40
+       ble     .Lsgemm_kernel_L1_M2_40
 
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1959,34 +1959,34 @@ sgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_22
+       bgt     .Lsgemm_kernel_L1_M2_22
 
 
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M2_100
+       ble     .Lsgemm_kernel_L1_M2_100
 
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_42
+       bgt     .Lsgemm_kernel_L1_M2_42
 
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
 
 
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1994,9 +1994,9 @@ sgemm_kernel_L1_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M1_40
+       ble     .Lsgemm_kernel_L1_M1_40
 
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2008,28 +2008,28 @@ sgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_22
+       bgt     .Lsgemm_kernel_L1_M1_22
 
 
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M1_100
+       ble     .Lsgemm_kernel_L1_M1_100
 
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_42
+       bgt     .Lsgemm_kernel_L1_M1_42
 
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
 
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 0ee10e1..144d4bc 100644 (file)
@@ -1117,7 +1117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        PROLOGUE
 
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
 
        .align 5
        add     sp, sp, #-(11 * 16)
@@ -1145,11 +1145,11 @@ sgemm_kernel_begin:
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     sgemm_kernel_L2_BEGIN
+       ble     .Lsgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1159,21 +1159,21 @@ sgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     sgemm_kernel_L4_M8_BEGIN
+       ble     .Lsgemm_kernel_L4_M8_BEGIN
 
        .align 5
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #4        // L = K / 16
        cmp     counterL , #2
-       blt     sgemm_kernel_L4_M16_32
+       blt     .Lsgemm_kernel_L4_M16_32
 
        KERNEL16x4_I
        KERNEL16x4_M2
@@ -1182,18 +1182,18 @@ sgemm_kernel_L4_M16_20:
        KERNEL16x4_M1_M2_x1
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M16_22a
+       ble     .Lsgemm_kernel_L4_M16_22a
 
        .align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
 
        KERNEL16x4_M1_M2_x8
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M16_22
+       bgt     .Lsgemm_kernel_L4_M16_22
 
        .align 5
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
 
        KERNEL16x4_M1_M2_x4
        KERNEL16x4_M1_M2_x2
@@ -1201,13 +1201,13 @@ sgemm_kernel_L4_M16_22a:
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b        sgemm_kernel_L4_M16_44
+       b        .Lsgemm_kernel_L4_M16_44
 
        .align 5
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M16_40
+       ble     .Lsgemm_kernel_L4_M16_40
 
        KERNEL16x4_I
        KERNEL16x4_M2
@@ -1216,187 +1216,187 @@ sgemm_kernel_L4_M16_32:
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b       sgemm_kernel_L4_M16_44
+       b       .Lsgemm_kernel_L4_M16_44
 
-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
 
        INIT16x4
 
-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
 
        ands    counterL , origK, #15
-       ble     sgemm_kernel_L4_M16_100
+       ble     .Lsgemm_kernel_L4_M16_100
 
        .align 5
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
 
        KERNEL16x4_SUB
        subs    counterL, counterL, #1
-       bne     sgemm_kernel_L4_M16_46
+       bne     .Lsgemm_kernel_L4_M16_46
 
-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE16x4
 
-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
        subs    counterI, counterI, #1
-       bne     sgemm_kernel_L4_M16_20
+       bne     .Lsgemm_kernel_L4_M16_20
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L4_M4_BEGIN
+       ble     .Lsgemm_kernel_L4_M4_BEGIN
 
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M8_32
+       blt     .Lsgemm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M8_22a
+       ble     .Lsgemm_kernel_L4_M8_22a
        .align 5
 
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M8_22
+       bgt     .Lsgemm_kernel_L4_M8_22
 
-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        sgemm_kernel_L4_M8_44
+       b        .Lsgemm_kernel_L4_M8_44
 
-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M8_40
+       ble     .Lsgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_E
 
-       b       sgemm_kernel_L4_M8_44
+       b       .Lsgemm_kernel_L4_M8_44
 
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M8_100
+       ble     .Lsgemm_kernel_L4_M8_100
 
-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L4_M2_BEGIN
+       ble     .Lsgemm_kernel_L4_M2_BEGIN
 
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M4_32
+       blt     .Lsgemm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M4_22a
+       ble     .Lsgemm_kernel_L4_M4_22a
        .align 5
 
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M4_22
+       bgt     .Lsgemm_kernel_L4_M4_22
 
-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        sgemm_kernel_L4_M4_44
+       b        .Lsgemm_kernel_L4_M4_44
 
-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M4_40
+       ble     .Lsgemm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       sgemm_kernel_L4_M4_44
+       b       .Lsgemm_kernel_L4_M4_44
 
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M4_100
+       ble     .Lsgemm_kernel_L4_M4_100
 
-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L4_M1_BEGIN
+       ble     .Lsgemm_kernel_L4_M1_BEGIN
 
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1404,9 +1404,9 @@ sgemm_kernel_L4_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M2_40
+       ble     .Lsgemm_kernel_L4_M2_40
 
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1419,34 +1419,34 @@ sgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_22
+       bgt     .Lsgemm_kernel_L4_M2_22
 
 
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M2_100
+       ble     .Lsgemm_kernel_L4_M2_100
 
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_42
+       bgt     .Lsgemm_kernel_L4_M2_42
 
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
 
 
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1454,9 +1454,9 @@ sgemm_kernel_L4_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M1_40
+       ble     .Lsgemm_kernel_L4_M1_40
 
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1468,42 +1468,42 @@ sgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_22
+       bgt     .Lsgemm_kernel_L4_M1_22
 
 
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M1_100
+       ble     .Lsgemm_kernel_L4_M1_100
 
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_42
+       bgt     .Lsgemm_kernel_L4_M1_42
 
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 4 * 4
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     sgemm_kernel_L4_BEGIN
+       bgt     .Lsgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     sgemm_kernel_L999
+       ble     .Lsgemm_kernel_L999
 
        tst     counterJ , #2
-       ble     sgemm_kernel_L1_BEGIN
+       ble     .Lsgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1511,14 +1511,14 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-sgemm_kernel_L2_M16_BEGIN:
+.Lsgemm_kernel_L2_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI,#0
-       ble     sgemm_kernel_L2_M8_BEGIN
+       ble     .Lsgemm_kernel_L2_M8_BEGIN
 
-sgemm_kernel_L2_M16_20:
+.Lsgemm_kernel_L2_M16_20:
 
        INIT16x2
 
@@ -1526,10 +1526,10 @@ sgemm_kernel_L2_M16_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M16_40
+       ble     .Lsgemm_kernel_L2_M16_40
        .align 5
 
-sgemm_kernel_L2_M16_22:
+.Lsgemm_kernel_L2_M16_22:
        KERNEL16x2_SUB
        KERNEL16x2_SUB
        KERNEL16x2_SUB
@@ -1541,41 +1541,41 @@ sgemm_kernel_L2_M16_22:
        KERNEL16x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M16_22
+       bgt     .Lsgemm_kernel_L2_M16_22
 
 
-sgemm_kernel_L2_M16_40:
+.Lsgemm_kernel_L2_M16_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M16_100
+       ble     .Lsgemm_kernel_L2_M16_100
 
-sgemm_kernel_L2_M16_42:
+.Lsgemm_kernel_L2_M16_42:
 
        KERNEL16x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M16_42
+       bgt     .Lsgemm_kernel_L2_M16_42
 
-sgemm_kernel_L2_M16_100:
+.Lsgemm_kernel_L2_M16_100:
 
        SAVE16x2
 
-sgemm_kernel_L2_M16_END:
+.Lsgemm_kernel_L2_M16_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L2_M16_20
+       bgt     .Lsgemm_kernel_L2_M16_20
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L2_M4_BEGIN
+       ble     .Lsgemm_kernel_L2_M4_BEGIN
 
-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1583,10 +1583,10 @@ sgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M8_40
+       ble     .Lsgemm_kernel_L2_M8_40
        .align 5
 
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1598,38 +1598,38 @@ sgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M8_22
+       bgt     .Lsgemm_kernel_L2_M8_22
 
 
-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M8_100
+       ble     .Lsgemm_kernel_L2_M8_100
 
-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M8_42
+       bgt     .Lsgemm_kernel_L2_M8_42
 
-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L2_M2_BEGIN
+       ble     .Lsgemm_kernel_L2_M2_BEGIN
 
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1637,10 +1637,10 @@ sgemm_kernel_L2_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M4_40
+       ble     .Lsgemm_kernel_L2_M4_40
        .align 5
 
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1652,40 +1652,40 @@ sgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_22
+       bgt     .Lsgemm_kernel_L2_M4_22
 
 
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M4_100
+       ble     .Lsgemm_kernel_L2_M4_100
 
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_42
+       bgt     .Lsgemm_kernel_L2_M4_42
 
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
 
 //------------------------------------------------------------------------------
 
 
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L2_M1_BEGIN
+       ble     .Lsgemm_kernel_L2_M1_BEGIN
 
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1693,9 +1693,9 @@ sgemm_kernel_L2_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     sgemm_kernel_L2_M2_40
+       ble     .Lsgemm_kernel_L2_M2_40
 
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1708,34 +1708,34 @@ sgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_22
+       bgt     .Lsgemm_kernel_L2_M2_22
 
 
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M2_100
+       ble     .Lsgemm_kernel_L2_M2_100
 
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_42
+       bgt     .Lsgemm_kernel_L2_M2_42
 
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
 
 
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1743,9 +1743,9 @@ sgemm_kernel_L2_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     sgemm_kernel_L2_M1_40
+       ble     .Lsgemm_kernel_L2_M1_40
 
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1757,36 +1757,36 @@ sgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_22
+       bgt     .Lsgemm_kernel_L2_M1_22
 
 
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M1_100
+       ble     .Lsgemm_kernel_L2_M1_100
 
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_42
+       bgt     .Lsgemm_kernel_L2_M1_42
 
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
 
        add     origPB, origPB, origK, lsl #3   // B = B + K * 2 * 4
 
 /******************************************************************************/
 
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     sgemm_kernel_L999 // done
+       ble     .Lsgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1794,14 +1794,14 @@ sgemm_kernel_L1_BEGIN:
 
        mov     pA, origPA                      // pA = A
 
-sgemm_kernel_L1_M16_BEGIN:
+.Lsgemm_kernel_L1_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     sgemm_kernel_L1_M8_BEGIN
+       ble     .Lsgemm_kernel_L1_M8_BEGIN
 
-sgemm_kernel_L1_M16_20:
+.Lsgemm_kernel_L1_M16_20:
 
        INIT16x1
 
@@ -1809,10 +1809,10 @@ sgemm_kernel_L1_M16_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M16_40
+       ble     .Lsgemm_kernel_L1_M16_40
        .align 5
 
-sgemm_kernel_L1_M16_22:
+.Lsgemm_kernel_L1_M16_22:
        KERNEL16x1_SUB
        KERNEL16x1_SUB
        KERNEL16x1_SUB
@@ -1824,42 +1824,42 @@ sgemm_kernel_L1_M16_22:
        KERNEL16x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M16_22
+       bgt     .Lsgemm_kernel_L1_M16_22
 
 
-sgemm_kernel_L1_M16_40:
+.Lsgemm_kernel_L1_M16_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M16_100
+       ble     .Lsgemm_kernel_L1_M16_100
 
-sgemm_kernel_L1_M16_42:
+.Lsgemm_kernel_L1_M16_42:
 
        KERNEL16x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M16_42
+       bgt     .Lsgemm_kernel_L1_M16_42
 
-sgemm_kernel_L1_M16_100:
+.Lsgemm_kernel_L1_M16_100:
 
        SAVE16x1
 
-sgemm_kernel_L1_M16_END:
+.Lsgemm_kernel_L1_M16_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L1_M16_20
+       bgt     .Lsgemm_kernel_L1_M16_20
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L1_M4_BEGIN
+       ble     .Lsgemm_kernel_L1_M4_BEGIN
 
-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -1867,10 +1867,10 @@ sgemm_kernel_L1_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M8_40
+       ble     .Lsgemm_kernel_L1_M8_40
        .align 5
 
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -1882,38 +1882,38 @@ sgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M8_22
+       bgt     .Lsgemm_kernel_L1_M8_22
 
 
-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M8_100
+       ble     .Lsgemm_kernel_L1_M8_100
 
-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M8_42
+       bgt     .Lsgemm_kernel_L1_M8_42
 
-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L1_M2_BEGIN
+       ble     .Lsgemm_kernel_L1_M2_BEGIN
 
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1921,10 +1921,10 @@ sgemm_kernel_L1_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M4_40
+       ble     .Lsgemm_kernel_L1_M4_40
        .align 5
 
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1936,39 +1936,39 @@ sgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_22
+       bgt     .Lsgemm_kernel_L1_M4_22
 
 
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M4_100
+       ble     .Lsgemm_kernel_L1_M4_100
 
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_42
+       bgt     .Lsgemm_kernel_L1_M4_42
 
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
 
 //------------------------------------------------------------------------------
 
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L1_M1_BEGIN
+       ble     .Lsgemm_kernel_L1_M1_BEGIN
 
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1976,9 +1976,9 @@ sgemm_kernel_L1_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M2_40
+       ble     .Lsgemm_kernel_L1_M2_40
 
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1991,34 +1991,34 @@ sgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_22
+       bgt     .Lsgemm_kernel_L1_M2_22
 
 
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M2_100
+       ble     .Lsgemm_kernel_L1_M2_100
 
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_42
+       bgt     .Lsgemm_kernel_L1_M2_42
 
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
 
 
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -2026,9 +2026,9 @@ sgemm_kernel_L1_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M1_40
+       ble     .Lsgemm_kernel_L1_M1_40
 
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2040,28 +2040,28 @@ sgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_22
+       bgt     .Lsgemm_kernel_L1_M1_22
 
 
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M1_100
+       ble     .Lsgemm_kernel_L1_M1_100
 
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_42
+       bgt     .Lsgemm_kernel_L1_M1_42
 
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
 
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index a5cf7ba..76c11f1 100644 (file)
@@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     sgemm_kernel_L2_BEGIN
+       ble     .Lsgemm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
 
@@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
        add     pA_2, temp, pA_1
        add     pA_3, temp, pA_2
 
-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     sgemm_kernel_L4_M8_BEGIN
+       ble     .Lsgemm_kernel_L4_M8_BEGIN
 
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
 
        mov     pB, origPB
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M16_32
+       blt     .Lsgemm_kernel_L4_M16_32
 
        KERNEL16x4_I                            // do one in the K
        KERNEL16x4_M2                           // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M16_22a
+       ble     .Lsgemm_kernel_L4_M16_22a
        .align 5
 
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
 
        KERNEL16x4_M1
        KERNEL16x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M16_22
+       bgt     .Lsgemm_kernel_L4_M16_22
 
 
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
 
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b        sgemm_kernel_L4_M16_44
+       b        .Lsgemm_kernel_L4_M16_44
 
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M16_40
+       ble     .Lsgemm_kernel_L4_M16_40
 
        KERNEL16x4_I
 
        KERNEL16x4_E
 
-       b       sgemm_kernel_L4_M16_44
+       b       .Lsgemm_kernel_L4_M16_44
 
 
-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
 
        INIT16x4
 
-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M16_100
+       ble     .Lsgemm_kernel_L4_M16_100
 
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
 
        KERNEL16x4_SUB
 
-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
 
        SAVE16x4
 
-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
        lsl     temp, origK, #4                 // k * 4 * 4 = Four rows of A
        add     pA_0, pA_0, temp
        add     pA_0, pA_0, temp
@@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
        add     pA_2, pA_1, temp
        add     pA_3, pA_2, temp
        subs    counterI, counterI, #1
-       bne     sgemm_kernel_L4_M16_20
+       bne     .Lsgemm_kernel_L4_M16_20
 
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
        mov     counterI, origM
        tst     counterI , #15
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #8
-       ble     sgemm_kernel_L4_M4_BEGIN
+       ble     .Lsgemm_kernel_L4_M4_BEGIN
 
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
 
        INIT8x4
 
        mov     pB, origPB
        asr     counterL, origK, #3             // counterL = counterL / 8
        cmp     counterL, #0
-       ble     sgemm_kernel_L4_M8_40
+       ble     .Lsgemm_kernel_L4_M8_40
 
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
 
        KERNEL8x4_SUB
        KERNEL8x4_SUB
@@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M8_22
+       bgt     .Lsgemm_kernel_L4_M8_22
 
 
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M8_100
+       ble     .Lsgemm_kernel_L4_M8_100
 
-sgemm_kernel_L4_M8_42:
+.Lsgemm_kernel_L4_M8_42:
 
        KERNEL8x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M8_42
+       bgt     .Lsgemm_kernel_L4_M8_42
 
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
        lsl     temp, origK, #4                 // k * 4 * 4
        add     pA_0, pA_0, temp
 
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L4_M2_BEGIN
+       ble     .Lsgemm_kernel_L4_M2_BEGIN
 
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
 
        INIT4x4
 
        mov     pB, origPB
        asr     counterL, origK, #3             // counterL = counterL / 8
        cmp     counterL, #0
-       ble     sgemm_kernel_L4_M4_40
+       ble     .Lsgemm_kernel_L4_M4_40
 
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
 
        KERNEL4x4_SUB
        KERNEL4x4_SUB
@@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M4_22
+       bgt     .Lsgemm_kernel_L4_M4_22
 
 
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M4_100
+       ble     .Lsgemm_kernel_L4_M4_100
 
-sgemm_kernel_L4_M4_42:
+.Lsgemm_kernel_L4_M4_42:
 
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M4_42
+       bgt     .Lsgemm_kernel_L4_M4_42
 
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
 
 
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L4_M1_BEGIN
+       ble     .Lsgemm_kernel_L4_M1_BEGIN
 
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M2_40
+       ble     .Lsgemm_kernel_L4_M2_40
 
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_22
+       bgt     .Lsgemm_kernel_L4_M2_22
 
 
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M2_100
+       ble     .Lsgemm_kernel_L4_M2_100
 
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_42
+       bgt     .Lsgemm_kernel_L4_M2_42
 
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
 
 
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M1_40
+       ble     .Lsgemm_kernel_L4_M1_40
 
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_22
+       bgt     .Lsgemm_kernel_L4_M1_22
 
 
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M1_100
+       ble     .Lsgemm_kernel_L4_M1_100
 
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_42
+       bgt     .Lsgemm_kernel_L4_M1_42
 
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
 
        lsl     temp, origK, #4 
        add     origPB, origPB, temp            // B = B + K * 4 * 4
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     sgemm_kernel_L4_BEGIN
+       bgt     .Lsgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     sgemm_kernel_L999
+       ble     .Lsgemm_kernel_L999
 
        tst     counterJ , #2
-       ble     sgemm_kernel_L1_BEGIN
+       ble     .Lsgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 
 
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     sgemm_kernel_L2_M2_BEGIN
+       ble     .Lsgemm_kernel_L2_M2_BEGIN
 
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M4_40
+       ble     .Lsgemm_kernel_L2_M4_40
        .align 5
 
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_22
+       bgt     .Lsgemm_kernel_L2_M4_22
 
 
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M4_100
+       ble     .Lsgemm_kernel_L2_M4_100
 
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_42
+       bgt     .Lsgemm_kernel_L2_M4_42
 
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L2_M4_20
+       bgt     .Lsgemm_kernel_L2_M4_20
 
 
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L2_M1_BEGIN
+       ble     .Lsgemm_kernel_L2_M1_BEGIN
 
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     sgemm_kernel_L2_M2_40
+       ble     .Lsgemm_kernel_L2_M2_40
 
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_22
+       bgt     .Lsgemm_kernel_L2_M2_22
 
 
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M2_100
+       ble     .Lsgemm_kernel_L2_M2_100
 
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_42
+       bgt     .Lsgemm_kernel_L2_M2_42
 
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
 
 
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     sgemm_kernel_L2_M1_40
+       ble     .Lsgemm_kernel_L2_M1_40
 
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_22
+       bgt     .Lsgemm_kernel_L2_M1_22
 
 
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M1_100
+       ble     .Lsgemm_kernel_L2_M1_100
 
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_42
+       bgt     .Lsgemm_kernel_L2_M1_42
 
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
        add     origPB, origPB, origK, lsl #3   // B = B + K * 2 * 4
 
 /******************************************************************************/
 
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     sgemm_kernel_L999 // done
+       ble     .Lsgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:
 
 
 
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     sgemm_kernel_L1_M2_BEGIN
+       ble     .Lsgemm_kernel_L1_M2_BEGIN
 
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M4_40
+       ble     .Lsgemm_kernel_L1_M4_40
        .align 5
 
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_22
+       bgt     .Lsgemm_kernel_L1_M4_22
 
 
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M4_100
+       ble     .Lsgemm_kernel_L1_M4_100
 
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_42
+       bgt     .Lsgemm_kernel_L1_M4_42
 
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L1_M4_20
+       bgt     .Lsgemm_kernel_L1_M4_20
 
 
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L1_M1_BEGIN
+       ble     .Lsgemm_kernel_L1_M1_BEGIN
 
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M2_40
+       ble     .Lsgemm_kernel_L1_M2_40
 
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_22
+       bgt     .Lsgemm_kernel_L1_M2_22
 
 
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M2_100
+       ble     .Lsgemm_kernel_L1_M2_100
 
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_42
+       bgt     .Lsgemm_kernel_L1_M2_42
 
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
 
 
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M1_40
+       ble     .Lsgemm_kernel_L1_M1_40
 
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_22
+       bgt     .Lsgemm_kernel_L1_M1_22
 
 
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M1_100
+       ble     .Lsgemm_kernel_L1_M1_100
 
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_42
+       bgt     .Lsgemm_kernel_L1_M1_42
 
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
 
 
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index bd47bed..6ba64dd 100644 (file)
@@ -1263,7 +1263,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        PROLOGUE
 
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
 
        .align 5
        add     sp, sp, #-(11 * 16)
@@ -1291,12 +1291,12 @@ sgemm_kernel_begin:
        mov     counterJ, origN
        asr     counterJ, counterJ, #3          // J = J / 8
        cmp     counterJ, #0
-       ble     sgemm_kernel_L4_BEGIN
+       ble     .Lsgemm_kernel_L4_BEGIN
 
 /******************************************************************************/
 /******************************************************************************/
 
-sgemm_kernel_L8_BEGIN:
+.Lsgemm_kernel_L8_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #3
 
@@ -1304,156 +1304,156 @@ sgemm_kernel_L8_BEGIN:
 
 /******************************************************************************/
 
-sgemm_kernel_L8_M8_BEGIN:
+.Lsgemm_kernel_L8_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     sgemm_kernel_L8_M4_BEGIN
+       ble     .Lsgemm_kernel_L8_M4_BEGIN
 
-sgemm_kernel_L8_M8_20:
+.Lsgemm_kernel_L8_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L8_M8_32
+       blt     .Lsgemm_kernel_L8_M8_32
 
        KERNEL8x8_I                             // do one in the K
        KERNEL8x8_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L8_M8_22a
+       ble     .Lsgemm_kernel_L8_M8_22a
        .align 5
 
-sgemm_kernel_L8_M8_22:
+.Lsgemm_kernel_L8_M8_22:
 
        KERNEL8x8_M1
        KERNEL8x8_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L8_M8_22
+       bgt     .Lsgemm_kernel_L8_M8_22
 
-sgemm_kernel_L8_M8_22a:
+.Lsgemm_kernel_L8_M8_22a:
 
        KERNEL8x8_M1
        KERNEL8x8_E
 
-       b        sgemm_kernel_L8_M8_44
+       b        .Lsgemm_kernel_L8_M8_44
 
-sgemm_kernel_L8_M8_32:
+.Lsgemm_kernel_L8_M8_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L8_M8_40
+       ble     .Lsgemm_kernel_L8_M8_40
 
        KERNEL8x8_I
        KERNEL8x8_E
 
-       b       sgemm_kernel_L8_M8_44
+       b       .Lsgemm_kernel_L8_M8_44
 
-sgemm_kernel_L8_M8_40:
+.Lsgemm_kernel_L8_M8_40:
 
        INIT8x8
 
-sgemm_kernel_L8_M8_44:
+.Lsgemm_kernel_L8_M8_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L8_M8_100
+       ble     .Lsgemm_kernel_L8_M8_100
 
-sgemm_kernel_L8_M8_46:
+.Lsgemm_kernel_L8_M8_46:
 
        KERNEL8x8_SUB
 
-sgemm_kernel_L8_M8_100:
+.Lsgemm_kernel_L8_M8_100:
 
        SAVE8x8
 
-sgemm_kernel_L8_M8_END:
+.Lsgemm_kernel_L8_M8_END:
        subs    counterI, counterI, #1
-       bne     sgemm_kernel_L8_M8_20
+       bne     .Lsgemm_kernel_L8_M8_20
 
 /******************************************************************************/
 
-sgemm_kernel_L8_M4_BEGIN:
+.Lsgemm_kernel_L8_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L8_END
+       ble     .Lsgemm_kernel_L8_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L8_M2_BEGIN
+       ble     .Lsgemm_kernel_L8_M2_BEGIN
 
-sgemm_kernel_L8_M4_20:
+.Lsgemm_kernel_L8_M4_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L8_M4_32
+       blt     .Lsgemm_kernel_L8_M4_32
 
        KERNEL4x8_I                             // do one in the K
        KERNEL4x8_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L8_M4_22a
+       ble     .Lsgemm_kernel_L8_M4_22a
        .align 5
 
-sgemm_kernel_L8_M4_22:
+.Lsgemm_kernel_L8_M4_22:
 
        KERNEL4x8_M1
        KERNEL4x8_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L8_M4_22
+       bgt     .Lsgemm_kernel_L8_M4_22
 
-sgemm_kernel_L8_M4_22a:
+.Lsgemm_kernel_L8_M4_22a:
 
        KERNEL4x8_M1
        KERNEL4x8_E
 
-       b        sgemm_kernel_L8_M4_44
+       b        .Lsgemm_kernel_L8_M4_44
 
-sgemm_kernel_L8_M4_32:
+.Lsgemm_kernel_L8_M4_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L8_M4_40
+       ble     .Lsgemm_kernel_L8_M4_40
 
        KERNEL4x8_I
        KERNEL4x8_E
 
-       b       sgemm_kernel_L8_M4_44
+       b       .Lsgemm_kernel_L8_M4_44
 
-sgemm_kernel_L8_M4_40:
+.Lsgemm_kernel_L8_M4_40:
 
        INIT4x8
 
-sgemm_kernel_L8_M4_44:
+.Lsgemm_kernel_L8_M4_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L8_M4_100
+       ble     .Lsgemm_kernel_L8_M4_100
 
-sgemm_kernel_L8_M4_46:
+.Lsgemm_kernel_L8_M4_46:
 
        KERNEL4x8_SUB
 
-sgemm_kernel_L8_M4_100:
+.Lsgemm_kernel_L8_M4_100:
 
        SAVE4x8
 
-sgemm_kernel_L8_M4_END:
+.Lsgemm_kernel_L8_M4_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L8_M2_BEGIN:
+.Lsgemm_kernel_L8_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L8_END
+       ble     .Lsgemm_kernel_L8_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L8_M1_BEGIN
+       ble     .Lsgemm_kernel_L8_M1_BEGIN
 
-sgemm_kernel_L8_M2_20:
+.Lsgemm_kernel_L8_M2_20:
 
        INIT2x8
 
@@ -1461,9 +1461,9 @@ sgemm_kernel_L8_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L8_M2_40
+       ble     .Lsgemm_kernel_L8_M2_40
 
-sgemm_kernel_L8_M2_22:
+.Lsgemm_kernel_L8_M2_22:
 
        KERNEL2x8_SUB
        KERNEL2x8_SUB
@@ -1476,35 +1476,35 @@ sgemm_kernel_L8_M2_22:
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L8_M2_22
+       bgt     .Lsgemm_kernel_L8_M2_22
 
 
-sgemm_kernel_L8_M2_40:
+.Lsgemm_kernel_L8_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L8_M2_100
+       ble     .Lsgemm_kernel_L8_M2_100
 
-sgemm_kernel_L8_M2_42:
+.Lsgemm_kernel_L8_M2_42:
 
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L8_M2_42
+       bgt     .Lsgemm_kernel_L8_M2_42
 
-sgemm_kernel_L8_M2_100:
+.Lsgemm_kernel_L8_M2_100:
 
        SAVE2x8
 
-sgemm_kernel_L8_M2_END:
+.Lsgemm_kernel_L8_M2_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L8_M1_BEGIN:
+.Lsgemm_kernel_L8_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L8_END
+       ble     .Lsgemm_kernel_L8_END
 
-sgemm_kernel_L8_M1_20:
+.Lsgemm_kernel_L8_M1_20:
 
        INIT1x8
 
@@ -1512,9 +1512,9 @@ sgemm_kernel_L8_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L8_M1_40
+       ble     .Lsgemm_kernel_L8_M1_40
 
-sgemm_kernel_L8_M1_22:
+.Lsgemm_kernel_L8_M1_22:
        KERNEL1x8_SUB
        KERNEL1x8_SUB
        KERNEL1x8_SUB
@@ -1526,43 +1526,43 @@ sgemm_kernel_L8_M1_22:
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L8_M1_22
+       bgt     .Lsgemm_kernel_L8_M1_22
 
 
-sgemm_kernel_L8_M1_40:
+.Lsgemm_kernel_L8_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L8_M1_100
+       ble     .Lsgemm_kernel_L8_M1_100
 
-sgemm_kernel_L8_M1_42:
+.Lsgemm_kernel_L8_M1_42:
 
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L8_M1_42
+       bgt     .Lsgemm_kernel_L8_M1_42
 
-sgemm_kernel_L8_M1_100:
+.Lsgemm_kernel_L8_M1_100:
 
        SAVE1x8
 
-sgemm_kernel_L8_END:
+.Lsgemm_kernel_L8_END:
        lsl     temp, origK, #5                 // B = B + K * 4 * 8
        add     origPB, origPB, temp
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     sgemm_kernel_L8_BEGIN
+       bgt     .Lsgemm_kernel_L8_BEGIN
 
 /******************************************************************************/
 /******************************************************************************/
 
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #7
-       ble     sgemm_kernel_L999
+       ble     .Lsgemm_kernel_L999
 
        tst     counterJ , #4
-       ble     sgemm_kernel_L2_BEGIN
+       ble     .Lsgemm_kernel_L2_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1572,156 +1572,156 @@ sgemm_kernel_L4_BEGIN:
 
 /******************************************************************************/
 
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     sgemm_kernel_L4_M4_BEGIN
+       ble     .Lsgemm_kernel_L4_M4_BEGIN
 
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M8_32
+       blt     .Lsgemm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M8_22a
+       ble     .Lsgemm_kernel_L4_M8_22a
        .align 5
 
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M8_22
+       bgt     .Lsgemm_kernel_L4_M8_22
 
-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        sgemm_kernel_L4_M8_44
+       b        .Lsgemm_kernel_L4_M8_44
 
-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M8_40
+       ble     .Lsgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_E
 
-       b       sgemm_kernel_L4_M8_44
+       b       .Lsgemm_kernel_L4_M8_44
 
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
 
        INIT8x4
 
-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M8_100
+       ble     .Lsgemm_kernel_L4_M8_100
 
-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
 
        SAVE8x4
 
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     sgemm_kernel_L4_M8_20
+       bne     .Lsgemm_kernel_L4_M8_20
 
 /******************************************************************************/
 
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L4_M2_BEGIN
+       ble     .Lsgemm_kernel_L4_M2_BEGIN
 
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
 
        asr     counterL , origK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     sgemm_kernel_L4_M4_32
+       blt     .Lsgemm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     sgemm_kernel_L4_M4_22a
+       ble     .Lsgemm_kernel_L4_M4_22a
        .align 5
 
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M4_22
+       bgt     .Lsgemm_kernel_L4_M4_22
 
-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        sgemm_kernel_L4_M4_44
+       b        .Lsgemm_kernel_L4_M4_44
 
-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     sgemm_kernel_L4_M4_40
+       ble     .Lsgemm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       sgemm_kernel_L4_M4_44
+       b       .Lsgemm_kernel_L4_M4_44
 
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
 
        ands    counterL , origK, #1
-       ble     sgemm_kernel_L4_M4_100
+       ble     .Lsgemm_kernel_L4_M4_100
 
-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
 
        SAVE4x4
 
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L4_M1_BEGIN
+       ble     .Lsgemm_kernel_L4_M1_BEGIN
 
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1729,9 +1729,9 @@ sgemm_kernel_L4_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M2_40
+       ble     .Lsgemm_kernel_L4_M2_40
 
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1744,35 +1744,35 @@ sgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_22
+       bgt     .Lsgemm_kernel_L4_M2_22
 
 
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M2_100
+       ble     .Lsgemm_kernel_L4_M2_100
 
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M2_42
+       bgt     .Lsgemm_kernel_L4_M2_42
 
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L4_END
+       ble     .Lsgemm_kernel_L4_END
 
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1780,9 +1780,9 @@ sgemm_kernel_L4_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L4_M1_40
+       ble     .Lsgemm_kernel_L4_M1_40
 
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1794,39 +1794,39 @@ sgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_22
+       bgt     .Lsgemm_kernel_L4_M1_22
 
 
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L4_M1_100
+       ble     .Lsgemm_kernel_L4_M1_100
 
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L4_M1_42
+       bgt     .Lsgemm_kernel_L4_M1_42
 
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 4 * 4
 
 /******************************************************************************/
 /******************************************************************************/
 
-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     sgemm_kernel_L999
+       ble     .Lsgemm_kernel_L999
 
        tst     counterJ , #2
-       ble     sgemm_kernel_L1_BEGIN
+       ble     .Lsgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1836,14 +1836,14 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 /******************************************************************************/
 
-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI,#0
-       ble     sgemm_kernel_L2_M4_BEGIN
+       ble     .Lsgemm_kernel_L2_M4_BEGIN
 
-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1851,10 +1851,10 @@ sgemm_kernel_L2_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M8_40
+       ble     .Lsgemm_kernel_L2_M8_40
        .align 5
 
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1866,42 +1866,42 @@ sgemm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M8_22
+       bgt     .Lsgemm_kernel_L2_M8_22
 
 
-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M8_100
+       ble     .Lsgemm_kernel_L2_M8_100
 
-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M8_42
+       bgt     .Lsgemm_kernel_L2_M8_42
 
-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
 
        SAVE8x2
 
-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L2_M8_20
+       bgt     .Lsgemm_kernel_L2_M8_20
 
 /******************************************************************************/
 
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L2_M2_BEGIN
+       ble     .Lsgemm_kernel_L2_M2_BEGIN
 
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1909,10 +1909,10 @@ sgemm_kernel_L2_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     sgemm_kernel_L2_M4_40
+       ble     .Lsgemm_kernel_L2_M4_40
        .align 5
 
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1924,39 +1924,39 @@ sgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_22
+       bgt     .Lsgemm_kernel_L2_M4_22
 
 
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M4_100
+       ble     .Lsgemm_kernel_L2_M4_100
 
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M4_42
+       bgt     .Lsgemm_kernel_L2_M4_42
 
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L2_M1_BEGIN
+       ble     .Lsgemm_kernel_L2_M1_BEGIN
 
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1964,9 +1964,9 @@ sgemm_kernel_L2_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     sgemm_kernel_L2_M2_40
+       ble     .Lsgemm_kernel_L2_M2_40
 
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1979,35 +1979,35 @@ sgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_22
+       bgt     .Lsgemm_kernel_L2_M2_22
 
 
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M2_100
+       ble     .Lsgemm_kernel_L2_M2_100
 
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M2_42
+       bgt     .Lsgemm_kernel_L2_M2_42
 
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L2_END
+       ble     .Lsgemm_kernel_L2_END
 
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -2015,9 +2015,9 @@ sgemm_kernel_L2_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     sgemm_kernel_L2_M1_40
+       ble     .Lsgemm_kernel_L2_M1_40
 
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -2029,37 +2029,37 @@ sgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_22
+       bgt     .Lsgemm_kernel_L2_M1_22
 
 
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L2_M1_100
+       ble     .Lsgemm_kernel_L2_M1_100
 
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L2_M1_42
+       bgt     .Lsgemm_kernel_L2_M1_42
 
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
 
        add     origPB, origPB, origK, lsl #3   // B = B + K * 2 * 4
 
 /******************************************************************************/
 /******************************************************************************/
 
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     sgemm_kernel_L999 // done
+       ble     .Lsgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -2069,14 +2069,14 @@ sgemm_kernel_L1_BEGIN:
 
 /******************************************************************************/
 
-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3
        cmp     counterI, #0
-       ble     sgemm_kernel_L1_M4_BEGIN
+       ble     .Lsgemm_kernel_L1_M4_BEGIN
 
-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -2084,10 +2084,10 @@ sgemm_kernel_L1_M8_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M8_40
+       ble     .Lsgemm_kernel_L1_M8_40
        .align 5
 
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -2099,42 +2099,42 @@ sgemm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M8_22
+       bgt     .Lsgemm_kernel_L1_M8_22
 
 
-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M8_100
+       ble     .Lsgemm_kernel_L1_M8_100
 
-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M8_42
+       bgt     .Lsgemm_kernel_L1_M8_42
 
-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
 
        SAVE8x1
 
-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     sgemm_kernel_L1_M8_20
+       bgt     .Lsgemm_kernel_L1_M8_20
 
 /******************************************************************************/
 
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #4
-       ble     sgemm_kernel_L1_M2_BEGIN
+       ble     .Lsgemm_kernel_L1_M2_BEGIN
 
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -2142,10 +2142,10 @@ sgemm_kernel_L1_M4_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M4_40
+       ble     .Lsgemm_kernel_L1_M4_40
        .align 5
 
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -2157,39 +2157,39 @@ sgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_22
+       bgt     .Lsgemm_kernel_L1_M4_22
 
 
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M4_100
+       ble     .Lsgemm_kernel_L1_M4_100
 
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M4_42
+       bgt     .Lsgemm_kernel_L1_M4_42
 
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     sgemm_kernel_L1_M1_BEGIN
+       ble     .Lsgemm_kernel_L1_M1_BEGIN
 
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -2197,9 +2197,9 @@ sgemm_kernel_L1_M2_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M2_40
+       ble     .Lsgemm_kernel_L1_M2_40
 
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -2212,35 +2212,35 @@ sgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_22
+       bgt     .Lsgemm_kernel_L1_M2_22
 
 
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M2_100
+       ble     .Lsgemm_kernel_L1_M2_100
 
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M2_42
+       bgt     .Lsgemm_kernel_L1_M2_42
 
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     sgemm_kernel_L1_END
+       ble     .Lsgemm_kernel_L1_END
 
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -2248,9 +2248,9 @@ sgemm_kernel_L1_M1_20:
 
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     sgemm_kernel_L1_M1_40
+       ble     .Lsgemm_kernel_L1_M1_40
 
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2262,30 +2262,30 @@ sgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_22
+       bgt     .Lsgemm_kernel_L1_M1_22
 
 
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     sgemm_kernel_L1_M1_100
+       ble     .Lsgemm_kernel_L1_M1_100
 
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     sgemm_kernel_L1_M1_42
+       bgt     .Lsgemm_kernel_L1_M1_42
 
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
 
 /******************************************************************************/
 
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 77e0510..985a0a9 100644 (file)
@@ -1035,7 +1035,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        PROLOGUE
 
-strmm_kernel_begin:
+.Lstrmm_kernel_begin:
 
        .align 5
        add     sp, sp, #-(11 * 16)
@@ -1066,11 +1066,11 @@ strmm_kernel_begin:
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     strmm_kernel_L2_BEGIN
+       ble     .Lstrmm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1084,15 +1084,15 @@ strmm_kernel_L4_BEGIN:
 #endif
        mov     pA, origPA                      // pA = start of A array
 
-strmm_kernel_L4_M16_BEGIN:
+.Lstrmm_kernel_L4_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     strmm_kernel_L4_M8_BEGIN
+       ble     .Lstrmm_kernel_L4_M8_BEGIN
 
        .align 5
-strmm_kernel_L4_M16_20:
+.Lstrmm_kernel_L4_M16_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1114,7 +1114,7 @@ strmm_kernel_L4_M16_20:
 
        asr     counterL , tempK, #3
        cmp     counterL , #2
-       blt     strmm_kernel_L4_M16_32
+       blt     .Lstrmm_kernel_L4_M16_32
 
        KERNEL16x4_I
        KERNEL16x4_M2
@@ -1126,10 +1126,10 @@ strmm_kernel_L4_M16_20:
        KERNEL16x4_M2
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L4_M16_22a
+       ble     .Lstrmm_kernel_L4_M16_22a
 
        .align 5
-strmm_kernel_L4_M16_22:
+.Lstrmm_kernel_L4_M16_22:
 
        KERNEL16x4_M1
        KERNEL16x4_M2
@@ -1141,10 +1141,10 @@ strmm_kernel_L4_M16_22:
        KERNEL16x4_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M16_22
+       bgt     .Lstrmm_kernel_L4_M16_22
 
        .align 5
-strmm_kernel_L4_M16_22a:
+.Lstrmm_kernel_L4_M16_22a:
 
        KERNEL16x4_M1
        KERNEL16x4_M2
@@ -1155,13 +1155,13 @@ strmm_kernel_L4_M16_22a:
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b        strmm_kernel_L4_M16_44
+       b        .Lstrmm_kernel_L4_M16_44
 
        .align 5
-strmm_kernel_L4_M16_32:
+.Lstrmm_kernel_L4_M16_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L4_M16_40
+       ble     .Lstrmm_kernel_L4_M16_40
 
        KERNEL16x4_I
        KERNEL16x4_M2
@@ -1172,25 +1172,25 @@ strmm_kernel_L4_M16_32:
        KERNEL16x4_M1
        KERNEL16x4_E
 
-       b       strmm_kernel_L4_M16_44
+       b       .Lstrmm_kernel_L4_M16_44
 
-strmm_kernel_L4_M16_40:
+.Lstrmm_kernel_L4_M16_40:
 
        INIT16x4
 
-strmm_kernel_L4_M16_44:
+.Lstrmm_kernel_L4_M16_44:
 
        ands    counterL , tempK, #7
-       ble     strmm_kernel_L4_M16_100
+       ble     .Lstrmm_kernel_L4_M16_100
 
        .align 5
-strmm_kernel_L4_M16_46:
+.Lstrmm_kernel_L4_M16_46:
 
        KERNEL16x4_SUB
        subs    counterL, counterL, #1
-       bne     strmm_kernel_L4_M16_46
+       bne     .Lstrmm_kernel_L4_M16_46
 
-strmm_kernel_L4_M16_100:
+.Lstrmm_kernel_L4_M16_100:
 
        SAVE16x4
 
@@ -1213,22 +1213,22 @@ strmm_kernel_L4_M16_100:
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
-strmm_kernel_L4_M16_END:
+.Lstrmm_kernel_L4_M16_END:
        subs    counterI, counterI, #1
-       bne     strmm_kernel_L4_M16_20
+       bne     .Lstrmm_kernel_L4_M16_20
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L4_M8_BEGIN:
+.Lstrmm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #15
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
        tst     counterI, #8
-       ble     strmm_kernel_L4_M4_BEGIN
+       ble     .Lstrmm_kernel_L4_M4_BEGIN
 
-strmm_kernel_L4_M8_20:
+.Lstrmm_kernel_L4_M8_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1250,54 +1250,54 @@ strmm_kernel_L4_M8_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L4_M8_32
+       blt     .Lstrmm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L4_M8_22a
+       ble     .Lstrmm_kernel_L4_M8_22a
        .align 5
 
-strmm_kernel_L4_M8_22:
+.Lstrmm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M8_22
+       bgt     .Lstrmm_kernel_L4_M8_22
 
-strmm_kernel_L4_M8_22a:
+.Lstrmm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        strmm_kernel_L4_M8_44
+       b        .Lstrmm_kernel_L4_M8_44
 
-strmm_kernel_L4_M8_32:
+.Lstrmm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L4_M8_40
+       ble     .Lstrmm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_E
 
-       b       strmm_kernel_L4_M8_44
+       b       .Lstrmm_kernel_L4_M8_44
 
-strmm_kernel_L4_M8_40:
+.Lstrmm_kernel_L4_M8_40:
 
        INIT8x4
 
-strmm_kernel_L4_M8_44:
+.Lstrmm_kernel_L4_M8_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L4_M8_100
+       ble     .Lstrmm_kernel_L4_M8_100
 
-strmm_kernel_L4_M8_46:
+.Lstrmm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
-strmm_kernel_L4_M8_100:
+.Lstrmm_kernel_L4_M8_100:
 
        SAVE8x4
 
@@ -1317,20 +1317,20 @@ strmm_kernel_L4_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-strmm_kernel_L4_M8_END:
+.Lstrmm_kernel_L4_M8_END:
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L4_M2_BEGIN
+       ble     .Lstrmm_kernel_L4_M2_BEGIN
 
-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1350,54 +1350,54 @@ strmm_kernel_L4_M4_20:
 #endif
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L4_M4_32
+       blt     .Lstrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L4_M4_22a
+       ble     .Lstrmm_kernel_L4_M4_22a
        .align 5
 
-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M4_22
+       bgt     .Lstrmm_kernel_L4_M4_22
 
-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        strmm_kernel_L4_M4_44
+       b        .Lstrmm_kernel_L4_M4_44
 
-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L4_M4_40
+       ble     .Lstrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       strmm_kernel_L4_M4_44
+       b       .Lstrmm_kernel_L4_M4_44
 
-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L4_M4_100
+       ble     .Lstrmm_kernel_L4_M4_100
 
-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -1415,20 +1415,20 @@ strmm_kernel_L4_M4_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L4_M1_BEGIN
+       ble     .Lstrmm_kernel_L4_M1_BEGIN
 
-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1451,9 +1451,9 @@ strmm_kernel_L4_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L4_M2_40
+       ble     .Lstrmm_kernel_L4_M2_40
 
-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1466,22 +1466,22 @@ strmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M2_22
+       bgt     .Lstrmm_kernel_L4_M2_22
 
 
-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L4_M2_100
+       ble     .Lstrmm_kernel_L4_M2_100
 
-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M2_42
+       bgt     .Lstrmm_kernel_L4_M2_42
 
-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -1500,15 +1500,15 @@ strmm_kernel_L4_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:
 
 
-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1531,9 +1531,9 @@ strmm_kernel_L4_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L4_M1_40
+       ble     .Lstrmm_kernel_L4_M1_40
 
-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1545,22 +1545,22 @@ strmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M1_22
+       bgt     .Lstrmm_kernel_L4_M1_22
 
 
-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L4_M1_100
+       ble     .Lstrmm_kernel_L4_M1_100
 
-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M1_42
+       bgt     .Lstrmm_kernel_L4_M1_42
 
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -1579,26 +1579,26 @@ strmm_kernel_L4_M1_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #1
 #endif
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 4 * 4
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     strmm_kernel_L4_BEGIN
+       bgt     .Lstrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-strmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     strmm_kernel_L999
+       ble     .Lstrmm_kernel_L999
 
        tst     counterJ , #2
-       ble     strmm_kernel_L1_BEGIN
+       ble     .Lstrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1609,14 +1609,14 @@ strmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 #endif
        mov     pA, origPA                      // pA = A
 
-strmm_kernel_L2_M16_BEGIN:
+.Lstrmm_kernel_L2_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI,#0
-       ble     strmm_kernel_L2_M8_BEGIN
+       ble     .Lstrmm_kernel_L2_M8_BEGIN
 
-strmm_kernel_L2_M16_20:
+.Lstrmm_kernel_L2_M16_20:
 
        INIT16x2
 
@@ -1640,10 +1640,10 @@ strmm_kernel_L2_M16_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     strmm_kernel_L2_M16_40
+       ble     .Lstrmm_kernel_L2_M16_40
        .align 5
 
-strmm_kernel_L2_M16_22:
+.Lstrmm_kernel_L2_M16_22:
        KERNEL16x2_SUB
        KERNEL16x2_SUB
        KERNEL16x2_SUB
@@ -1655,22 +1655,22 @@ strmm_kernel_L2_M16_22:
        KERNEL16x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M16_22
+       bgt     .Lstrmm_kernel_L2_M16_22
 
 
-strmm_kernel_L2_M16_40:
+.Lstrmm_kernel_L2_M16_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M16_100
+       ble     .Lstrmm_kernel_L2_M16_100
 
-strmm_kernel_L2_M16_42:
+.Lstrmm_kernel_L2_M16_42:
 
        KERNEL16x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M16_42
+       bgt     .Lstrmm_kernel_L2_M16_42
 
-strmm_kernel_L2_M16_100:
+.Lstrmm_kernel_L2_M16_100:
 
        SAVE16x2
 
@@ -1690,22 +1690,22 @@ strmm_kernel_L2_M16_100:
        add     tempOffset, tempOffset, #16
 #endif
 
-strmm_kernel_L2_M16_END:
+.Lstrmm_kernel_L2_M16_END:
 
        subs    counterI, counterI, #1
-       bgt     strmm_kernel_L2_M16_20
+       bgt     .Lstrmm_kernel_L2_M16_20
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L2_M8_BEGIN:
+.Lstrmm_kernel_L2_M8_BEGIN:
        mov     counterI, origM
        tst     counterI , #15
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
        tst     counterI, #8
-       ble     strmm_kernel_L2_M4_BEGIN
+       ble     .Lstrmm_kernel_L2_M4_BEGIN
 
-strmm_kernel_L2_M8_20:
+.Lstrmm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -1729,10 +1729,10 @@ strmm_kernel_L2_M8_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     strmm_kernel_L2_M8_40
+       ble     .Lstrmm_kernel_L2_M8_40
        .align 5
 
-strmm_kernel_L2_M8_22:
+.Lstrmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -1744,22 +1744,22 @@ strmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M8_22
+       bgt     .Lstrmm_kernel_L2_M8_22
 
 
-strmm_kernel_L2_M8_40:
+.Lstrmm_kernel_L2_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M8_100
+       ble     .Lstrmm_kernel_L2_M8_100
 
-strmm_kernel_L2_M8_42:
+.Lstrmm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M8_42
+       bgt     .Lstrmm_kernel_L2_M8_42
 
-strmm_kernel_L2_M8_100:
+.Lstrmm_kernel_L2_M8_100:
 
        SAVE8x2
 
@@ -1779,19 +1779,19 @@ strmm_kernel_L2_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-strmm_kernel_L2_M8_END:
+.Lstrmm_kernel_L2_M8_END:
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L2_M2_BEGIN
+       ble     .Lstrmm_kernel_L2_M2_BEGIN
 
-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1814,10 +1814,10 @@ strmm_kernel_L2_M4_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     strmm_kernel_L2_M4_40
+       ble     .Lstrmm_kernel_L2_M4_40
        .align 5
 
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1829,22 +1829,22 @@ strmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M4_22
+       bgt     .Lstrmm_kernel_L2_M4_22
 
 
-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M4_100
+       ble     .Lstrmm_kernel_L2_M4_100
 
-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M4_42
+       bgt     .Lstrmm_kernel_L2_M4_42
 
-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -1863,21 +1863,21 @@ strmm_kernel_L2_M4_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:
 
 //------------------------------------------------------------------------------
 
 
-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L2_M1_BEGIN
+       ble     .Lstrmm_kernel_L2_M1_BEGIN
 
-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1900,9 +1900,9 @@ strmm_kernel_L2_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     strmm_kernel_L2_M2_40
+       ble     .Lstrmm_kernel_L2_M2_40
 
-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1915,22 +1915,22 @@ strmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M2_22
+       bgt     .Lstrmm_kernel_L2_M2_22
 
 
-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M2_100
+       ble     .Lstrmm_kernel_L2_M2_100
 
-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M2_42
+       bgt     .Lstrmm_kernel_L2_M2_42
 
-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:
 
        SAVE2x2
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1949,15 +1949,15 @@ strmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:
 
 
-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1980,9 +1980,9 @@ strmm_kernel_L2_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     strmm_kernel_L2_M1_40
+       ble     .Lstrmm_kernel_L2_M1_40
 
-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1994,22 +1994,22 @@ strmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M1_22
+       bgt     .Lstrmm_kernel_L2_M1_22
 
 
-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M1_100
+       ble     .Lstrmm_kernel_L2_M1_100
 
-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M1_42
+       bgt     .Lstrmm_kernel_L2_M1_42
 
-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -2028,7 +2028,7 @@ strmm_kernel_L2_M1_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #1
 #endif
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -2036,11 +2036,11 @@ strmm_kernel_L2_END:
 
 /******************************************************************************/
 
-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     strmm_kernel_L999 // done
+       ble     .Lstrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -2051,14 +2051,14 @@ strmm_kernel_L1_BEGIN:
 #endif
        mov     pA, origPA                      // pA = A
 
-strmm_kernel_L1_M16_BEGIN:
+.Lstrmm_kernel_L1_M16_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #4          // counterI = counterI / 16
        cmp     counterI, #0
-       ble     strmm_kernel_L1_M8_BEGIN
+       ble     .Lstrmm_kernel_L1_M8_BEGIN
 
-strmm_kernel_L1_M16_20:
+.Lstrmm_kernel_L1_M16_20:
 
        INIT16x1
 
@@ -2082,10 +2082,10 @@ strmm_kernel_L1_M16_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M16_40
+       ble     .Lstrmm_kernel_L1_M16_40
        .align 5
 
-strmm_kernel_L1_M16_22:
+.Lstrmm_kernel_L1_M16_22:
        KERNEL16x1_SUB
        KERNEL16x1_SUB
        KERNEL16x1_SUB
@@ -2097,22 +2097,22 @@ strmm_kernel_L1_M16_22:
        KERNEL16x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M16_22
+       bgt     .Lstrmm_kernel_L1_M16_22
 
 
-strmm_kernel_L1_M16_40:
+.Lstrmm_kernel_L1_M16_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M16_100
+       ble     .Lstrmm_kernel_L1_M16_100
 
-strmm_kernel_L1_M16_42:
+.Lstrmm_kernel_L1_M16_42:
 
        KERNEL16x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M16_42
+       bgt     .Lstrmm_kernel_L1_M16_42
 
-strmm_kernel_L1_M16_100:
+.Lstrmm_kernel_L1_M16_100:
 
        SAVE16x1
 
@@ -2132,23 +2132,23 @@ strmm_kernel_L1_M16_100:
        add     tempOffset, tempOffset, #16
 #endif
 
-strmm_kernel_L1_M16_END:
+.Lstrmm_kernel_L1_M16_END:
 
        subs    counterI, counterI, #1
-       bgt     strmm_kernel_L1_M16_20
+       bgt     .Lstrmm_kernel_L1_M16_20
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L1_M8_BEGIN:
+.Lstrmm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #15
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
        tst     counterI, #8
-       ble     strmm_kernel_L1_M4_BEGIN
+       ble     .Lstrmm_kernel_L1_M4_BEGIN
 
-strmm_kernel_L1_M8_20:
+.Lstrmm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -2172,10 +2172,10 @@ strmm_kernel_L1_M8_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M8_40
+       ble     .Lstrmm_kernel_L1_M8_40
        .align 5
 
-strmm_kernel_L1_M8_22:
+.Lstrmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -2187,22 +2187,22 @@ strmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M8_22
+       bgt     .Lstrmm_kernel_L1_M8_22
 
 
-strmm_kernel_L1_M8_40:
+.Lstrmm_kernel_L1_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M8_100
+       ble     .Lstrmm_kernel_L1_M8_100
 
-strmm_kernel_L1_M8_42:
+.Lstrmm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M8_42
+       bgt     .Lstrmm_kernel_L1_M8_42
 
-strmm_kernel_L1_M8_100:
+.Lstrmm_kernel_L1_M8_100:
 
        SAVE8x1
 
@@ -2222,19 +2222,19 @@ strmm_kernel_L1_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-strmm_kernel_L1_M8_END:
+.Lstrmm_kernel_L1_M8_END:
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L1_M2_BEGIN
+       ble     .Lstrmm_kernel_L1_M2_BEGIN
 
-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -2257,10 +2257,10 @@ strmm_kernel_L1_M4_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M4_40
+       ble     .Lstrmm_kernel_L1_M4_40
        .align 5
 
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -2272,22 +2272,22 @@ strmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M4_22
+       bgt     .Lstrmm_kernel_L1_M4_22
 
 
-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M4_100
+       ble     .Lstrmm_kernel_L1_M4_100
 
-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M4_42
+       bgt     .Lstrmm_kernel_L1_M4_42
 
-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -2306,20 +2306,20 @@ strmm_kernel_L1_M4_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:
 
 //------------------------------------------------------------------------------
 
-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L1_M1_BEGIN
+       ble     .Lstrmm_kernel_L1_M1_BEGIN
 
-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -2342,9 +2342,9 @@ strmm_kernel_L1_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M2_40
+       ble     .Lstrmm_kernel_L1_M2_40
 
-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -2357,22 +2357,22 @@ strmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M2_22
+       bgt     .Lstrmm_kernel_L1_M2_22
 
 
-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M2_100
+       ble     .Lstrmm_kernel_L1_M2_100
 
-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M2_42
+       bgt     .Lstrmm_kernel_L1_M2_42
 
-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -2391,15 +2391,15 @@ strmm_kernel_L1_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:
 
 
-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -2422,9 +2422,9 @@ strmm_kernel_L1_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M1_40
+       ble     .Lstrmm_kernel_L1_M1_40
 
-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2436,28 +2436,28 @@ strmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M1_22
+       bgt     .Lstrmm_kernel_L1_M1_22
 
 
-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M1_100
+       ble     .Lstrmm_kernel_L1_M1_100
 
-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M1_42
+       bgt     .Lstrmm_kernel_L1_M1_42
 
-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:
 
-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index eeb3e6e..5f7818c 100644 (file)
@@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        PROLOGUE
 
-strmm_kernel_begin:
+.Lstrmm_kernel_begin:
 
        .align 5
        add     sp, sp, #-(11 * 16)
@@ -539,11 +539,11 @@ strmm_kernel_begin:
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     strmm_kernel_L2_BEGIN
+       ble     .Lstrmm_kernel_L2_BEGIN
 
 /******************************************************************************/
 
-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #2
 
@@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     strmm_kernel_L4_M2_BEGIN
+       ble     .Lstrmm_kernel_L4_M2_BEGIN
 
-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L4_M4_32
+       blt     .Lstrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L4_M4_22a
+       ble     .Lstrmm_kernel_L4_M4_22a
        .align 5
 
-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M4_22
+       bgt     .Lstrmm_kernel_L4_M4_22
 
-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        strmm_kernel_L4_M4_44
+       b        .Lstrmm_kernel_L4_M4_44
 
-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L4_M4_40
+       ble     .Lstrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       strmm_kernel_L4_M4_44
+       b       .Lstrmm_kernel_L4_M4_44
 
-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L4_M4_100
+       ble     .Lstrmm_kernel_L4_M4_100
 
-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     strmm_kernel_L4_M4_20
+       bne     .Lstrmm_kernel_L4_M4_20
 
-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L4_M1_BEGIN
+       ble     .Lstrmm_kernel_L4_M1_BEGIN
 
-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L4_M2_40
+       ble     .Lstrmm_kernel_L4_M2_40
 
-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M2_22
+       bgt     .Lstrmm_kernel_L4_M2_22
 
 
-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L4_M2_100
+       ble     .Lstrmm_kernel_L4_M2_100
 
-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M2_42
+       bgt     .Lstrmm_kernel_L4_M2_42
 
-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
 #endif
 
 
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:
 
 
-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L4_M1_40
+       ble     .Lstrmm_kernel_L4_M1_40
 
-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M1_22
+       bgt     .Lstrmm_kernel_L4_M1_22
 
 
-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L4_M1_100
+       ble     .Lstrmm_kernel_L4_M1_100
 
-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M1_42
+       bgt     .Lstrmm_kernel_L4_M1_42
 
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
 #endif
 
 
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 4 * 4
 
 #if !defined(LEFT)
@@ -825,19 +825,19 @@ strmm_kernel_L4_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     strmm_kernel_L4_BEGIN
+       bgt     .Lstrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-strmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     strmm_kernel_L999
+       ble     .Lstrmm_kernel_L999
 
        tst     counterJ , #2
-       ble     strmm_kernel_L1_BEGIN
+       ble     .Lstrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     strmm_kernel_L2_M2_BEGIN
+       ble     .Lstrmm_kernel_L2_M2_BEGIN
 
-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     strmm_kernel_L2_M4_40
+       ble     .Lstrmm_kernel_L2_M4_40
        .align 5
 
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M4_22
+       bgt     .Lstrmm_kernel_L2_M4_22
 
 
-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M4_100
+       ble     .Lstrmm_kernel_L2_M4_100
 
-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M4_42
+       bgt     .Lstrmm_kernel_L2_M4_42
 
-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     strmm_kernel_L2_M4_20
+       bgt     .Lstrmm_kernel_L2_M4_20
 
 
-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L2_M1_BEGIN
+       ble     .Lstrmm_kernel_L2_M1_BEGIN
 
-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     strmm_kernel_L2_M2_40
+       ble     .Lstrmm_kernel_L2_M2_40
 
-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M2_22
+       bgt     .Lstrmm_kernel_L2_M2_22
 
 
-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M2_100
+       ble     .Lstrmm_kernel_L2_M2_100
 
-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M2_42
+       bgt     .Lstrmm_kernel_L2_M2_42
 
-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:
 
        SAVE2x2
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:
 
 
-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     strmm_kernel_L2_M1_40
+       ble     .Lstrmm_kernel_L2_M1_40
 
-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M1_22
+       bgt     .Lstrmm_kernel_L2_M1_22
 
 
-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M1_100
+       ble     .Lstrmm_kernel_L2_M1_100
 
-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M1_42
+       bgt     .Lstrmm_kernel_L2_M1_42
 
-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:
 
 /******************************************************************************/
 
-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     strmm_kernel_L999 // done
+       ble     .Lstrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:
 
        mov     pA, origPA                      // pA = A
 
-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     strmm_kernel_L1_M2_BEGIN
+       ble     .Lstrmm_kernel_L1_M2_BEGIN
 
-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M4_40
+       ble     .Lstrmm_kernel_L1_M4_40
        .align 5
 
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M4_22
+       bgt     .Lstrmm_kernel_L1_M4_22
 
 
-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M4_100
+       ble     .Lstrmm_kernel_L1_M4_100
 
-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M4_42
+       bgt     .Lstrmm_kernel_L1_M4_42
 
-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     strmm_kernel_L1_M4_20
+       bgt     .Lstrmm_kernel_L1_M4_20
 
 
-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L1_M1_BEGIN
+       ble     .Lstrmm_kernel_L1_M1_BEGIN
 
-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M2_40
+       ble     .Lstrmm_kernel_L1_M2_40
 
-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M2_22
+       bgt     .Lstrmm_kernel_L1_M2_22
 
 
-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M2_100
+       ble     .Lstrmm_kernel_L1_M2_100
 
-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M2_42
+       bgt     .Lstrmm_kernel_L1_M2_42
 
-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
 #endif
 
 
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:
 
 
-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M1_40
+       ble     .Lstrmm_kernel_L1_M1_40
 
-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M1_22
+       bgt     .Lstrmm_kernel_L1_M1_22
 
 
-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M1_100
+       ble     .Lstrmm_kernel_L1_M1_100
 
-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M1_42
+       bgt     .Lstrmm_kernel_L1_M1_42
 
-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
@@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
 #endif
 #endif
 
-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:
 
 #if 0
 #if !defined(LEFT)
@@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
 #endif
 #endif
 
-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 843f0c8..cd18e68 100644 (file)
@@ -1257,7 +1257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        PROLOGUE
 
-strmm_kernel_begin:
+.Lstrmm_kernel_begin:
 
        .align 5
        add     sp, sp, #-(11 * 16)
@@ -1288,12 +1288,12 @@ strmm_kernel_begin:
        mov     counterJ, origN
        asr     counterJ, counterJ, #3          // J = J / 8
        cmp     counterJ, #0
-       ble     strmm_kernel_L4_BEGIN
+       ble     .Lstrmm_kernel_L4_BEGIN
 
 /******************************************************************************/
 /******************************************************************************/
 
-strmm_kernel_L8_BEGIN:
+.Lstrmm_kernel_L8_BEGIN:
        mov     pCRow0, pC                      // pCRow0 = C
        add     pC, pC, LDC, lsl #3
 
@@ -1305,14 +1305,14 @@ strmm_kernel_L8_BEGIN:
 
 /******************************************************************************/
 
-strmm_kernel_L8_M8_BEGIN:
+.Lstrmm_kernel_L8_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     strmm_kernel_L8_M4_BEGIN
+       ble     .Lstrmm_kernel_L8_M4_BEGIN
 
-strmm_kernel_L8_M8_20:
+.Lstrmm_kernel_L8_M8_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1333,54 +1333,54 @@ strmm_kernel_L8_M8_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L8_M8_32
+       blt     .Lstrmm_kernel_L8_M8_32
 
        KERNEL8x8_I                             // do one in the K
        KERNEL8x8_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L8_M8_22a
+       ble     .Lstrmm_kernel_L8_M8_22a
        .align 5
 
-strmm_kernel_L8_M8_22:
+.Lstrmm_kernel_L8_M8_22:
 
        KERNEL8x8_M1
        KERNEL8x8_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L8_M8_22
+       bgt     .Lstrmm_kernel_L8_M8_22
 
-strmm_kernel_L8_M8_22a:
+.Lstrmm_kernel_L8_M8_22a:
 
        KERNEL8x8_M1
        KERNEL8x8_E
 
-       b        strmm_kernel_L8_M8_44
+       b        .Lstrmm_kernel_L8_M8_44
 
-strmm_kernel_L8_M8_32:
+.Lstrmm_kernel_L8_M8_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L8_M8_40
+       ble     .Lstrmm_kernel_L8_M8_40
 
        KERNEL8x8_I
        KERNEL8x8_E
 
-       b       strmm_kernel_L8_M8_44
+       b       .Lstrmm_kernel_L8_M8_44
 
-strmm_kernel_L8_M8_40:
+.Lstrmm_kernel_L8_M8_40:
 
        INIT8x8
 
-strmm_kernel_L8_M8_44:
+.Lstrmm_kernel_L8_M8_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L8_M8_100
+       ble     .Lstrmm_kernel_L8_M8_100
 
-strmm_kernel_L8_M8_46:
+.Lstrmm_kernel_L8_M8_46:
 
        KERNEL8x8_SUB
 
-strmm_kernel_L8_M8_100:
+.Lstrmm_kernel_L8_M8_100:
 
        SAVE8x8
 
@@ -1399,22 +1399,22 @@ strmm_kernel_L8_M8_100:
        add     tempOffset, tempOffset, #8
 #endif
 
-strmm_kernel_L8_M8_END:
+.Lstrmm_kernel_L8_M8_END:
        subs    counterI, counterI, #1
-       bne     strmm_kernel_L8_M8_20
+       bne     .Lstrmm_kernel_L8_M8_20
 
 /******************************************************************************/
 
-strmm_kernel_L8_M4_BEGIN:
+.Lstrmm_kernel_L8_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L8_END
+       ble     .Lstrmm_kernel_L8_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L8_M2_BEGIN
+       ble     .Lstrmm_kernel_L8_M2_BEGIN
 
-strmm_kernel_L8_M4_20:
+.Lstrmm_kernel_L8_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1436,54 +1436,54 @@ strmm_kernel_L8_M4_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L8_M4_32
+       blt     .Lstrmm_kernel_L8_M4_32
 
        KERNEL4x8_I                             // do one in the K
        KERNEL4x8_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L8_M4_22a
+       ble     .Lstrmm_kernel_L8_M4_22a
        .align 5
 
-strmm_kernel_L8_M4_22:
+.Lstrmm_kernel_L8_M4_22:
 
        KERNEL4x8_M1
        KERNEL4x8_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L8_M4_22
+       bgt     .Lstrmm_kernel_L8_M4_22
 
-strmm_kernel_L8_M4_22a:
+.Lstrmm_kernel_L8_M4_22a:
 
        KERNEL4x8_M1
        KERNEL4x8_E
 
-       b        strmm_kernel_L8_M4_44
+       b        .Lstrmm_kernel_L8_M4_44
 
-strmm_kernel_L8_M4_32:
+.Lstrmm_kernel_L8_M4_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L8_M4_40
+       ble     .Lstrmm_kernel_L8_M4_40
 
        KERNEL4x8_I
        KERNEL4x8_E
 
-       b       strmm_kernel_L8_M4_44
+       b       .Lstrmm_kernel_L8_M4_44
 
-strmm_kernel_L8_M4_40:
+.Lstrmm_kernel_L8_M4_40:
 
        INIT4x8
 
-strmm_kernel_L8_M4_44:
+.Lstrmm_kernel_L8_M4_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L8_M4_100
+       ble     .Lstrmm_kernel_L8_M4_100
 
-strmm_kernel_L8_M4_46:
+.Lstrmm_kernel_L8_M4_46:
 
        KERNEL4x8_SUB
 
-strmm_kernel_L8_M4_100:
+.Lstrmm_kernel_L8_M4_100:
 
        SAVE4x8
 
@@ -1503,20 +1503,20 @@ strmm_kernel_L8_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-strmm_kernel_L8_M4_END:
+.Lstrmm_kernel_L8_M4_END:
 
 /******************************************************************************/
 
-strmm_kernel_L8_M2_BEGIN:
+.Lstrmm_kernel_L8_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L8_END
+       ble     .Lstrmm_kernel_L8_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L8_M1_BEGIN
+       ble     .Lstrmm_kernel_L8_M1_BEGIN
 
-strmm_kernel_L8_M2_20:
+.Lstrmm_kernel_L8_M2_20:
 
        INIT2x8
 
@@ -1540,9 +1540,9 @@ strmm_kernel_L8_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L8_M2_40
+       ble     .Lstrmm_kernel_L8_M2_40
 
-strmm_kernel_L8_M2_22:
+.Lstrmm_kernel_L8_M2_22:
 
        KERNEL2x8_SUB
        KERNEL2x8_SUB
@@ -1555,22 +1555,22 @@ strmm_kernel_L8_M2_22:
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L8_M2_22
+       bgt     .Lstrmm_kernel_L8_M2_22
 
 
-strmm_kernel_L8_M2_40:
+.Lstrmm_kernel_L8_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L8_M2_100
+       ble     .Lstrmm_kernel_L8_M2_100
 
-strmm_kernel_L8_M2_42:
+.Lstrmm_kernel_L8_M2_42:
 
        KERNEL2x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L8_M2_42
+       bgt     .Lstrmm_kernel_L8_M2_42
 
-strmm_kernel_L8_M2_100:
+.Lstrmm_kernel_L8_M2_100:
 
        SAVE2x8
 
@@ -1590,16 +1590,16 @@ strmm_kernel_L8_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-strmm_kernel_L8_M2_END:
+.Lstrmm_kernel_L8_M2_END:
 
 /******************************************************************************/
 
-strmm_kernel_L8_M1_BEGIN:
+.Lstrmm_kernel_L8_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L8_END
+       ble     .Lstrmm_kernel_L8_END
 
-strmm_kernel_L8_M1_20:
+.Lstrmm_kernel_L8_M1_20:
 
        INIT1x8
 
@@ -1623,9 +1623,9 @@ strmm_kernel_L8_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L8_M1_40
+       ble     .Lstrmm_kernel_L8_M1_40
 
-strmm_kernel_L8_M1_22:
+.Lstrmm_kernel_L8_M1_22:
        KERNEL1x8_SUB
        KERNEL1x8_SUB
        KERNEL1x8_SUB
@@ -1637,22 +1637,22 @@ strmm_kernel_L8_M1_22:
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L8_M1_22
+       bgt     .Lstrmm_kernel_L8_M1_22
 
 
-strmm_kernel_L8_M1_40:
+.Lstrmm_kernel_L8_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L8_M1_100
+       ble     .Lstrmm_kernel_L8_M1_100
 
-strmm_kernel_L8_M1_42:
+.Lstrmm_kernel_L8_M1_42:
 
        KERNEL1x8_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L8_M1_42
+       bgt     .Lstrmm_kernel_L8_M1_42
 
-strmm_kernel_L8_M1_100:
+.Lstrmm_kernel_L8_M1_100:
 
        SAVE1x8
 
@@ -1672,7 +1672,7 @@ strmm_kernel_L8_M1_100:
        add     tempOffset, tempOffset, #1
 #endif
 
-strmm_kernel_L8_END:
+.Lstrmm_kernel_L8_END:
        lsl     temp, origK, #5                 // B = B + K * 4 * 8
        add     origPB, origPB, temp
 
@@ -1681,19 +1681,19 @@ strmm_kernel_L8_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     strmm_kernel_L8_BEGIN
+       bgt     .Lstrmm_kernel_L8_BEGIN
 
 /******************************************************************************/
 /******************************************************************************/
 
-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #7
-       ble     strmm_kernel_L999
+       ble     .Lstrmm_kernel_L999
 
        tst     counterJ , #4
-       ble     strmm_kernel_L2_BEGIN
+       ble     .Lstrmm_kernel_L2_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1707,14 +1707,14 @@ strmm_kernel_L4_BEGIN:
 
 /******************************************************************************/
 
-strmm_kernel_L4_M8_BEGIN:
+.Lstrmm_kernel_L4_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI, #0
-       ble     strmm_kernel_L4_M4_BEGIN
+       ble     .Lstrmm_kernel_L4_M4_BEGIN
 
-strmm_kernel_L4_M8_20:
+.Lstrmm_kernel_L4_M8_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1736,54 +1736,54 @@ strmm_kernel_L4_M8_20:
 
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L4_M8_32
+       blt     .Lstrmm_kernel_L4_M8_32
 
        KERNEL8x4_I                             // do one in the K
        KERNEL8x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L4_M8_22a
+       ble     .Lstrmm_kernel_L4_M8_22a
        .align 5
 
-strmm_kernel_L4_M8_22:
+.Lstrmm_kernel_L4_M8_22:
 
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M8_22
+       bgt     .Lstrmm_kernel_L4_M8_22
 
-strmm_kernel_L4_M8_22a:
+.Lstrmm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
        KERNEL8x4_E
 
-       b        strmm_kernel_L4_M8_44
+       b        .Lstrmm_kernel_L4_M8_44
 
-strmm_kernel_L4_M8_32:
+.Lstrmm_kernel_L4_M8_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L4_M8_40
+       ble     .Lstrmm_kernel_L4_M8_40
 
        KERNEL8x4_I
        KERNEL8x4_E
 
-       b       strmm_kernel_L4_M8_44
+       b       .Lstrmm_kernel_L4_M8_44
 
-strmm_kernel_L4_M8_40:
+.Lstrmm_kernel_L4_M8_40:
 
        INIT8x4
 
-strmm_kernel_L4_M8_44:
+.Lstrmm_kernel_L4_M8_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L4_M8_100
+       ble     .Lstrmm_kernel_L4_M8_100
 
-strmm_kernel_L4_M8_46:
+.Lstrmm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
-strmm_kernel_L4_M8_100:
+.Lstrmm_kernel_L4_M8_100:
 
        SAVE8x4
 
@@ -1802,22 +1802,22 @@ strmm_kernel_L4_M8_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #8
 #endif
-strmm_kernel_L4_M8_END:
+.Lstrmm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
-       bne     strmm_kernel_L4_M8_20
+       bne     .Lstrmm_kernel_L4_M8_20
 
 /******************************************************************************/
 
-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L4_M2_BEGIN
+       ble     .Lstrmm_kernel_L4_M2_BEGIN
 
-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1837,54 +1837,54 @@ strmm_kernel_L4_M4_20:
 #endif
        asr     counterL , tempK, #1            // L = K / 2
        cmp     counterL , #2                   // is there at least 4 to do?
-       blt     strmm_kernel_L4_M4_32
+       blt     .Lstrmm_kernel_L4_M4_32
 
        KERNEL4x4_I                             // do one in the K
        KERNEL4x4_M2                            // do another in the K
 
        subs    counterL, counterL, #2
-       ble     strmm_kernel_L4_M4_22a
+       ble     .Lstrmm_kernel_L4_M4_22a
        .align 5
 
-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M4_22
+       bgt     .Lstrmm_kernel_L4_M4_22
 
-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        strmm_kernel_L4_M4_44
+       b        .Lstrmm_kernel_L4_M4_44
 
-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     strmm_kernel_L4_M4_40
+       ble     .Lstrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_E
 
-       b       strmm_kernel_L4_M4_44
+       b       .Lstrmm_kernel_L4_M4_44
 
-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #1
-       ble     strmm_kernel_L4_M4_100
+       ble     .Lstrmm_kernel_L4_M4_100
 
-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:
 
        KERNEL4x4_SUB
 
-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -1902,20 +1902,20 @@ strmm_kernel_L4_M4_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:
 
 /******************************************************************************/
 
-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L4_M1_BEGIN
+       ble     .Lstrmm_kernel_L4_M1_BEGIN
 
-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1938,9 +1938,9 @@ strmm_kernel_L4_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L4_M2_40
+       ble     .Lstrmm_kernel_L4_M2_40
 
-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1953,22 +1953,22 @@ strmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M2_22
+       bgt     .Lstrmm_kernel_L4_M2_22
 
 
-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L4_M2_100
+       ble     .Lstrmm_kernel_L4_M2_100
 
-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M2_42
+       bgt     .Lstrmm_kernel_L4_M2_42
 
-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -1987,16 +1987,16 @@ strmm_kernel_L4_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:
 
 /******************************************************************************/
 
-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L4_END
+       ble     .Lstrmm_kernel_L4_END
 
-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -2019,9 +2019,9 @@ strmm_kernel_L4_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L4_M1_40
+       ble     .Lstrmm_kernel_L4_M1_40
 
-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -2033,22 +2033,22 @@ strmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M1_22
+       bgt     .Lstrmm_kernel_L4_M1_22
 
 
-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L4_M1_100
+       ble     .Lstrmm_kernel_L4_M1_100
 
-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L4_M1_42
+       bgt     .Lstrmm_kernel_L4_M1_42
 
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -2067,7 +2067,7 @@ strmm_kernel_L4_M1_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #1
 #endif
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
        add     origPB, origPB, origK, lsl #4   // B = B + K * 4 * 4
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #4
@@ -2076,14 +2076,14 @@ strmm_kernel_L4_END:
 /******************************************************************************/
 /******************************************************************************/
 
-strmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     strmm_kernel_L999
+       ble     .Lstrmm_kernel_L999
 
        tst     counterJ , #2
-       ble     strmm_kernel_L1_BEGIN
+       ble     .Lstrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -2096,14 +2096,14 @@ strmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 /******************************************************************************/
 
-strmm_kernel_L2_M8_BEGIN:
+.Lstrmm_kernel_L2_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3          // counterI = counterI / 8
        cmp     counterI,#0
-       ble     strmm_kernel_L2_M4_BEGIN
+       ble     .Lstrmm_kernel_L2_M4_BEGIN
 
-strmm_kernel_L2_M8_20:
+.Lstrmm_kernel_L2_M8_20:
 
        INIT8x2
 
@@ -2126,10 +2126,10 @@ strmm_kernel_L2_M8_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     strmm_kernel_L2_M8_40
+       ble     .Lstrmm_kernel_L2_M8_40
        .align 5
 
-strmm_kernel_L2_M8_22:
+.Lstrmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
        KERNEL8x2_SUB
        KERNEL8x2_SUB
@@ -2141,22 +2141,22 @@ strmm_kernel_L2_M8_22:
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M8_22
+       bgt     .Lstrmm_kernel_L2_M8_22
 
 
-strmm_kernel_L2_M8_40:
+.Lstrmm_kernel_L2_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M8_100
+       ble     .Lstrmm_kernel_L2_M8_100
 
-strmm_kernel_L2_M8_42:
+.Lstrmm_kernel_L2_M8_42:
 
        KERNEL8x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M8_42
+       bgt     .Lstrmm_kernel_L2_M8_42
 
-strmm_kernel_L2_M8_100:
+.Lstrmm_kernel_L2_M8_100:
 
        SAVE8x2
 
@@ -2175,23 +2175,23 @@ strmm_kernel_L2_M8_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #8
 #endif
-strmm_kernel_L2_M8_END:
+.Lstrmm_kernel_L2_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     strmm_kernel_L2_M8_20
+       bgt     .Lstrmm_kernel_L2_M8_20
 
 /******************************************************************************/
 
-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L2_M2_BEGIN
+       ble     .Lstrmm_kernel_L2_M2_BEGIN
 
-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -2214,10 +2214,10 @@ strmm_kernel_L2_M4_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     strmm_kernel_L2_M4_40
+       ble     .Lstrmm_kernel_L2_M4_40
        .align 5
 
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -2229,22 +2229,22 @@ strmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M4_22
+       bgt     .Lstrmm_kernel_L2_M4_22
 
 
-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M4_100
+       ble     .Lstrmm_kernel_L2_M4_100
 
-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M4_42
+       bgt     .Lstrmm_kernel_L2_M4_42
 
-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -2263,20 +2263,20 @@ strmm_kernel_L2_M4_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:
 
 /******************************************************************************/
 
-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L2_M1_BEGIN
+       ble     .Lstrmm_kernel_L2_M1_BEGIN
 
-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -2299,9 +2299,9 @@ strmm_kernel_L2_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     strmm_kernel_L2_M2_40
+       ble     .Lstrmm_kernel_L2_M2_40
 
-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -2314,22 +2314,22 @@ strmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M2_22
+       bgt     .Lstrmm_kernel_L2_M2_22
 
 
-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M2_100
+       ble     .Lstrmm_kernel_L2_M2_100
 
-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M2_42
+       bgt     .Lstrmm_kernel_L2_M2_42
 
-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:
 
        SAVE2x2
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2348,16 +2348,16 @@ strmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:
 
 /******************************************************************************/
 
-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L2_END
+       ble     .Lstrmm_kernel_L2_END
 
-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -2380,9 +2380,9 @@ strmm_kernel_L2_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     strmm_kernel_L2_M1_40
+       ble     .Lstrmm_kernel_L2_M1_40
 
-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -2394,22 +2394,22 @@ strmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M1_22
+       bgt     .Lstrmm_kernel_L2_M1_22
 
 
-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L2_M1_100
+       ble     .Lstrmm_kernel_L2_M1_100
 
-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L2_M1_42
+       bgt     .Lstrmm_kernel_L2_M1_42
 
-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -2428,7 +2428,7 @@ strmm_kernel_L2_M1_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #1
 #endif
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -2437,11 +2437,11 @@ strmm_kernel_L2_END:
 /******************************************************************************/
 /******************************************************************************/
 
-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     strmm_kernel_L999 // done
+       ble     .Lstrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -2454,14 +2454,14 @@ strmm_kernel_L1_BEGIN:
 
 /******************************************************************************/
 
-strmm_kernel_L1_M8_BEGIN:
+.Lstrmm_kernel_L1_M8_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #3
        cmp     counterI, #0
-       ble     strmm_kernel_L1_M4_BEGIN
+       ble     .Lstrmm_kernel_L1_M4_BEGIN
 
-strmm_kernel_L1_M8_20:
+.Lstrmm_kernel_L1_M8_20:
 
        INIT8x1
 
@@ -2484,10 +2484,10 @@ strmm_kernel_L1_M8_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M8_40
+       ble     .Lstrmm_kernel_L1_M8_40
        .align 5
 
-strmm_kernel_L1_M8_22:
+.Lstrmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
        KERNEL8x1_SUB
        KERNEL8x1_SUB
@@ -2499,22 +2499,22 @@ strmm_kernel_L1_M8_22:
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M8_22
+       bgt     .Lstrmm_kernel_L1_M8_22
 
 
-strmm_kernel_L1_M8_40:
+.Lstrmm_kernel_L1_M8_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M8_100
+       ble     .Lstrmm_kernel_L1_M8_100
 
-strmm_kernel_L1_M8_42:
+.Lstrmm_kernel_L1_M8_42:
 
        KERNEL8x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M8_42
+       bgt     .Lstrmm_kernel_L1_M8_42
 
-strmm_kernel_L1_M8_100:
+.Lstrmm_kernel_L1_M8_100:
 
        SAVE8x1
 
@@ -2533,23 +2533,23 @@ strmm_kernel_L1_M8_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #8
 #endif
-strmm_kernel_L1_M8_END:
+.Lstrmm_kernel_L1_M8_END:
 
        subs    counterI, counterI, #1
-       bgt     strmm_kernel_L1_M8_20
+       bgt     .Lstrmm_kernel_L1_M8_20
 
 /******************************************************************************/
 
-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #7
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
        tst     counterI, #4
-       ble     strmm_kernel_L1_M2_BEGIN
+       ble     .Lstrmm_kernel_L1_M2_BEGIN
 
-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -2572,10 +2572,10 @@ strmm_kernel_L1_M4_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M4_40
+       ble     .Lstrmm_kernel_L1_M4_40
        .align 5
 
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -2587,22 +2587,22 @@ strmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M4_22
+       bgt     .Lstrmm_kernel_L1_M4_22
 
 
-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M4_100
+       ble     .Lstrmm_kernel_L1_M4_100
 
-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M4_42
+       bgt     .Lstrmm_kernel_L1_M4_42
 
-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -2621,20 +2621,20 @@ strmm_kernel_L1_M4_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #4
 #endif
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:
 
 /******************************************************************************/
 
-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     strmm_kernel_L1_M1_BEGIN
+       ble     .Lstrmm_kernel_L1_M1_BEGIN
 
-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -2657,9 +2657,9 @@ strmm_kernel_L1_M2_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M2_40
+       ble     .Lstrmm_kernel_L1_M2_40
 
-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -2672,22 +2672,22 @@ strmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M2_22
+       bgt     .Lstrmm_kernel_L1_M2_22
 
 
-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M2_100
+       ble     .Lstrmm_kernel_L1_M2_100
 
-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M2_42
+       bgt     .Lstrmm_kernel_L1_M2_42
 
-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -2706,16 +2706,16 @@ strmm_kernel_L1_M2_100:
 #if defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:
 
 /******************************************************************************/
 
-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     strmm_kernel_L1_END
+       ble     .Lstrmm_kernel_L1_END
 
-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -2738,9 +2738,9 @@ strmm_kernel_L1_M1_20:
 #endif
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     strmm_kernel_L1_M1_40
+       ble     .Lstrmm_kernel_L1_M1_40
 
-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -2752,30 +2752,30 @@ strmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M1_22
+       bgt     .Lstrmm_kernel_L1_M1_22
 
 
-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     strmm_kernel_L1_M1_100
+       ble     .Lstrmm_kernel_L1_M1_100
 
-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     strmm_kernel_L1_M1_42
+       bgt     .Lstrmm_kernel_L1_M1_42
 
-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:
 
 /******************************************************************************/
 
-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index 37ed83f..184e02e 100644 (file)
@@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     swap_kernel_L999
+       ble     .Lswap_kernel_L999
 
        cmp     INC_X, #1
-       bne     swap_kernel_S_BEGIN
+       bne     .Lswap_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     swap_kernel_S_BEGIN
+       bne     .Lswap_kernel_S_BEGIN
 
-swap_kernel_F_BEGIN:
+.Lswap_kernel_F_BEGIN:
 
        asr     I, N, #3
        cmp     I, xzr
-       beq     swap_kernel_F1
+       beq     .Lswap_kernel_F1
 
-swap_kernel_F8:
+.Lswap_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-       bne     swap_kernel_F8
+       bne     .Lswap_kernel_F8
 
-swap_kernel_F1:
+.Lswap_kernel_F1:
 
        ands    I, N, #7
-       ble     swap_kernel_L999
+       ble     .Lswap_kernel_L999
 
-swap_kernel_F10:
+.Lswap_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-       bne     swap_kernel_F10
+       bne     .Lswap_kernel_F10
 
-       b       swap_kernel_L999
+       b       .Lswap_kernel_L999
 
 
-swap_kernel_S_BEGIN:
+.Lswap_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     swap_kernel_S1
+       ble     .Lswap_kernel_S1
 
-swap_kernel_S4:
+.Lswap_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -244,21 +244,21 @@ swap_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     swap_kernel_S4
+       bne     .Lswap_kernel_S4
 
-swap_kernel_S1:
+.Lswap_kernel_S1:
 
        ands    I, N, #3
-       ble     swap_kernel_L999
+       ble     .Lswap_kernel_L999
 
-swap_kernel_S10:
+.Lswap_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     swap_kernel_S10
+        bne     .Lswap_kernel_S10
 
-swap_kernel_L999:
+.Lswap_kernel_L999:
 
        mov     w0, wzr
        ret
index 7db339f..c2c0a53 100644 (file)
@@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     amax_kernel_zero
+       ble     .Lzamax_kernel_zero
        cmp     INC_X, xzr
-       ble     amax_kernel_zero
+       ble     .Lzamax_kernel_zero
 
        cmp     INC_X, #1
-       bne     amax_kernel_S_BEGIN
+       bne     .Lzamax_kernel_S_BEGIN
 
-amax_kernel_F_BEGIN:
+.Lzamax_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     amax_kernel_F1_INIT
+       beq     .Lzamax_kernel_F1_INIT
 
        INIT_F4
        subs    I, I, #1
-       beq     amax_kernel_F1
+       beq     .Lzamax_kernel_F1
 
-amax_kernel_F4:
+.Lzamax_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     amax_kernel_F4
+       bne     .Lzamax_kernel_F4
 
-amax_kernel_F1:
+.Lzamax_kernel_F1:
 
        ands    I, N, #3
-       ble     amax_kernel_L999
+       ble     .Lzamax_kernel_L999
 
-amax_kernel_F10:
+.Lzamax_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     amax_kernel_F10
+        bne     .Lzamax_kernel_F10
 
        ret
 
-amax_kernel_F1_INIT:
+.Lzamax_kernel_F1_INIT:
 
        INIT_F1
        subs    N, N, #1
-       b       amax_kernel_F1
+       b       .Lzamax_kernel_F1
 
-amax_kernel_S_BEGIN:
+.Lzamax_kernel_S_BEGIN:
 
        INIT_S
 
        subs    N, N, #1
-       ble     amax_kernel_L999
+       ble     .Lzamax_kernel_L999
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     amax_kernel_S1
+       ble     .Lzamax_kernel_S1
 
-amax_kernel_S4:
+.Lzamax_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -247,25 +247,25 @@ amax_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     amax_kernel_S4
+       bne     .Lzamax_kernel_S4
 
-amax_kernel_S1:
+.Lzamax_kernel_S1:
 
        ands    I, N, #3
-       ble     amax_kernel_L999
+       ble     .Lzamax_kernel_L999
 
-amax_kernel_S10:
+.Lzamax_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     amax_kernel_S10
+        bne     .Lzamax_kernel_S10
 
-amax_kernel_L999:
+.Lzamax_kernel_L999:
 
        ret
 
-amax_kernel_zero:
+.Lzamax_kernel_zero:
 
        fmov    MAXF, REG0
        ret
index bf586d3..0d5ec95 100644 (file)
@@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        fmov    SUMF, REG0
 
        cmp     N, xzr
-       ble     asum_kernel_L999
+       ble     .Lzasum_kernel_L999
        cmp     INC_X, xzr
-       ble     asum_kernel_L999
+       ble     .Lzasum_kernel_L999
 
        cmp     INC_X, #1
-       bne     asum_kernel_S_BEGIN
+       bne     .Lzasum_kernel_S_BEGIN
 
-asum_kernel_F_BEGIN:
+.Lzasum_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     asum_kernel_F1
+       beq     .Lzasum_kernel_F1
 
-asum_kernel_F4:
+.Lzasum_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     asum_kernel_F4
+       bne     .Lzasum_kernel_F4
 
        KERNEL_F4_FINALIZE
 
-asum_kernel_F1:
+.Lzasum_kernel_F1:
 
        ands    I, N, #3
-       ble     asum_kernel_L999
+       ble     .Lzasum_kernel_L999
 
-asum_kernel_F10:
+.Lzasum_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     asum_kernel_F10
+        bne     .Lzasum_kernel_F10
 
-asum_kernel_L999:
+.Lzasum_kernel_L999:
        ret
 
-asum_kernel_S_BEGIN:
+.Lzasum_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     asum_kernel_S1
+       ble     .Lzasum_kernel_S1
 
-asum_kernel_S4:
+.Lzasum_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -145,19 +145,19 @@ asum_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     asum_kernel_S4
+       bne     .Lzasum_kernel_S4
 
-asum_kernel_S1:
+.Lzasum_kernel_S1:
 
        ands    I, N, #3
-       ble     asum_kernel_L999
+       ble     .Lzasum_kernel_L999
 
-asum_kernel_S10:
+.Lzasum_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     asum_kernel_S10
+        bne     .Lzasum_kernel_S10
 
        ret
 
index 70c2499..46d7b04 100644 (file)
@@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     zaxpy_kernel_L999
+       ble     .Lzaxpy_kernel_L999
 
        mov     Y_COPY, Y
 
        fcmp    DA_R, #0.0
        bne     .L1
        fcmp    DA_I, #0.0
-       beq     zaxpy_kernel_L999
+       beq     .Lzaxpy_kernel_L999
 
 .L1:
        INIT
 
        cmp     INC_X, #1
-       bne     zaxpy_kernel_S_BEGIN
+       bne     .Lzaxpy_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     zaxpy_kernel_S_BEGIN
+       bne     .Lzaxpy_kernel_S_BEGIN
 
-zaxpy_kernel_F_BEGIN:
+.Lzaxpy_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     zaxpy_kernel_F1
+       beq     .Lzaxpy_kernel_F1
 
        KERNEL_INIT_F4
 
-zaxpy_kernel_F4:
+.Lzaxpy_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     zaxpy_kernel_F4
+       bne     .Lzaxpy_kernel_F4
 
-zaxpy_kernel_F1:
+.Lzaxpy_kernel_F1:
 
        ands    I, N, #3
-       ble     zaxpy_kernel_L999
+       ble     .Lzaxpy_kernel_L999
 
-zaxpy_kernel_F10:
+.Lzaxpy_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     zaxpy_kernel_F10
+        bne     .Lzaxpy_kernel_F10
 
        mov     w0, wzr
        ret
 
-zaxpy_kernel_S_BEGIN:
+.Lzaxpy_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     zaxpy_kernel_S1
+       ble     .Lzaxpy_kernel_S1
 
-zaxpy_kernel_S4:
+.Lzaxpy_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -304,21 +304,21 @@ zaxpy_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     zaxpy_kernel_S4
+       bne     .Lzaxpy_kernel_S4
 
-zaxpy_kernel_S1:
+.Lzaxpy_kernel_S1:
 
        ands    I, N, #3
-       ble     zaxpy_kernel_L999
+       ble     .Lzaxpy_kernel_L999
 
-zaxpy_kernel_S10:
+.Lzaxpy_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     zaxpy_kernel_S10
+        bne     .Lzaxpy_kernel_S10
 
-zaxpy_kernel_L999:
+.Lzaxpy_kernel_L999:
 
        mov     w0, wzr
        ret
index 3e8e3d7..044ace3 100644 (file)
@@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        cmp     N, xzr
-       ble     dot_kernel_L999
+       ble     .Lzdot_kernel_L999
 
        cmp     INC_X, #1
-       bne     dot_kernel_S_BEGIN
+       bne     .Lzdot_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     dot_kernel_S_BEGIN
+       bne     .Lzdot_kernel_S_BEGIN
 
-dot_kernel_F_BEGIN:
+.Lzdot_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     dot_kernel_F1
+       beq     .Lzdot_kernel_F1
 
-dot_kernel_F4:
+.Lzdot_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     dot_kernel_F4
+       bne     .Lzdot_kernel_F4
 
        KERNEL_F4_FINALIZE
 
-dot_kernel_F1:
+.Lzdot_kernel_F1:
 
        ands    I, N, #3
-       ble     dot_kernel_L999
+       ble     .Lzdot_kernel_L999
 
-dot_kernel_F10:
+.Lzdot_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     dot_kernel_F10
+        bne     .Lzdot_kernel_F10
 
        ret
 
-dot_kernel_S_BEGIN:
+.Lzdot_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     dot_kernel_S1
+       ble     .Lzdot_kernel_S1
 
-dot_kernel_S4:
+.Lzdot_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -281,21 +281,21 @@ dot_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     dot_kernel_S4
+       bne     .Lzdot_kernel_S4
 
-dot_kernel_S1:
+.Lzdot_kernel_S1:
 
        ands    I, N, #3
-       ble     dot_kernel_L999
+       ble     .Lzdot_kernel_L999
 
-dot_kernel_S10:
+.Lzdot_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     dot_kernel_S10
+        bne     .Lzdot_kernel_S10
 
-dot_kernel_L999:
+.Lzdot_kernel_L999:
 
        ret
 
index 08a1531..f8e877f 100644 (file)
@@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     zgemm_kernel_L2_BEGIN
+       ble     .Lzgemm_kernel_L2_BEGIN
 
-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     zgemm_kernel_L4_M2_BEGIN
+       ble     .Lzgemm_kernel_L4_M2_BEGIN
 
        .align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
        asr     counterL , origK, #3
        cmp     counterL , #2
-       blt     zgemm_kernel_L4_M4_32
+       blt     .Lzgemm_kernel_L4_M4_32
 
        KERNEL4x4_I
        KERNEL4x4_M2
@@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
        KERNEL4x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     zgemm_kernel_L4_M4_22a
+       ble     .Lzgemm_kernel_L4_M4_22a
 
        .align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
@@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M4_22
+       bgt     .Lzgemm_kernel_L4_M4_22
 
        .align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
@@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        zgemm_kernel_L4_M4_44
+       b        .Lzgemm_kernel_L4_M4_44
 
        .align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     zgemm_kernel_L4_M4_40
+       ble     .Lzgemm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_M2
@@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b       zgemm_kernel_L4_M4_44
+       b       .Lzgemm_kernel_L4_M4_44
 
 
-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:
 
        ands    counterL , origK, #7
-       ble     zgemm_kernel_L4_M4_100
+       ble     .Lzgemm_kernel_L4_M4_100
 
        .align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bne     zgemm_kernel_L4_M4_46
+       bne     .Lzgemm_kernel_L4_M4_46
 
-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE4x4
 
-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     zgemm_kernel_L4_M4_20
+       bne     .Lzgemm_kernel_L4_M4_20
 
-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     zgemm_kernel_L4_END
+       ble     .Lzgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     zgemm_kernel_L4_M1_BEGIN
+       ble     .Lzgemm_kernel_L4_M1_BEGIN
 
-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L4_M2_40
+       ble     .Lzgemm_kernel_L4_M2_40
 
-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M2_22
+       bgt     .Lzgemm_kernel_L4_M2_22
 
 
-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L4_M2_100
+       ble     .Lzgemm_kernel_L4_M2_100
 
-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M2_42
+       bgt     .Lzgemm_kernel_L4_M2_42
 
-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:
 
 
-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     zgemm_kernel_L4_END
+       ble     .Lzgemm_kernel_L4_END
 
-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L4_M1_40
+       ble     .Lzgemm_kernel_L4_M1_40
 
-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M1_22
+       bgt     .Lzgemm_kernel_L4_M1_22
 
 
-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L4_M1_100
+       ble     .Lzgemm_kernel_L4_M1_100
 
-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M1_42
+       bgt     .Lzgemm_kernel_L4_M1_42
 
-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:
 
        lsl     temp, origK, #6
        add     origPB, origPB, temp            // B = B + K * 4 * 8 * 2
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     zgemm_kernel_L4_BEGIN
+       bgt     .Lzgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-zgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     zgemm_kernel_L999
+       ble     .Lzgemm_kernel_L999
 
        tst     counterJ , #2
-       ble     zgemm_kernel_L1_BEGIN
+       ble     .Lzgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 
 
-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     zgemm_kernel_L2_M2_BEGIN
+       ble     .Lzgemm_kernel_L2_M2_BEGIN
 
-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     zgemm_kernel_L2_M4_40
+       ble     .Lzgemm_kernel_L2_M4_40
        .align 5
 
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M4_22
+       bgt     .Lzgemm_kernel_L2_M4_22
 
 
-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L2_M4_100
+       ble     .Lzgemm_kernel_L2_M4_100
 
-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M4_42
+       bgt     .Lzgemm_kernel_L2_M4_42
 
-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     zgemm_kernel_L2_M4_20
+       bgt     .Lzgemm_kernel_L2_M4_20
 
 
-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     zgemm_kernel_L2_END
+       ble     .Lzgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     zgemm_kernel_L2_M1_BEGIN
+       ble     .Lzgemm_kernel_L2_M1_BEGIN
 
-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     zgemm_kernel_L2_M2_40
+       ble     .Lzgemm_kernel_L2_M2_40
 
-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M2_22
+       bgt     .Lzgemm_kernel_L2_M2_22
 
 
-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L2_M2_100
+       ble     .Lzgemm_kernel_L2_M2_100
 
-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M2_42
+       bgt     .Lzgemm_kernel_L2_M2_42
 
-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:
 
 
-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     zgemm_kernel_L2_END
+       ble     .Lzgemm_kernel_L2_END
 
-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     zgemm_kernel_L2_M1_40
+       ble     .Lzgemm_kernel_L2_M1_40
 
-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M1_22
+       bgt     .Lzgemm_kernel_L2_M1_22
 
 
-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L2_M1_100
+       ble     .Lzgemm_kernel_L2_M1_100
 
-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M1_42
+       bgt     .Lzgemm_kernel_L2_M1_42
 
-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:
        lsl     temp, origK, #5
        add     origPB, origPB, temp // B = B + K * 2 * 8 * 2
 
 /******************************************************************************/
 
-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     zgemm_kernel_L999 // done
+       ble     .Lzgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:
 
 
 
-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     zgemm_kernel_L1_M2_BEGIN
+       ble     .Lzgemm_kernel_L1_M2_BEGIN
 
-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L1_M4_40
+       ble     .Lzgemm_kernel_L1_M4_40
        .align 5
 
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M4_22
+       bgt     .Lzgemm_kernel_L1_M4_22
 
 
-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L1_M4_100
+       ble     .Lzgemm_kernel_L1_M4_100
 
-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M4_42
+       bgt     .Lzgemm_kernel_L1_M4_42
 
-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     zgemm_kernel_L1_M4_20
+       bgt     .Lzgemm_kernel_L1_M4_20
 
 
-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     zgemm_kernel_L1_END
+       ble     .Lzgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     zgemm_kernel_L1_M1_BEGIN
+       ble     .Lzgemm_kernel_L1_M1_BEGIN
 
-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L1_M2_40
+       ble     .Lzgemm_kernel_L1_M2_40
 
-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M2_22
+       bgt     .Lzgemm_kernel_L1_M2_22
 
 
-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L1_M2_100
+       ble     .Lzgemm_kernel_L1_M2_100
 
-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M2_42
+       bgt     .Lzgemm_kernel_L1_M2_42
 
-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:
 
 
-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     zgemm_kernel_L1_END
+       ble     .Lzgemm_kernel_L1_END
 
-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L1_M1_40
+       ble     .Lzgemm_kernel_L1_M1_40
 
-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M1_22
+       bgt     .Lzgemm_kernel_L1_M1_22
 
 
-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L1_M1_100
+       ble     .Lzgemm_kernel_L1_M1_100
 
-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M1_42
+       bgt     .Lzgemm_kernel_L1_M1_42
 
-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:
 
 
-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index e5b4cba..8e6ff65 100644 (file)
@@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     zgemm_kernel_L2_BEGIN
+       ble     .Lzgemm_kernel_L2_BEGIN
 
-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:
 
        mov     pA, origPA                      // pA = start of A array
 
-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     zgemm_kernel_L4_M2_BEGIN
+       ble     .Lzgemm_kernel_L4_M2_BEGIN
 
        .align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:
 
        mov     pB, origPB
        asr     counterL , origK, #3
        cmp     counterL , #2
-       blt     zgemm_kernel_L4_M4_32
+       blt     .Lzgemm_kernel_L4_M4_32
 
        KERNEL4x4_I
        KERNEL4x4_M2
@@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
        KERNEL4x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
-       ble     zgemm_kernel_L4_M4_22a
+       ble     .Lzgemm_kernel_L4_M4_22a
 
        .align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
@@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M4_22
+       bgt     .Lzgemm_kernel_L4_M4_22
 
        .align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
@@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        zgemm_kernel_L4_M4_44
+       b        .Lzgemm_kernel_L4_M4_44
 
        .align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     zgemm_kernel_L4_M4_40
+       ble     .Lzgemm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_M2
@@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b       zgemm_kernel_L4_M4_44
+       b       .Lzgemm_kernel_L4_M4_44
 
 
-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:
 
        INIT4x4
 
-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:
 
        ands    counterL , origK, #7
-       ble     zgemm_kernel_L4_M4_100
+       ble     .Lzgemm_kernel_L4_M4_100
 
        .align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bne     zgemm_kernel_L4_M4_46
+       bne     .Lzgemm_kernel_L4_M4_46
 
-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:
        prfm    PLDL1KEEP, [pA]
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
        SAVE4x4
 
-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     zgemm_kernel_L4_M4_20
+       bne     .Lzgemm_kernel_L4_M4_20
 
-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     zgemm_kernel_L4_END
+       ble     .Lzgemm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     zgemm_kernel_L4_M1_BEGIN
+       ble     .Lzgemm_kernel_L4_M1_BEGIN
 
-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:
 
        INIT2x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L4_M2_40
+       ble     .Lzgemm_kernel_L4_M2_40
 
-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M2_22
+       bgt     .Lzgemm_kernel_L4_M2_22
 
 
-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L4_M2_100
+       ble     .Lzgemm_kernel_L4_M2_100
 
-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M2_42
+       bgt     .Lzgemm_kernel_L4_M2_42
 
-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:
 
        SAVE2x4
 
-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:
 
 
-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     zgemm_kernel_L4_END
+       ble     .Lzgemm_kernel_L4_END
 
-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:
 
        INIT1x4
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L4_M1_40
+       ble     .Lzgemm_kernel_L4_M1_40
 
-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M1_22
+       bgt     .Lzgemm_kernel_L4_M1_22
 
 
-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L4_M1_100
+       ble     .Lzgemm_kernel_L4_M1_100
 
-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L4_M1_42
+       bgt     .Lzgemm_kernel_L4_M1_42
 
-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:
 
        SAVE1x4
 
 
-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:
 
        lsl     temp, origK, #6
        add     origPB, origPB, temp            // B = B + K * 4 * 8 * 2
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     zgemm_kernel_L4_BEGIN
+       bgt     .Lzgemm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-zgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     zgemm_kernel_L999
+       ble     .Lzgemm_kernel_L999
 
        tst     counterJ , #2
-       ble     zgemm_kernel_L1_BEGIN
+       ble     .Lzgemm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
 
 
-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     zgemm_kernel_L2_M2_BEGIN
+       ble     .Lzgemm_kernel_L2_M2_BEGIN
 
-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:
 
        INIT4x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     zgemm_kernel_L2_M4_40
+       ble     .Lzgemm_kernel_L2_M4_40
        .align 5
 
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M4_22
+       bgt     .Lzgemm_kernel_L2_M4_22
 
 
-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L2_M4_100
+       ble     .Lzgemm_kernel_L2_M4_100
 
-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M4_42
+       bgt     .Lzgemm_kernel_L2_M4_42
 
-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:
 
        SAVE4x2
 
-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     zgemm_kernel_L2_M4_20
+       bgt     .Lzgemm_kernel_L2_M4_20
 
 
-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     zgemm_kernel_L2_END
+       ble     .Lzgemm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     zgemm_kernel_L2_M1_BEGIN
+       ble     .Lzgemm_kernel_L2_M1_BEGIN
 
-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:
 
        INIT2x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     zgemm_kernel_L2_M2_40
+       ble     .Lzgemm_kernel_L2_M2_40
 
-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M2_22
+       bgt     .Lzgemm_kernel_L2_M2_22
 
 
-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L2_M2_100
+       ble     .Lzgemm_kernel_L2_M2_100
 
-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M2_42
+       bgt     .Lzgemm_kernel_L2_M2_42
 
-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:
 
        SAVE2x2
 
-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:
 
 
-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     zgemm_kernel_L2_END
+       ble     .Lzgemm_kernel_L2_END
 
-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:
 
        INIT1x2
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     zgemm_kernel_L2_M1_40
+       ble     .Lzgemm_kernel_L2_M1_40
 
-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M1_22
+       bgt     .Lzgemm_kernel_L2_M1_22
 
 
-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L2_M1_100
+       ble     .Lzgemm_kernel_L2_M1_100
 
-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L2_M1_42
+       bgt     .Lzgemm_kernel_L2_M1_42
 
-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:
 
        SAVE1x2
 
 
-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:
        lsl     temp, origK, #5
        add     origPB, origPB, temp // B = B + K * 2 * 8 * 2
 
 /******************************************************************************/
 
-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     zgemm_kernel_L999 // done
+       ble     .Lzgemm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:
 
 
 
-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     zgemm_kernel_L1_M2_BEGIN
+       ble     .Lzgemm_kernel_L1_M2_BEGIN
 
-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:
 
        INIT4x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L1_M4_40
+       ble     .Lzgemm_kernel_L1_M4_40
        .align 5
 
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M4_22
+       bgt     .Lzgemm_kernel_L1_M4_22
 
 
-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L1_M4_100
+       ble     .Lzgemm_kernel_L1_M4_100
 
-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M4_42
+       bgt     .Lzgemm_kernel_L1_M4_42
 
-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:
 
        SAVE4x1
 
-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     zgemm_kernel_L1_M4_20
+       bgt     .Lzgemm_kernel_L1_M4_20
 
 
-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     zgemm_kernel_L1_END
+       ble     .Lzgemm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     zgemm_kernel_L1_M1_BEGIN
+       ble     .Lzgemm_kernel_L1_M1_BEGIN
 
-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:
 
        INIT2x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L1_M2_40
+       ble     .Lzgemm_kernel_L1_M2_40
 
-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M2_22
+       bgt     .Lzgemm_kernel_L1_M2_22
 
 
-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L1_M2_100
+       ble     .Lzgemm_kernel_L1_M2_100
 
-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M2_42
+       bgt     .Lzgemm_kernel_L1_M2_42
 
-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:
 
        SAVE2x1
 
-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:
 
 
-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     zgemm_kernel_L1_END
+       ble     .Lzgemm_kernel_L1_END
 
-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:
 
        INIT1x1
 
        mov     pB, origPB
        asr     counterL , origK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     zgemm_kernel_L1_M1_40
+       ble     .Lzgemm_kernel_L1_M1_40
 
-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M1_22
+       bgt     .Lzgemm_kernel_L1_M1_22
 
 
-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:
 
        ands    counterL , origK, #7            // counterL = counterL % 8
-       ble     zgemm_kernel_L1_M1_100
+       ble     .Lzgemm_kernel_L1_M1_100
 
-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     zgemm_kernel_L1_M1_42
+       bgt     .Lzgemm_kernel_L1_M1_42
 
-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:
 
 
-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]
index a28d1b0..28afcad 100644 (file)
@@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        SAVE_REGS
 
        cmp     N, xzr
-       ble     zgemv_n_kernel_L999
+       ble     .Lzgemv_n_kernel_L999
        cmp     M, xzr
-       ble     zgemv_n_kernel_L999
+       ble     .Lzgemv_n_kernel_L999
 
        lsl     LDA, LDA, #SHZ
        lsl     INC_X, INC_X, #SHZ
@@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        INIT
 
        cmp     INC_Y, #1
-       bne     zgemv_n_kernel_S_BEGIN
+       bne     .Lzgemv_n_kernel_S_BEGIN
 
-zgemv_n_kernel_F_LOOP:
+.Lzgemv_n_kernel_F_LOOP:
        mov     A_PTR, A
        mov     Y_IPTR, Y
        mov     Y_OPTR, Y
@@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:
 
        asr     I, M, #2
        cmp     I, xzr
-       beq     zgemv_n_kernel_F1
+       beq     .Lzgemv_n_kernel_F1
 
-zgemv_n_kernel_F4:
+.Lzgemv_n_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     zgemv_n_kernel_F4
+       bne     .Lzgemv_n_kernel_F4
 
-zgemv_n_kernel_F1:
+.Lzgemv_n_kernel_F1:
 
        ands    I, M, #3
-       ble     zgemv_n_kernel_F_END
+       ble     .Lzgemv_n_kernel_F_END
 
-zgemv_n_kernel_F10:
+.Lzgemv_n_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-       bne     zgemv_n_kernel_F10
+       bne     .Lzgemv_n_kernel_F10
 
-zgemv_n_kernel_F_END:
+.Lzgemv_n_kernel_F_END:
 
        add     A, A, LDA
        subs    J, J, #1
-        bne     zgemv_n_kernel_F_LOOP
+        bne     .Lzgemv_n_kernel_F_LOOP
 
-       b       zgemv_n_kernel_L999
+       b       .Lzgemv_n_kernel_L999
 
-zgemv_n_kernel_S_BEGIN:
+.Lzgemv_n_kernel_S_BEGIN:
 
        INIT_S
 
-zgemv_n_kernel_S_LOOP:
+.Lzgemv_n_kernel_S_LOOP:
        mov     A_PTR, A
        mov     Y_IPTR, Y
        mov     Y_OPTR, Y
@@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:
 
        asr     I, M, #2
        cmp     I, xzr
-       ble     zgemv_n_kernel_S1
+       ble     .Lzgemv_n_kernel_S1
 
-zgemv_n_kernel_S4:
+.Lzgemv_n_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     zgemv_n_kernel_S4
+       bne     .Lzgemv_n_kernel_S4
 
-zgemv_n_kernel_S1:
+.Lzgemv_n_kernel_S1:
 
        ands    I, M, #3
-       ble     zgemv_n_kernel_S_END
+       ble     .Lzgemv_n_kernel_S_END
 
-zgemv_n_kernel_S10:
+.Lzgemv_n_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     zgemv_n_kernel_S10
+        bne     .Lzgemv_n_kernel_S10
 
-zgemv_n_kernel_S_END:
+.Lzgemv_n_kernel_S_END:
 
        add     A, A, LDA
        subs    J, J, #1
-        bne     zgemv_n_kernel_S_LOOP
+        bne     .Lzgemv_n_kernel_S_LOOP
 
-zgemv_n_kernel_L999:
+.Lzgemv_n_kernel_L999:
        RESTORE_REGS
 
        mov     w0, wzr
index 79ce9bc..0151029 100644 (file)
@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        SAVE_REGS
 
        cmp     N, xzr
-       ble     zgemv_t_kernel_L999
+       ble     .Lzgemv_t_kernel_L999
        cmp     M, xzr
-       ble     zgemv_t_kernel_L999
+       ble     .Lzgemv_t_kernel_L999
 
        lsl     LDA, LDA, #SHZ
        lsl     INC_Y, INC_Y, #SHZ
@@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        INIT
 
        cmp     INC_X, #1
-       bne     zgemv_t_kernel_S_BEGIN
+       bne     .Lzgemv_t_kernel_S_BEGIN
 
-zgemv_t_kernel_F_LOOP:
+.Lzgemv_t_kernel_F_LOOP:
 
        mov     A_PTR, A
        mov     X_PTR, X
@@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:
 
        asr     I, M, #2
        cmp     I, xzr
-       beq     zgemv_t_kernel_F1
+       beq     .Lzgemv_t_kernel_F1
 
-zgemv_t_kernel_F4:
+.Lzgemv_t_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     zgemv_t_kernel_F4
+       bne     .Lzgemv_t_kernel_F4
 
        KERNEL_F4_FINALIZE
 
-zgemv_t_kernel_F1:
+.Lzgemv_t_kernel_F1:
 
        ands    I, M, #3
-       ble     zgemv_t_kernel_F_END
+       ble     .Lzgemv_t_kernel_F_END
 
-zgemv_t_kernel_F10:
+.Lzgemv_t_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     zgemv_t_kernel_F10
+        bne     .Lzgemv_t_kernel_F10
 
-zgemv_t_kernel_F_END:
+.Lzgemv_t_kernel_F_END:
 
 #if !defined(DOUBLE)
        ld1     {v4.2s}, [Y]
@@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:
 
        add     A, A, LDA
        subs    J, J, #1
-        bne     zgemv_t_kernel_F_LOOP
+        bne     .Lzgemv_t_kernel_F_LOOP
 
-       b       zgemv_t_kernel_L999
+       b       .Lzgemv_t_kernel_L999
 
-zgemv_t_kernel_S_BEGIN:
+.Lzgemv_t_kernel_S_BEGIN:
 
        INIT_S
 
-zgemv_t_kernel_S_LOOP:
+.Lzgemv_t_kernel_S_LOOP:
 
        mov     A_PTR, A
        mov     X_PTR, X
@@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:
 
        asr     I, M, #2
        cmp     I, xzr
-       ble     zgemv_t_kernel_S1
+       ble     .Lzgemv_t_kernel_S1
 
-zgemv_t_kernel_S4:
+.Lzgemv_t_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     zgemv_t_kernel_S4
+       bne     .Lzgemv_t_kernel_S4
 
-zgemv_t_kernel_S1:
+.Lzgemv_t_kernel_S1:
 
        ands    I, M, #3
-       ble     zgemv_t_kernel_S_END
+       ble     .Lzgemv_t_kernel_S_END
 
-zgemv_t_kernel_S10:
+.Lzgemv_t_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     zgemv_t_kernel_S10
+        bne     .Lzgemv_t_kernel_S10
 
-zgemv_t_kernel_S_END:
+.Lzgemv_t_kernel_S_END:
 
 #if !defined(DOUBLE)
        ld1     {v4.2s}, [Y]
@@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:
 
        add     A, A, LDA
        subs    J, J, #1
-        bne     zgemv_t_kernel_S_LOOP
+        bne     .Lzgemv_t_kernel_S_LOOP
 
-zgemv_t_kernel_L999:
+.Lzgemv_t_kernel_L999:
        RESTORE_REGS
        mov     w0, wzr
        ret
index 1360dc9..1c89685 100644 (file)
@@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
        INIT
 
        cmp     N, #0
-       ble     nrm2_kernel_L999
+       ble     .Lznrm2_kernel_L999
 
        cmp     INC_X, #0
-       beq     nrm2_kernel_L999
+       beq     .Lznrm2_kernel_L999
 
        cmp     INC_X, #1
-       bne     nrm2_kernel_S_BEGIN
+       bne     .Lznrm2_kernel_S_BEGIN
 
-nrm2_kernel_F_BEGIN:
+.Lznrm2_kernel_F_BEGIN:
 
        asr     I, N, #3                                // I = N / 8
        cmp     I, xzr
-       ble     nrm2_kernel_F1
+       ble     .Lznrm2_kernel_F1
 
-nrm2_kernel_F8:
+.Lznrm2_kernel_F8:
 
        KERNEL_F8
 
        subs    I, I, #1
-        bne     nrm2_kernel_F8
+        bne     .Lznrm2_kernel_F8
 
-nrm2_kernel_F1:
+.Lznrm2_kernel_F1:
 
        ands    I, N, #7
-       ble     nrm2_kernel_L999
+       ble     .Lznrm2_kernel_L999
 
 
-nrm2_kernel_F10:
+.Lznrm2_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-       bne     nrm2_kernel_F10
+       bne     .Lznrm2_kernel_F10
 
-       b       nrm2_kernel_L999
+       b       .Lznrm2_kernel_L999
 
-nrm2_kernel_S_BEGIN:
+.Lznrm2_kernel_S_BEGIN:
 
        INIT_S
 
@@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:
 
        .align 5
 
-nrm2_kernel_S10:
+.Lznrm2_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-       bne     nrm2_kernel_S10
+       bne     .Lznrm2_kernel_S10
 
 
-nrm2_kernel_L999:
+.Lznrm2_kernel_L999:
        fsqrt   SSQ, SSQ
        fmul    SSQ, SCALE, SSQ
 
index 90f138a..b5e510e 100644 (file)
@@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
 
        cmp     N, xzr
-       ble     rot_kernel_L999
+       ble     .Lzrot_kernel_L999
 
        INIT
 
        cmp     INC_X, #1
-       bne     rot_kernel_S_BEGIN
+       bne     .Lzrot_kernel_S_BEGIN
        cmp     INC_Y, #1
-       bne     rot_kernel_S_BEGIN
+       bne     .Lzrot_kernel_S_BEGIN
 
-rot_kernel_F_BEGIN:
+.Lzrot_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     rot_kernel_F1
+       beq     .Lzrot_kernel_F1
 
        KERNEL_INIT_F4
 
-rot_kernel_F4:
+.Lzrot_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     rot_kernel_F4
+       bne     .Lzrot_kernel_F4
 
-rot_kernel_F1:
+.Lzrot_kernel_F1:
 
        ands    I, N, #3
-       ble     rot_kernel_L999
+       ble     .Lzrot_kernel_L999
 
-rot_kernel_F10:
+.Lzrot_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     rot_kernel_F10
+        bne     .Lzrot_kernel_F10
 
        mov     w0, wzr
        ret
 
-rot_kernel_S_BEGIN:
+.Lzrot_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     rot_kernel_S1
+       ble     .Lzrot_kernel_S1
 
-rot_kernel_S4:
+.Lzrot_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -236,21 +236,21 @@ rot_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     rot_kernel_S4
+       bne     .Lzrot_kernel_S4
 
-rot_kernel_S1:
+.Lzrot_kernel_S1:
 
        ands    I, N, #3
-       ble     rot_kernel_L999
+       ble     .Lzrot_kernel_L999
 
-rot_kernel_S10:
+.Lzrot_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     rot_kernel_S10
+        bne     .Lzrot_kernel_S10
 
-rot_kernel_L999:
+.Lzrot_kernel_L999:
 
        mov     w0, wzr
        ret
index daaa55e..9294559 100644 (file)
@@ -215,71 +215,71 @@ zscal_begin:
        mov     X_COPY, X
 
        cmp     N, xzr
-       ble     zscal_kernel_L999
+       ble     .Lzscal_kernel_L999
 
        fcmp    DA_R, #0.0
-       bne     zscal_kernel_R_non_zero
+       bne     .Lzscal_kernel_R_non_zero
 
        fcmp    DA_I, #0.0
-       beq     zscal_kernel_RI_zero
+       beq     .Lzscal_kernel_RI_zero
 
-       b       zscal_kernel_R_zero
+       b       .Lzscal_kernel_R_zero
 
-zscal_kernel_R_non_zero:
+.Lzscal_kernel_R_non_zero:
 
        fcmp    DA_I, #0.0
-       beq     zscal_kernel_I_zero
+       beq     .Lzscal_kernel_I_zero
 
 /*******************************************************************************
 * A_R != 0 && A_I != 0
 *******************************************************************************/
 
-zscal_kernel_RI_non_zero:
+.Lzscal_kernel_RI_non_zero:
 
        INIT
 
        cmp     INC_X, #1
-       bne     zscal_kernel_S_BEGIN
+       bne     .Lzscal_kernel_S_BEGIN
 
-zscal_kernel_F_BEGIN:
+.Lzscal_kernel_F_BEGIN:
 
        asr     I, N, #2
        cmp     I, xzr
-       beq     zscal_kernel_F1
+       beq     .Lzscal_kernel_F1
 
        KERNEL_INIT_F4
 
-zscal_kernel_F4:
+.Lzscal_kernel_F4:
 
        KERNEL_F4
 
        subs    I, I, #1
-       bne     zscal_kernel_F4
+       bne     .Lzscal_kernel_F4
 
-zscal_kernel_F1:
+.Lzscal_kernel_F1:
 
        ands    I, N, #3
-       ble     zscal_kernel_L999
+       ble     .Lzscal_kernel_L999
 
-zscal_kernel_F10:
+.Lzscal_kernel_F10:
 
        KERNEL_F1
 
        subs    I, I, #1
-        bne     zscal_kernel_F10
+        bne     .Lzscal_kernel_F10
 
        mov     w0, wzr
        ret
 
-zscal_kernel_S_BEGIN:
+.Lzscal_kernel_S_BEGIN:
 
        INIT_S
 
        asr     I, N, #2
        cmp     I, xzr
-       ble     zscal_kernel_S1
+       ble     .Lzscal_kernel_S1
 
-zscal_kernel_S4:
+.Lzscal_kernel_S4:
 
        KERNEL_S1
        KERNEL_S1
@@ -287,21 +287,21 @@ zscal_kernel_S4:
        KERNEL_S1
 
        subs    I, I, #1
-       bne     zscal_kernel_S4
+       bne     .Lzscal_kernel_S4
 
-zscal_kernel_S1:
+.Lzscal_kernel_S1:
 
        ands    I, N, #3
-       ble     zscal_kernel_L999
+       ble     .Lzscal_kernel_L999
 
-zscal_kernel_S10:
+.Lzscal_kernel_S10:
 
        KERNEL_S1
 
        subs    I, I, #1
-        bne     zscal_kernel_S10
+        bne     .Lzscal_kernel_S10
 
-zscal_kernel_L999:
+.Lzscal_kernel_L999:
 
        mov     w0, wzr
        ret
@@ -310,7 +310,7 @@ zscal_kernel_L999:
 * A_R == 0 && A_I != 0
 *******************************************************************************/
 
-zscal_kernel_R_zero:
+.Lzscal_kernel_R_zero:
        INIT_S
 
 #if !defined(DOUBLE)
@@ -323,7 +323,7 @@ zscal_kernel_R_zero:
        ins     v1.d[1], v2.d[0]                // v1 = -DA_I, DA_I
 #endif
 
-zscal_kernel_R_zero_1:
+.Lzscal_kernel_R_zero_1:
 #if !defined(DOUBLE)
        ld1     {v2.2s}, [X]                    // X1, X0
        fmul    v2.2s, v2.2s, v1.2s             // -DA_I*X1, DA_I*X0
@@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
 #endif
        add     X, X, INC_X
        subs    N, N, #1
-       bne     zscal_kernel_R_zero_1
+       bne     .Lzscal_kernel_R_zero_1
 
        mov     w0, wzr
        ret
@@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
 * A_R != 0 && A_I == 0
 *******************************************************************************/
 
-zscal_kernel_I_zero:
+.Lzscal_kernel_I_zero:
        INIT_S
 #if !defined(DOUBLE)
        ins     v0.s[1], v0.s[0]                // v0 = DA_R, DA_R
@@ -354,7 +354,7 @@ zscal_kernel_I_zero:
        ins     v0.d[1], v0.d[0]                // v0 = DA_R, DA_R 
 #endif
 
-zscal_kernel_I_zero_1:
+.Lzscal_kernel_I_zero_1:
 #if !defined(DOUBLE)
        ld1     {v2.2s}, [X]                    // X1, X0
        fmul    v2.2s, v2.2s, v0.2s             // DA_R*X1, DA_R*X0
@@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
 #endif
        add     X, X, INC_X
        subs    N, N, #1
-       bne     zscal_kernel_I_zero_1
+       bne     .Lzscal_kernel_I_zero_1
 
        mov     w0, wzr
        ret
@@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
 * A_R == 0 && A_I == 0
 *******************************************************************************/
 
-zscal_kernel_RI_zero:
+.Lzscal_kernel_RI_zero:
 
        INIT_S
 
-zscal_kernel_RI_zero_1:
+.Lzscal_kernel_RI_zero_1:
 
        stp     DA_R, DA_I, [X]
        add     X, X, INC_X
        subs    N, N, #1
-       bne     zscal_kernel_RI_zero_1
+       bne     .Lzscal_kernel_RI_zero_1
 
        mov     w0, wzr
        ret
index 77a7857..462acfe 100644 (file)
@@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mov     counterJ, origN
        asr     counterJ, counterJ, #2          // J = J / 4
        cmp     counterJ, #0
-       ble     ztrmm_kernel_L2_BEGIN
+       ble     .Lztrmm_kernel_L2_BEGIN
 
-ztrmm_kernel_L4_BEGIN:
+.Lztrmm_kernel_L4_BEGIN:
        mov     pCRow0, pC
        add     pCRow1, pCRow0, LDC
        add     pCRow2, pCRow1, LDC
@@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
 #endif
        mov     pA, origPA                      // pA = start of A array
 
-ztrmm_kernel_L4_M4_BEGIN:
+.Lztrmm_kernel_L4_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     ztrmm_kernel_L4_M2_BEGIN
+       ble     .Lztrmm_kernel_L4_M2_BEGIN
 
        .align 5
-ztrmm_kernel_L4_M4_20:
+.Lztrmm_kernel_L4_M4_20:
 
 #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mov     pB, origPB
@@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:
 
        asr     counterL , tempK, #3
        cmp     counterL , #2
-       blt     ztrmm_kernel_L4_M4_32
+       blt     .Lztrmm_kernel_L4_M4_32
 
        KERNEL4x4_I
        KERNEL4x4_M2
@@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
        KERNEL4x4_M2
 
        subs    counterL, counterL, #2
-       ble     ztrmm_kernel_L4_M4_22a
+       ble     .Lztrmm_kernel_L4_M4_22a
 
        .align 5
-ztrmm_kernel_L4_M4_22:
+.Lztrmm_kernel_L4_M4_22:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
@@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
        KERNEL4x4_M2
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L4_M4_22
+       bgt     .Lztrmm_kernel_L4_M4_22
 
        .align 5
-ztrmm_kernel_L4_M4_22a:
+.Lztrmm_kernel_L4_M4_22a:
 
        KERNEL4x4_M1
        KERNEL4x4_M2
@@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b        ztrmm_kernel_L4_M4_44
+       b        .Lztrmm_kernel_L4_M4_44
 
        .align 5
-ztrmm_kernel_L4_M4_32:
+.Lztrmm_kernel_L4_M4_32:
 
        tst     counterL, #1
-       ble     ztrmm_kernel_L4_M4_40
+       ble     .Lztrmm_kernel_L4_M4_40
 
        KERNEL4x4_I
        KERNEL4x4_M2
@@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
        KERNEL4x4_M1
        KERNEL4x4_E
 
-       b       ztrmm_kernel_L4_M4_44
+       b       .Lztrmm_kernel_L4_M4_44
 
 
-ztrmm_kernel_L4_M4_40:
+.Lztrmm_kernel_L4_M4_40:
 
        INIT4x4
 
-ztrmm_kernel_L4_M4_44:
+.Lztrmm_kernel_L4_M4_44:
 
        ands    counterL , tempK, #7
-       ble     ztrmm_kernel_L4_M4_100
+       ble     .Lztrmm_kernel_L4_M4_100
 
        .align 5
-ztrmm_kernel_L4_M4_46:
+.Lztrmm_kernel_L4_M4_46:
        KERNEL4x4_SUB
 
        subs    counterL, counterL, #1
-       bne     ztrmm_kernel_L4_M4_46
+       bne     .Lztrmm_kernel_L4_M4_46
 
-ztrmm_kernel_L4_M4_100:
+.Lztrmm_kernel_L4_M4_100:
 
        SAVE4x4
 
@@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
        prfm    PLDL1KEEP, [pA, #64]
        prfm    PLDL1KEEP, [origPB]
 
-ztrmm_kernel_L4_M4_END:
+.Lztrmm_kernel_L4_M4_END:
        subs    counterI, counterI, #1
-       bne     ztrmm_kernel_L4_M4_20
+       bne     .Lztrmm_kernel_L4_M4_20
 
-ztrmm_kernel_L4_M2_BEGIN:
+.Lztrmm_kernel_L4_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ztrmm_kernel_L4_END
+       ble     .Lztrmm_kernel_L4_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ztrmm_kernel_L4_M1_BEGIN
+       ble     .Lztrmm_kernel_L4_M1_BEGIN
 
-ztrmm_kernel_L4_M2_20:
+.Lztrmm_kernel_L4_M2_20:
 
        INIT2x4
 
@@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ztrmm_kernel_L4_M2_40
+       ble     .Lztrmm_kernel_L4_M2_40
 
-ztrmm_kernel_L4_M2_22:
+.Lztrmm_kernel_L4_M2_22:
 
        KERNEL2x4_SUB
        KERNEL2x4_SUB
@@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L4_M2_22
+       bgt     .Lztrmm_kernel_L4_M2_22
 
 
-ztrmm_kernel_L4_M2_40:
+.Lztrmm_kernel_L4_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L4_M2_100
+       ble     .Lztrmm_kernel_L4_M2_100
 
-ztrmm_kernel_L4_M2_42:
+.Lztrmm_kernel_L4_M2_42:
 
        KERNEL2x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L4_M2_42
+       bgt     .Lztrmm_kernel_L4_M2_42
 
-ztrmm_kernel_L4_M2_100:
+.Lztrmm_kernel_L4_M2_100:
 
        SAVE2x4
 
@@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ztrmm_kernel_L4_M2_END:
+.Lztrmm_kernel_L4_M2_END:
 
 
-ztrmm_kernel_L4_M1_BEGIN:
+.Lztrmm_kernel_L4_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ztrmm_kernel_L4_END
+       ble     .Lztrmm_kernel_L4_END
 
-ztrmm_kernel_L4_M1_20:
+.Lztrmm_kernel_L4_M1_20:
 
        INIT1x4
 
@@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ztrmm_kernel_L4_M1_40
+       ble     .Lztrmm_kernel_L4_M1_40
 
-ztrmm_kernel_L4_M1_22:
+.Lztrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
        KERNEL1x4_SUB
        KERNEL1x4_SUB
@@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L4_M1_22
+       bgt     .Lztrmm_kernel_L4_M1_22
 
 
-ztrmm_kernel_L4_M1_40:
+.Lztrmm_kernel_L4_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L4_M1_100
+       ble     .Lztrmm_kernel_L4_M1_100
 
-ztrmm_kernel_L4_M1_42:
+.Lztrmm_kernel_L4_M1_42:
 
        KERNEL1x4_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L4_M1_42
+       bgt     .Lztrmm_kernel_L4_M1_42
 
-ztrmm_kernel_L4_M1_100:
+.Lztrmm_kernel_L4_M1_100:
 
        SAVE1x4
 
@@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
 #endif
 
 
-ztrmm_kernel_L4_END:
+.Lztrmm_kernel_L4_END:
 
        lsl     temp, origK, #6
        add     origPB, origPB, temp            // B = B + K * 4 * 8 * 2
@@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
 #endif
 
        subs    counterJ, counterJ , #1         // j--
-       bgt     ztrmm_kernel_L4_BEGIN
+       bgt     .Lztrmm_kernel_L4_BEGIN
 
 
 /******************************************************************************/
 
-ztrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lztrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     counterJ , origN
        tst     counterJ , #3
-       ble     ztrmm_kernel_L999   // error, N was less than 4?
+       ble     .Lztrmm_kernel_L999   // error, N was less than 4?
 
        tst     counterJ , #2
-       ble     ztrmm_kernel_L1_BEGIN
+       ble     .Lztrmm_kernel_L1_BEGIN
 
        mov     pCRow0, pC                      // pCRow0 = pC
 
@@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
 
        mov     pA, origPA                      // pA = A
 
-ztrmm_kernel_L2_M4_BEGIN:
+.Lztrmm_kernel_L2_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI,#0
-       ble     ztrmm_kernel_L2_M2_BEGIN
+       ble     .Lztrmm_kernel_L2_M2_BEGIN
 
-ztrmm_kernel_L2_M4_20:
+.Lztrmm_kernel_L2_M4_20:
 
        INIT4x2
 
@@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL,#0
-       ble     ztrmm_kernel_L2_M4_40
+       ble     .Lztrmm_kernel_L2_M4_40
        .align 5
 
-ztrmm_kernel_L2_M4_22:
+.Lztrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
        KERNEL4x2_SUB
        KERNEL4x2_SUB
@@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L2_M4_22
+       bgt     .Lztrmm_kernel_L2_M4_22
 
 
-ztrmm_kernel_L2_M4_40:
+.Lztrmm_kernel_L2_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L2_M4_100
+       ble     .Lztrmm_kernel_L2_M4_100
 
-ztrmm_kernel_L2_M4_42:
+.Lztrmm_kernel_L2_M4_42:
 
        KERNEL4x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L2_M4_42
+       bgt     .Lztrmm_kernel_L2_M4_42
 
-ztrmm_kernel_L2_M4_100:
+.Lztrmm_kernel_L2_M4_100:
 
        SAVE4x2
 
@@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ztrmm_kernel_L2_M4_END:
+.Lztrmm_kernel_L2_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     ztrmm_kernel_L2_M4_20
+       bgt     .Lztrmm_kernel_L2_M4_20
 
 
-ztrmm_kernel_L2_M2_BEGIN:
+.Lztrmm_kernel_L2_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ztrmm_kernel_L2_END
+       ble     .Lztrmm_kernel_L2_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ztrmm_kernel_L2_M1_BEGIN
+       ble     .Lztrmm_kernel_L2_M1_BEGIN
 
-ztrmm_kernel_L2_M2_20:
+.Lztrmm_kernel_L2_M2_20:
 
        INIT2x2
 
@@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp    counterL,#0
-       ble     ztrmm_kernel_L2_M2_40
+       ble     .Lztrmm_kernel_L2_M2_40
 
-ztrmm_kernel_L2_M2_22:
+.Lztrmm_kernel_L2_M2_22:
 
        KERNEL2x2_SUB
        KERNEL2x2_SUB
@@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L2_M2_22
+       bgt     .Lztrmm_kernel_L2_M2_22
 
 
-ztrmm_kernel_L2_M2_40:
+.Lztrmm_kernel_L2_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L2_M2_100
+       ble     .Lztrmm_kernel_L2_M2_100
 
-ztrmm_kernel_L2_M2_42:
+.Lztrmm_kernel_L2_M2_42:
 
        KERNEL2x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L2_M2_42
+       bgt     .Lztrmm_kernel_L2_M2_42
 
-ztrmm_kernel_L2_M2_100:
+.Lztrmm_kernel_L2_M2_100:
 
        SAVE2x2
 
@@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ztrmm_kernel_L2_M2_END:
+.Lztrmm_kernel_L2_M2_END:
 
 
-ztrmm_kernel_L2_M1_BEGIN:
+.Lztrmm_kernel_L2_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ztrmm_kernel_L2_END
+       ble     .Lztrmm_kernel_L2_END
 
-ztrmm_kernel_L2_M1_20:
+.Lztrmm_kernel_L2_M1_20:
 
        INIT1x2
 
@@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
         cmp     counterL, #0
-       ble     ztrmm_kernel_L2_M1_40
+       ble     .Lztrmm_kernel_L2_M1_40
 
-ztrmm_kernel_L2_M1_22:
+.Lztrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
        KERNEL1x2_SUB
        KERNEL1x2_SUB
@@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L2_M1_22
+       bgt     .Lztrmm_kernel_L2_M1_22
 
 
-ztrmm_kernel_L2_M1_40:
+.Lztrmm_kernel_L2_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L2_M1_100
+       ble     .Lztrmm_kernel_L2_M1_100
 
-ztrmm_kernel_L2_M1_42:
+.Lztrmm_kernel_L2_M1_42:
 
        KERNEL1x2_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L2_M1_42
+       bgt     .Lztrmm_kernel_L2_M1_42
 
-ztrmm_kernel_L2_M1_100:
+.Lztrmm_kernel_L2_M1_100:
 
        SAVE1x2
 
@@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
 #endif
 
 
-ztrmm_kernel_L2_END:
+.Lztrmm_kernel_L2_END:
 #if !defined(LEFT)
        add     tempOffset, tempOffset, #2
 #endif
@@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:
 
 /******************************************************************************/
 
-ztrmm_kernel_L1_BEGIN:
+.Lztrmm_kernel_L1_BEGIN:
 
        mov     counterJ , origN
        tst     counterJ , #1
-       ble     ztrmm_kernel_L999 // done
+       ble     .Lztrmm_kernel_L999 // done
 
 
        mov     pCRow0, pC                      // pCRow0 = C
@@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:
 
 
 
-ztrmm_kernel_L1_M4_BEGIN:
+.Lztrmm_kernel_L1_M4_BEGIN:
 
        mov     counterI, origM
        asr     counterI, counterI, #2          // counterI = counterI / 4
        cmp     counterI, #0
-       ble     ztrmm_kernel_L1_M2_BEGIN
+       ble     .Lztrmm_kernel_L1_M2_BEGIN
 
-ztrmm_kernel_L1_M4_20:
+.Lztrmm_kernel_L1_M4_20:
 
        INIT4x1
 
@@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ztrmm_kernel_L1_M4_40
+       ble     .Lztrmm_kernel_L1_M4_40
        .align 5
 
-ztrmm_kernel_L1_M4_22:
+.Lztrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
        KERNEL4x1_SUB
        KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L1_M4_22
+       bgt     .Lztrmm_kernel_L1_M4_22
 
 
-ztrmm_kernel_L1_M4_40:
+.Lztrmm_kernel_L1_M4_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L1_M4_100
+       ble     .Lztrmm_kernel_L1_M4_100
 
-ztrmm_kernel_L1_M4_42:
+.Lztrmm_kernel_L1_M4_42:
 
        KERNEL4x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L1_M4_42
+       bgt     .Lztrmm_kernel_L1_M4_42
 
-ztrmm_kernel_L1_M4_100:
+.Lztrmm_kernel_L1_M4_100:
 
        SAVE4x1
 
@@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
        add     tempOffset, tempOffset, #4
 #endif
 
-ztrmm_kernel_L1_M4_END:
+.Lztrmm_kernel_L1_M4_END:
 
        subs    counterI, counterI, #1
-       bgt     ztrmm_kernel_L1_M4_20
+       bgt     .Lztrmm_kernel_L1_M4_20
 
 
-ztrmm_kernel_L1_M2_BEGIN:
+.Lztrmm_kernel_L1_M2_BEGIN:
 
        mov     counterI, origM
        tst     counterI , #3
-       ble     ztrmm_kernel_L1_END
+       ble     .Lztrmm_kernel_L1_END
 
        tst     counterI, #2                    // counterI = counterI / 2
-       ble     ztrmm_kernel_L1_M1_BEGIN
+       ble     .Lztrmm_kernel_L1_M1_BEGIN
 
-ztrmm_kernel_L1_M2_20:
+.Lztrmm_kernel_L1_M2_20:
 
        INIT2x1
 
@@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ztrmm_kernel_L1_M2_40
+       ble     .Lztrmm_kernel_L1_M2_40
 
-ztrmm_kernel_L1_M2_22:
+.Lztrmm_kernel_L1_M2_22:
 
        KERNEL2x1_SUB
        KERNEL2x1_SUB
@@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L1_M2_22
+       bgt     .Lztrmm_kernel_L1_M2_22
 
 
-ztrmm_kernel_L1_M2_40:
+.Lztrmm_kernel_L1_M2_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L1_M2_100
+       ble     .Lztrmm_kernel_L1_M2_100
 
-ztrmm_kernel_L1_M2_42:
+.Lztrmm_kernel_L1_M2_42:
 
        KERNEL2x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L1_M2_42
+       bgt     .Lztrmm_kernel_L1_M2_42
 
-ztrmm_kernel_L1_M2_100:
+.Lztrmm_kernel_L1_M2_100:
 
        SAVE2x1
 
@@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
        add     tempOffset, tempOffset, #2
 #endif
 
-ztrmm_kernel_L1_M2_END:
+.Lztrmm_kernel_L1_M2_END:
 
 
-ztrmm_kernel_L1_M1_BEGIN:
+.Lztrmm_kernel_L1_M1_BEGIN:
 
        tst     counterI, #1                    // counterI = counterI % 2
-       ble     ztrmm_kernel_L1_END
+       ble     .Lztrmm_kernel_L1_END
 
-ztrmm_kernel_L1_M1_20:
+.Lztrmm_kernel_L1_M1_20:
 
        INIT1x1
 
@@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:
 
        asr     counterL , tempK, #3            // counterL = counterL / 8
        cmp     counterL , #0
-       ble     ztrmm_kernel_L1_M1_40
+       ble     .Lztrmm_kernel_L1_M1_40
 
-ztrmm_kernel_L1_M1_22:
+.Lztrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
        KERNEL1x1_SUB
        KERNEL1x1_SUB
@@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L1_M1_22
+       bgt     .Lztrmm_kernel_L1_M1_22
 
 
-ztrmm_kernel_L1_M1_40:
+.Lztrmm_kernel_L1_M1_40:
 
        ands    counterL , tempK, #7            // counterL = counterL % 8
-       ble     ztrmm_kernel_L1_M1_100
+       ble     .Lztrmm_kernel_L1_M1_100
 
-ztrmm_kernel_L1_M1_42:
+.Lztrmm_kernel_L1_M1_42:
 
        KERNEL1x1_SUB
 
        subs    counterL, counterL, #1
-       bgt     ztrmm_kernel_L1_M1_42
+       bgt     .Lztrmm_kernel_L1_M1_42
 
-ztrmm_kernel_L1_M1_100:
+.Lztrmm_kernel_L1_M1_100:
 
        SAVE1x1
 
 
-ztrmm_kernel_L1_END:
+.Lztrmm_kernel_L1_END:
 
 
-ztrmm_kernel_L999:
+.Lztrmm_kernel_L999:
        mov     x0, #0                          // set return value
        ldp     d8, d9, [sp, #(0 * 16)]
        ldp     d10, d11, [sp, #(1 * 16)]