added cgemm_tcopy_8_power8.S
author Werner Saar <wernsaar@googlemail.com>
Sat, 23 Apr 2016 05:37:18 +0000 (07:37 +0200)
committer Werner Saar <wernsaar@googlemail.com>
Sat, 23 Apr 2016 05:37:18 +0000 (07:37 +0200)
kernel/power/KERNEL.POWER8
kernel/power/cgemm_tcopy_8_power8.S [new file with mode: 0644]
kernel/power/cgemm_tcopy_logic_8_power8.S [new file with mode: 0644]
kernel/power/cgemm_tcopy_macros_8_power8.S [new file with mode: 0644]

index c7df0e0..9406e77 100644 (file)
@@ -30,7 +30,7 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
 CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
-CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
+CGEMMITCOPY    = cgemm_tcopy_8_power8.S
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
 CGEMMONCOPYOBJ =  cgemm_oncopy.o
diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S
new file mode 100644 (file)
index 0000000..b1a7d2b
--- /dev/null
@@ -0,0 +1,206 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#define        M       r3      /* in: count of LDA-strided source vectors (consumed 4, 2, 1 at a time) */
+#define        N       r4      /* in: elements per source vector (consumed 8, 4, 2, 1 at a time) */
+#define        A       r5      /* in: source pointer; stepped by LDA for every vector consumed */
+#define        LDA     r6      /* in: source stride; shifted left by ZBASE_SHIFT at entry */
+#define B      r7      /* in: destination cursor for the 8-element panels */
+/* Read pointers for up to four source vectors; the COPY_* macros advance them. */
+#define A0     r8
+#define A1     r9
+#define A2     r10
+#define A3     r11
+/* J counts the 8-element panels left in the current vector group (N >> 3). */
+#define J      r12
+/* The aliases below live in r14-r31, which the routine saves on entry and reloads on exit. */
+#define PREA   r14     /* dcbt offset (384); holds the -4 mask briefly at entry */
+#define PREB   r15     /* dcbtst offset (M8 + 128); holds the -2 mask briefly at entry */
+#define BO     r16     /* destination pointer handed to the COPY_* macros */
+#define B8     r17     /* snapshot of B for the current vector group */
+#define B4     r18     /* destination cursor for the 4-element remainder */
+#define B2     r19     /* destination cursor for the 2-element remainder */
+#define B1     r20     /* destination cursor for the 1-element remainder */
+#define o4     r21     /* index register holding 4 */
+#define T2     r22     /* scratch; holds the -8 mask at entry */
+#define I      r23     /* counter of 4-vector groups */
+#define o16    r24     /* index register holding 16 */
+#define o32    r25     /* index register holding 32 */
+#define o48    r26     /* index register holding 48 */
+#define NOTUS2  r27    /* not referenced by the code in this routine */
+#define M8     r30     /* M << (3 + ZBASE_SHIFT): added to BO between 8-element panels */
+#define T1     r31     /* scratch: store pointer inside COPY_* and target of andi. tests */
+/* o0 is the literal 0 used in the index slot of the indexed loads and stores. */
+#define o0     0
+
+#include "cgemm_tcopy_macros_8_power8.S"
+/* Bytes reserved below SP for the register save area used in the prologue. */
+#define STACKSIZE 384
+
+
+       PROLOGUE
+       PROFCODE
+/* Open a STACKSIZE-byte frame and save r14-r31; the working aliases live in that range. */
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+        std     r31,  144(SP)
+        std     r30,  152(SP)
+        std     r29,  160(SP)
+        std     r28,  168(SP)
+        std     r27,  176(SP)
+        std     r26,  184(SP)
+        std     r25,  192(SP)
+        std     r24,  200(SP)
+        std     r23,  208(SP)
+        std     r22,  216(SP)
+        std     r21,  224(SP)
+        std     r20,  232(SP)
+        std     r19,  240(SP)
+        std     r18,  248(SP)
+        std     r17,  256(SP)
+        std     r16,  264(SP)
+        std     r15,  272(SP)
+        std     r14,  280(SP)
+/* Nothing to do when either dimension is <= 0. Doubleword compares look at all 64 bits. */
+       cmpdi   cr0, M, 0
+       ble-    L999
+       cmpdi   cr0, N, 0
+       ble-    L999
+/* Doubleword shifts: slwi would cut the byte stride and the panel stride down to 32 bits. */
+       sldi    LDA, LDA, ZBASE_SHIFT
+       sldi    M8, M, 3 + ZBASE_SHIFT
+/* Masks that round N down to a multiple of 8, 4 and 2. */
+       li      T2,     -8
+       li      PREA,   -4
+       li      PREB,   -2
+
+       and     B4, N, T2
+       and     B2, N, PREA
+       and     B1, N, PREB
+/* Bx = B + (rounded N) * M elements, computed in 64 bits: start of each remainder area. */
+       mulld   B4, B4, M
+       mulld   B2, B2, M
+       mulld   B1, B1, M
+
+       sldi    B4, B4, ZBASE_SHIFT
+       sldi    B2, B2, ZBASE_SHIFT
+       sldi    B1, B1, ZBASE_SHIFT
+
+       add     B4, B4, B
+       add     B2, B2, B
+       add     B1, B1, B
+/* PREA and PREB are reused from here on as the prefetch offsets for dcbt and dcbtst. */
+       li      PREA,  384
+       addi    PREB,  M8,      128
+/* Index registers for the indexed loads and stores in the COPY_* macros. */
+       li      o4,     4
+       li      o16,    16
+       li      o32,    32
+       li      o48,    48
+
+#include "cgemm_tcopy_logic_8_power8.S"
+
+L999:
+/* Common exit: return 0 in r3, reload the saved registers and release the frame. */
+       li      r3, 0
+
+        ld      r31,  144(SP)
+        ld      r30,  152(SP)
+        ld      r29,  160(SP)
+        ld      r28,  168(SP)
+        ld      r27,  176(SP)
+        ld      r26,  184(SP)
+        ld      r25,  192(SP)
+        ld      r24,  200(SP)
+        ld      r23,  208(SP)
+        ld      r22,  216(SP)
+        ld      r21,  224(SP)
+        ld      r20,  232(SP)
+        ld      r19,  240(SP)
+        ld      r18,  248(SP)
+        ld      r17,  256(SP)
+        ld      r16,  264(SP)
+        ld      r15,  272(SP)
+        ld      r14,  280(SP)
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+       EPILOGUE
+
+
diff --git a/kernel/power/cgemm_tcopy_logic_8_power8.S b/kernel/power/cgemm_tcopy_logic_8_power8.S
new file mode 100644 (file)
index 0000000..9418908
--- /dev/null
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+
+       sradi.          I,      M,      2
+       ble             CCOPYT_L2_BEGIN
+/* I = number of 4-vector groups. sradi is the doubleword shift already used for N below; */
+/* the earlier srawi only examined the low 32 bits of M. */
+CCOPYT_L4_BEGIN:
+/* Point A0..A3 at four consecutive vectors, then step A and B past this group. */
+       mr              A0,     A
+       add             A1,     A0,     LDA
+       add             A2,     A1,     LDA
+       add             A3,     A2,     LDA
+       add             A,      A3,     LDA
+       mr              B8,     B
+       addi            B,      B,      64*SIZE
+
+       sradi.          J,      N,      3
+       ble             CCOPYT_L4x4_BEGIN
+
+       mr              BO,     B8
+/* Full 8-element panels, two per trip; BO moves by M8 bytes from one panel to the next. */
+CCOPYT_L4x8_LOOP:
+
+       dcbt            A0, PREA
+       dcbt            A1, PREA
+       dcbt            A2, PREA
+       dcbt            A3, PREA
+       dcbtst          BO, M8
+       dcbtst          BO, PREB
+       COPY_4x8
+
+       add             BO,     BO,     M8
+
+       addic.          J,      J,      -1
+       ble             CCOPYT_L4x4_BEGIN
+
+/* Second half of the unrolled pair; it issues no prefetches of its own. */
+       COPY_4x8
+
+       add             BO,     BO,     M8
+
+       addic.          J,      J,      -1
+       bgt             CCOPYT_L4x8_LOOP
+/* Remainders of 4, 2 and 1 elements are written through the B4, B2 and B1 cursors. */
+CCOPYT_L4x4_BEGIN:
+
+       andi.           T1,     N,      4
+       ble             CCOPYT_L4x2_BEGIN
+
+       mr              BO,     B4
+
+       COPY_4x4
+
+
+       addi            B4,     B4,     32*SIZE
+
+CCOPYT_L4x2_BEGIN:
+
+       andi.           T1,     N,      2
+       ble             CCOPYT_L4x1_BEGIN
+
+       mr              BO,     B2
+
+       COPY_4x2
+
+
+       addi            B2,     B2,     16*SIZE
+
+CCOPYT_L4x1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             CCOPYT_L4_END
+
+       mr              BO,     B1
+
+       COPY_4x1
+
+
+       addi            B1,     B1,     8*SIZE
+
+CCOPYT_L4_END:
+/* Next group of four vectors, if any. */
+       addic.          I,      I,      -1
+       bgt             CCOPYT_L4_BEGIN
+
+
+
+CCOPYT_L2_BEGIN:
+/* Bit 1 of M set: one leftover pair of vectors. andi. never yields a negative, so beq. */
+
+       andi.           T1,     M,      2
+       beq             CCOPYT_L1_BEGIN
+
+       mr              B8,     B
+       addi            B,      B,      32*SIZE
+       mr              A0,     A
+       add             A1,     A0,     LDA
+       add             A,      A1,     LDA
+
+       sradi.          J,      N,      3
+       ble             CCOPYT_L2x4_BEGIN
+
+       mr              BO,     B8
+
+CCOPYT_L2x8_LOOP:
+/* One 8-element panel per trip; the plain add leaves CR0 from addic. intact for bgt. */
+       COPY_2x8
+
+       addic.          J,      J,      -1
+       add             BO,     BO,     M8
+       bgt             CCOPYT_L2x8_LOOP
+
+
+CCOPYT_L2x4_BEGIN:
+/* 4-element remainder goes through the B4 cursor, which is bumped before the copy. */
+
+       andi.           T1,     N,      4
+       beq             CCOPYT_L2x2_BEGIN
+
+       mr              BO,     B4
+       addi            B4,     B4,     16*SIZE
+
+       COPY_2x4
+
+
+CCOPYT_L2x2_BEGIN:
+/* 2-element remainder goes through the B2 cursor. */
+
+       andi.           T1,     N,      2
+       beq             CCOPYT_L2x1_BEGIN
+
+       mr              BO,     B2
+       addi            B2,     B2,     8*SIZE
+
+       COPY_2x2
+
+
+CCOPYT_L2x1_BEGIN:
+/* Single-element remainder goes through the B1 cursor. */
+
+       andi.           T1,     N,      1
+       beq             CCOPYT_L2_END
+
+       mr              BO,     B1
+       addi            B1,     B1,     4*SIZE
+
+       COPY_2x1
+
+CCOPYT_L2_END:
+
+
+CCOPYT_L1_BEGIN:
+/* Bit 0 of M set: one last single vector. andi. never yields a negative, so beq. */
+
+       andi.           T1,     M,      1
+       beq             L999
+
+       mr              B8,     B
+       addi            B,      B,      16*SIZE
+       mr              A0,     A
+       add             A,      A0,     LDA
+
+       sradi.          J,      N,      3
+       ble             CCOPYT_L1x4_BEGIN
+
+       mr              BO,     B8
+
+CCOPYT_L1x8_LOOP:
+/* One 8-element panel per trip; the plain add leaves CR0 from addic. intact for bgt. */
+       COPY_1x8
+
+       addic.          J,      J,      -1
+       add             BO,     BO,     M8
+       bgt             CCOPYT_L1x8_LOOP
+
+
+CCOPYT_L1x4_BEGIN:
+/* 4-element remainder goes through the B4 cursor, which is bumped before the copy. */
+
+       andi.           T1,     N,      4
+       beq             CCOPYT_L1x2_BEGIN
+
+       mr              BO,     B4
+       addi            B4,     B4,     8*SIZE
+
+       COPY_1x4
+
+
+CCOPYT_L1x2_BEGIN:
+/* 2-element remainder goes through the B2 cursor. */
+
+       andi.           T1,     N,      2
+       beq             CCOPYT_L1x1_BEGIN
+
+       mr              BO,     B2
+       addi            B2,     B2,     4*SIZE
+
+       COPY_1x2
+
+
+CCOPYT_L1x1_BEGIN:
+/* Single-element remainder goes through the B1 cursor. */
+
+       andi.           T1,     N,      1
+       beq             CCOPYT_L1_END
+
+       mr              BO,     B1
+       addi            B1,     B1,     2*SIZE
+
+       COPY_1x1
+
+CCOPYT_L1_END:
+
diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S
new file mode 100644 (file)
index 0000000..03fda27
--- /dev/null
@@ -0,0 +1,385 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/23 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* COPY_4x8: 4 source vectors (M direction of the driver) x 8 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 64 bytes at each of A0..A3, writes 256 bytes at BO; overwrites T1 and vs32-vs47. */
+.macro COPY_4x8
+/* Load the A0 and A1 vectors; the index operands o0/o16/o32/o48 are 0/16/32/48. */
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o16,    A0
+       lxvw4x          vs34,   o32,    A0
+       lxvw4x          vs35,   o48,    A0
+
+       lxvw4x          vs36,   o0,     A1
+       lxvw4x          vs37,   o16,    A1
+       lxvw4x          vs38,   o32,    A1
+       lxvw4x          vs39,   o48,    A1
+/* Step the two pointers already consumed past the 64 bytes just read. */
+       addi            A0,     A0,     64
+       addi            A1,     A1,     64
+
+       lxvw4x          vs40,   o0,     A2
+       lxvw4x          vs41,   o16,    A2
+       lxvw4x          vs42,   o32,    A2
+       lxvw4x          vs43,   o48,    A2
+
+       lxvw4x          vs44,   o0,     A3
+       lxvw4x          vs45,   o16,    A3
+       lxvw4x          vs46,   o32,    A3
+       lxvw4x          vs47,   o48,    A3
+/* T1 walks the destination, so BO itself is left unchanged for the caller. */
+       mr              T1,     BO
+       addi            A2,     A2,     64
+       addi            A3,     A3,     64
+/* Store the four vectors back to back: data from A0 first, then A1, A2, A3. */
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+       stxvw4x         vs34,   o32,    T1
+       stxvw4x         vs35,   o48,    T1
+
+       addi            T1,     T1,     64
+
+       stxvw4x         vs36,   o0,     T1
+       stxvw4x         vs37,   o16,    T1
+       stxvw4x         vs38,   o32,    T1
+       stxvw4x         vs39,   o48,    T1
+
+       addi            T1,     T1,     64
+
+       stxvw4x         vs40,   o0,     T1
+       stxvw4x         vs41,   o16,    T1
+       stxvw4x         vs42,   o32,    T1
+       stxvw4x         vs43,   o48,    T1
+
+       addi            T1,     T1,     64
+
+       stxvw4x         vs44,   o0,     T1
+       stxvw4x         vs45,   o16,    T1
+       stxvw4x         vs46,   o32,    T1
+       stxvw4x         vs47,   o48,    T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_4x4: 4 source vectors (M direction of the driver) x 4 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 32 bytes at each of A0..A3, writes 128 bytes at BO; overwrites T1 and vs32-vs39. */
+.macro COPY_4x4
+
+       mr              T1,     BO
+
+/* Gather 2 x 16 bytes from every source vector. */
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o16,    A0
+       lxvw4x          vs34,   o0,     A1
+       lxvw4x          vs35,   o16,    A1
+       lxvw4x          vs36,   o0,     A2
+       lxvw4x          vs37,   o16,    A2
+       lxvw4x          vs38,   o0,     A3
+       lxvw4x          vs39,   o16,    A3
+
+/* Step all four read pointers past the 32 bytes just consumed. */
+       addi            A0,     A0,     32
+       addi            A1,     A1,     32
+       addi            A2,     A2,     32
+       addi            A3,     A3,     32
+
+/* First 64 destination bytes: data from A0, then A1. */
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+       stxvw4x         vs34,   o32,    T1
+       stxvw4x         vs35,   o48,    T1
+
+       addi            T1,     T1,     64
+
+/* Second 64 destination bytes: data from A2, then A3. */
+       stxvw4x         vs36,   o0,     T1
+       stxvw4x         vs37,   o16,    T1
+       stxvw4x         vs38,   o32,    T1
+       stxvw4x         vs39,   o48,    T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_4x2: 4 source vectors (M direction of the driver) x 2 elements of 8 bytes (N direction)
+**********************************************************************************************/
+
+.macro COPY_4x2
+
+/* Reads 16 bytes at each of A0..A3, writes 64 bytes at BO; overwrites T1 and vs32-vs35. */
+
+       mr              T1,     BO
+
+/* One 16-byte load per source vector. */
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o0,     A1
+       lxvw4x          vs34,   o0,     A2
+       lxvw4x          vs35,   o0,     A3
+
+/* Step all four read pointers past the 16 bytes just consumed. */
+       addi            A0,     A0,     16
+       addi            A1,     A1,     16
+       addi            A2,     A2,     16
+       addi            A3,     A3,     16
+
+/* Destination order follows the source vectors: A0, A1, A2, A3. */
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+       stxvw4x         vs34,   o32,    T1
+       stxvw4x         vs35,   o48,    T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_4x1: 4 source vectors (M direction of the driver) x 1 element of 8 bytes (N direction)
+**********************************************************************************************/
+
+.macro COPY_4x1
+
+/* Reads 8 bytes at each of A0..A3, writes 32 bytes at BO; overwrites T1 and vs32-vs39. */
+
+       mr              T1,     BO
+
+/* Each element is fetched as two 4-byte scalars, at offsets 0 and 4 (o4). */
+       lxsspx          vs32,   o0,     A0
+       lxsspx          vs33,   o4,     A0
+       lxsspx          vs34,   o0,     A1
+       lxsspx          vs35,   o4,     A1
+       lxsspx          vs36,   o0,     A2
+       lxsspx          vs37,   o4,     A2
+       lxsspx          vs38,   o0,     A3
+       lxsspx          vs39,   o4,     A3
+
+/* Step all four read pointers past the element just consumed. */
+       addi            A0,     A0,     8
+       addi            A1,     A1,     8
+       addi            A2,     A2,     8
+       addi            A3,     A3,     8
+
+/* Write the four elements back to back, moving T1 by 8 between them. */
+       stxsspx         vs32,   o0,     T1
+       stxsspx         vs33,   o4,     T1
+       addi            T1,     T1,     8
+
+       stxsspx         vs34,   o0,     T1
+       stxsspx         vs35,   o4,     T1
+       addi            T1,     T1,     8
+
+       stxsspx         vs36,   o0,     T1
+       stxsspx         vs37,   o4,     T1
+       addi            T1,     T1,     8
+
+       stxsspx         vs38,   o0,     T1
+       stxsspx         vs39,   o4,     T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_2x8: 2 source vectors (M direction of the driver) x 8 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 64 bytes at each of A0 and A1, writes 128 bytes at BO; overwrites T1 and vs32-vs39. */
+.macro COPY_2x8
+
+       mr              T1,     BO
+
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o16,    A0
+       lxvw4x          vs34,   o32,    A0
+       lxvw4x          vs35,   o48,    A0
+       lxvw4x          vs36,   o0,     A1
+       lxvw4x          vs37,   o16,    A1
+       lxvw4x          vs38,   o32,    A1
+       lxvw4x          vs39,   o48,    A1
+
+       addi            A0,     A0,     64
+       addi            A1,     A1,     64
+
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+       stxvw4x         vs34,   o32,    T1
+       stxvw4x         vs35,   o48,    T1
+
+       addi            T1,     T1,     64
+
+       stxvw4x         vs36,   o0,     T1
+       stxvw4x         vs37,   o16,    T1
+       stxvw4x         vs38,   o32,    T1
+       stxvw4x         vs39,   o48,    T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_2x4: 2 source vectors (M direction of the driver) x 4 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 32 bytes at each of A0 and A1, writes 64 bytes at BO; overwrites T1 and vs32-vs35. */
+.macro COPY_2x4
+
+       mr              T1,     BO
+
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o16,    A0
+       lxvw4x          vs34,   o0,     A1
+       lxvw4x          vs35,   o16,    A1
+
+       addi            A0,     A0,     32
+       addi            A1,     A1,     32
+
+/* Data from A0 fills the first 32 destination bytes, data from A1 the next 32. */
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+       stxvw4x         vs34,   o32,    T1
+       stxvw4x         vs35,   o48,    T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_2x2: 2 source vectors (M direction of the driver) x 2 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 16 bytes at each of A0 and A1, writes 32 bytes at BO; overwrites T1, vs32 and vs33. */
+.macro COPY_2x2
+
+       mr              T1,     BO
+
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o0,     A1
+
+       addi            A0,     A0,     16
+       addi            A1,     A1,     16
+
+/* Data from A0 lands at offset 0, data from A1 at offset 16. */
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+
+.endm
+
+/**********************************************************************************************
+* COPY_2x1: 2 source vectors (M direction of the driver) x 1 element of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 8 bytes at each of A0 and A1, writes 16 bytes at BO; overwrites T1 and vs32-vs35. */
+.macro COPY_2x1
+
+       mr              T1,     BO
+
+       lxsspx          vs32,   o0,     A0
+       lxsspx          vs33,   o4,     A0
+       lxsspx          vs34,   o0,     A1
+       lxsspx          vs35,   o4,     A1
+
+       addi            A0,     A0,     8
+       addi            A1,     A1,     8
+
+/* Each element is written as two 4-byte scalars, at offsets 0 and 4 (o4). */
+       stxsspx         vs32,   o0,     T1
+       stxsspx         vs33,   o4,     T1
+
+       addi            T1,     T1,     8
+
+       stxsspx         vs34,   o0,     T1
+       stxsspx         vs35,   o4,     T1
+.endm
+
+/**********************************************************************************************
+* COPY_1x8: 1 source vector (M direction of the driver) x 8 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 64 bytes at A0, writes 64 bytes at BO; overwrites T1 and vs32-vs35. */
+.macro COPY_1x8
+
+       mr              T1,     BO
+
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o16,    A0
+       lxvw4x          vs34,   o32,    A0
+       lxvw4x          vs35,   o48,    A0
+
+       addi            A0,     A0,     64
+
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+       stxvw4x         vs34,   o32,    T1
+       stxvw4x         vs35,   o48,    T1
+.endm
+
+/**********************************************************************************************
+* COPY_1x4: 1 source vector (M direction of the driver) x 4 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 32 bytes at A0, writes 32 bytes at BO; overwrites T1, vs32 and vs33. */
+.macro COPY_1x4
+
+       mr              T1,     BO
+
+       lxvw4x          vs32,   o0,     A0
+       lxvw4x          vs33,   o16,    A0
+
+       addi            A0,     A0,     32
+
+       stxvw4x         vs32,   o0,     T1
+       stxvw4x         vs33,   o16,    T1
+.endm
+
+/**********************************************************************************************
+* COPY_1x2: 1 source vector (M direction of the driver) x 2 elements of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 16 bytes at A0, writes 16 bytes at BO; overwrites T1 and vs32. */
+.macro COPY_1x2
+
+       mr              T1,     BO
+
+       lxvw4x          vs32,   o0,     A0
+
+       addi            A0,     A0,     16
+
+       stxvw4x         vs32,   o0,     T1
+.endm
+
+/**********************************************************************************************
+* COPY_1x1: 1 source vector (M direction of the driver) x 1 element of 8 bytes (N direction)
+**********************************************************************************************/
+/* Reads 8 bytes at A0 as two 4-byte scalars, writes them at BO; overwrites T1, vs32, vs33. */
+.macro COPY_1x1
+
+       mr              T1,     BO
+
+       lxsspx          vs32,   o0,     A0
+       lxsspx          vs33,   o4,     A0
+
+       addi            A0,     A0,     8
+
+       stxsspx         vs32,   o0,     T1
+       stxsspx         vs33,   o4,     T1
+.endm
+