add remaining trmm copy rutines for SVE
authorBine Brank <binebrank@gmail.com>
Sun, 14 Nov 2021 15:00:10 +0000 (16:00 +0100)
committerBine Brank <binebrank@gmail.com>
Sun, 14 Nov 2021 15:00:10 +0000 (16:00 +0100)
kernel/arm64/trmm_lncopy_sve_v1.c [new file with mode: 0644]
kernel/arm64/trmm_ltcopy_sve_v1.c [new file with mode: 0644]
kernel/arm64/trmm_uncopy_sve_v1.c [new file with mode: 0644]
kernel/arm64/trmm_utcopy_sve_v1.c

diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c
new file mode 100644 (file)
index 0000000..e454e28
--- /dev/null
@@ -0,0 +1,127 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+    //printf("Using trmm_ln.\n");
+
+    int sve_len = svcntd();
+    svint64_t index = svindex_s64(0LL, lda);
+
+    FLOAT *ao;
+    js = 0;
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posY + posX * lda;
+        } else {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        /* svbool_t pm = svwhilelt_b64(i, m); */
+        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
+        do 
+        {
+            if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+                svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
+                svst1(pn, b, aj_vec);
+                ao ++;
+                b += n_active;
+                X ++;
+                i ++;
+            } else 
+                if (X < posY) {
+                    ao += lda;
+                    b += n_active;
+                    X ++;
+                    i ++;
+                } else {
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = *(ao+k*lda+j);
+                        }
+                        b[temp++] = ONE;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = ZERO;
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k <= j; k++) {
+                            b[temp++] = *(ao+k*lda+j);
+                        }
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = ZERO;
+                        }
+                    }
+#endif
+                    ao += n_active;
+                    b += n_active*n_active;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+        //printf("\n");
+
+
+        posY += n_active;
+        js += n_active;
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+
+    return 0;
+}
diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c
new file mode 100644 (file)
index 0000000..86433f2
--- /dev/null
@@ -0,0 +1,128 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+
+    //printf("Using trmm_lt.\n");
+
+    int sve_len = svcntd();
+
+    FLOAT *ao;
+    js = 0;
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posY + posX * lda;
+        } else {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        /* svbool_t pm = svwhilelt_b64(i, m); */
+        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
+        do 
+        {
+            if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+                ao ++;
+                b += n_active;
+                X ++;
+                i ++;
+            } else 
+                if (X < posY) {
+                    svfloat64_t aj_vec = svld1(pn, ao);
+                    svst1(pn, b, aj_vec);
+                    ao += lda;
+                    b += n_active;
+                    X ++;
+                    i ++;
+                } else {
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                        }
+                        b[temp++] = ONE;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = *(ao+j*lda+k);
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                        }
+                        for (int k = j; k < n_active; k++) {
+                            b[temp++] = *(ao+j*lda+k);
+                        }
+                    }
+#endif
+                    ao += n_active * lda;
+                    b += n_active*n_active;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+        //printf("\n");
+
+
+        posY += n_active;
+        js += n_active;
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+
+
+    return 0;
+}
diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c
new file mode 100644 (file)
index 0000000..21f392b
--- /dev/null
@@ -0,0 +1,130 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+    //printf("Using trmm_un.\n");
+    //printf("Using m %ld, n %ld.\n", m, n);
+    //printf("Using lda %ld.\n", lda);
+    //printf("Using posX %ld, posY %ld.\n", posX, posY);
+
+    int sve_len = svcntd();
+    svint64_t index = svindex_s64(0LL, lda);
+
+    FLOAT *ao;
+    js = 0;
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posX + posY * lda;
+        } else {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        /* svbool_t pm = svwhilelt_b64(i, m); */
+        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
+        do 
+        {
+            if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+                svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
+                svst1(pn, b, aj_vec);
+                ao ++;
+                b += n_active;
+                X ++;
+                i ++;
+            } else 
+                if (X > posY) {
+                    ao += lda;
+                    b += n_active;
+                    X ++;
+                    i ++;
+                } else {
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                        }
+                        b[temp++] = ONE;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = *(ao+k*lda+j);
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                        }
+                        for (int k = j; k < n_active; k++) {
+                            b[temp++] = *(ao+k*lda+j);
+                        }
+                    }
+#endif
+                    ao += n_active;
+                    b += n_active*n_active;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+        //printf("\n");
+
+
+        posY += n_active;
+        js += n_active;
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+
+    return 0;
+}
index e44e6737300589be39579a2d28875c58f15d8f45..38b88dc8c1cdbf28158cebf94c4b755d083b9849 100644 (file)
 #include <arm_sve.h>
 #endif
 
-#define MIN(a,b) (((a)<(b))?(a):(b))
-#define MAX(a,b) (((a)>(b))?(a):(b))
-
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
 
-    BLASLONG i, js, j;
+    BLASLONG i, js;
     BLASLONG X;
+    //printf("Using trmm_ut.\n");
 
     int sve_len = svcntd();
 
@@ -62,9 +60,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         X = posX;
 
         if (posX <= posY) {
-            ao = a + posX + (posY + j) * lda;
+            ao = a + posX + posY * lda;
         } else {
-            ao = a + posY + (posX + j) * lda;
+            ao = a + posY + posX * lda;
         }
 
         i = 0;