ztrmm sve copy kernels
author Bine Brank <binebrank@gmail.com>
Thu, 30 Dec 2021 20:00:16 +0000 (21:00 +0100)
committer Bine Brank <binebrank@gmail.com>
Thu, 30 Dec 2021 20:00:16 +0000 (21:00 +0100)
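
Add vector-length-agnostic SVE packing kernels for the complex TRMM copy
routines (lower/upper, normal/transposed); the DOUBLE macro selects the
double- or single-precision complex path.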
kernel/arm64/ztrmm_lncopy_sve_v1.c [new file with mode: 0644]
kernel/arm64/ztrmm_ltcopy_sve_v1.c [new file with mode: 0644]
kernel/arm64/ztrmm_uncopy_sve_v1.c [new file with mode: 0644]
kernel/arm64/ztrmm_utcopy_sve_v1.c [new file with mode: 0644]

diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c
new file mode 100644 (file)
index 0000000..19c34ff
--- /dev/null
@@ -0,0 +1,145 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+
+    lda += lda;
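+    /* a holds interleaved (real, imag) FLOAT pairs, so the doubled lda
+       is the column stride counted in FLOATs. */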
+
+    js = 0;
+    FLOAT *ao;
+#ifdef DOUBLE
+    svint64_t index = svindex_s64(0LL, lda);
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+    svint32_t index = svindex_s32(0, lda);
+    svbool_t pn = svwhilelt_b32(js, n);
+    int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posY * 2 + posX * lda;
+        } else {
+            ao = a + posX * 2 + posY * lda;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X > posY) {
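+                /* Gather the real and imaginary parts of the current row across the
+                   panel's n_active columns and re-interleave them into b with svst2. */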
+#ifdef DOUBLE
+                svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
+                svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#else
+                svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
+                svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#endif
+                svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
+                ao += 2;
+                b += n_active * 2;
+                X ++;
+                i ++;
+            } else 
+                if (X < posY) {
+                    ao += lda;
+                    b += n_active * 2;
+                    X ++;
+                    i ++;
+                } else {
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = *(ao+k*lda+j*2);
+                            b[temp++] = *(ao+k*lda+j*2+1);
+                        }
+                        b[temp++] = ONE;
+                        b[temp++] = ZERO;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k <= j; k++) {
+                            b[temp++] = *(ao+k*lda+j*2);
+                            b[temp++] = *(ao+k*lda+j*2+1);
+                        }
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                    }
+#endif
+                    ao += n_active * 2;
+                    b += n_active*n_active * 2;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+        posY += n_active;
+        js += n_active;
+#ifdef DOUBLE
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+#else
+        pn = svwhilelt_b32(js, n);
+        n_active = svcntp_b32(svptrue_b32(), pn);
+    } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+    return 0;
+}
diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
new file mode 100644 (file)
index 0000000..c272db6
--- /dev/null
@@ -0,0 +1,143 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+
+    lda += lda;
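+    /* a holds interleaved (real, imag) FLOAT pairs, so the doubled lda
+       is the column stride counted in FLOATs. */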
+
+    FLOAT *ao;
+    js = 0;
+#ifdef DOUBLE
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+    svbool_t pn = svwhilelt_b32(js, n);
+    int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posY * 2 + posX * lda;
+        } else {
+            ao = a + posX * 2 + posY * lda;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X > posY) {
+                ao += 2;
+                b += n_active * 2;
+                X ++;
+                i ++;
+            } else 
+                if (X < posY) {
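+                    /* Copy n_active complex elements contiguously down the current
+                       column; svld2/svst2 keep the real/imag interleaving intact. */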
+#ifdef DOUBLE
+                    svfloat64x2_t aj_vec = svld2(pn, ao);
+#else
+                    svfloat32x2_t aj_vec = svld2(pn, ao);
+#endif
+                    svst2(pn, b, aj_vec);
+                    ao += lda;
+                    b += n_active * 2;
+                    X ++;
+                    i ++;
+                } else {
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                        b[temp++] = ONE;
+                        b[temp++] = ZERO;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = *(ao+j*lda+k*2);
+                            b[temp++] = *(ao+j*lda+k*2+1);
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                        for (int k = j; k < n_active; k++) {
+                            b[temp++] = *(ao+j*lda+k*2);
+                            b[temp++] = *(ao+j*lda+k*2+1);
+                        }
+                    }
+#endif
+                    ao += n_active * lda;
+                    b += n_active*n_active * 2;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+
+        posY += n_active;
+        js += n_active;
+#ifdef DOUBLE
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+#else
+        pn = svwhilelt_b32(js, n);
+        n_active = svcntp_b32(svptrue_b32(), pn);
+    } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+
+    return 0;
+}
diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c
new file mode 100644 (file)
index 0000000..aaa2170
--- /dev/null
@@ -0,0 +1,145 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+
+    lda += lda;
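+    /* a holds interleaved (real, imag) FLOAT pairs, so the doubled lda
+       is the column stride counted in FLOATs. */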
+
+    js = 0;
+    FLOAT *ao;
+#ifdef DOUBLE
+    svint64_t index = svindex_s64(0LL, lda);
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+    svint32_t index = svindex_s32(0, lda);
+    svbool_t pn = svwhilelt_b32(js, n);
+    int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posX * 2 + posY * lda;
+        } else {
+            ao = a + posY * 2 + posX * lda;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X < posY) {
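+                /* Gather the real and imaginary parts of the current row across the
+                   panel's n_active columns and re-interleave them into b with svst2. */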
+#ifdef DOUBLE
+                svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index);
+                svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#else
+                svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index);
+                svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index);
+#endif
+                svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag));
+                ao += 2;
+                b += n_active * 2;
+                X ++;
+                i ++;
+            } else 
+                if (X > posY) {
+                    ao += lda;
+                    b += n_active * 2;
+                    X ++;
+                    i ++;
+                } else {
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                        b[temp++] = ONE;
+                        b[temp++] = ZERO;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = *(ao+k*lda+j*2);
+                            b[temp++] = *(ao+k*lda+j*2+1);
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                        for (int k = j; k < n_active; k++) {
+                            b[temp++] = *(ao+k*lda+j*2);
+                            b[temp++] = *(ao+k*lda+j*2+1);
+                        }
+                    }
+#endif
+                    ao += n_active * 2;
+                    b += n_active*n_active * 2;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+        posY += n_active;
+        js += n_active;
+#ifdef DOUBLE
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+#else
+        pn = svwhilelt_b32(js, n);
+        n_active = svcntp_b32(svptrue_b32(), pn);
+    } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+    return 0;
+}
diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c
new file mode 100644 (file)
index 0000000..c3e1f1b
--- /dev/null
@@ -0,0 +1,141 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js;
+    BLASLONG X;
+
+    lda += lda;
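+    /* a holds interleaved (real, imag) FLOAT pairs, so the doubled lda
+       is the column stride counted in FLOATs. */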
+
+    FLOAT *ao;
+    js = 0;
+#ifdef DOUBLE
+    svbool_t pn = svwhilelt_b64(js, n);
+    int n_active = svcntp_b64(svptrue_b64(), pn);
+#else
+    svbool_t pn = svwhilelt_b32(js, n);
+    int n_active = svcntp_b32(svptrue_b32(), pn);
+#endif
+    do
+    {
+        X = posX;
+
+        if (posX <= posY) {
+            ao = a + posX * 2 + posY * lda;
+        } else {
+            ao = a + posY * 2 + posX * lda;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X < posY) {
+                ao += 2;
+                b += n_active * 2;
+                X ++;
+                i ++;
+            } else 
+                if (X > posY) {
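+                    /* Copy n_active complex elements contiguously down the current
+                       column; svld2/svst2 keep the real/imag interleaving intact. */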
+#ifdef DOUBLE
+                    svfloat64x2_t aj_vec = svld2(pn, ao);
+#else
+                    svfloat32x2_t aj_vec = svld2(pn, ao);
+#endif
+                    svst2(pn, b, aj_vec);
+                    ao += lda;
+                    b += n_active * 2;
+                    X ++;
+                    i ++;
+                } else { 
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
+#ifdef UNIT
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k < j; k++) {
+                            b[temp++] = *(ao+j*lda+k*2);
+                            b[temp++] = *(ao+j*lda+k*2+1);
+                        }
+                        b[temp++] = ONE;
+                        b[temp++] = ZERO;
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                    }
+#else 
+                    int temp = 0;
+                    for (int j = 0; j < n_active; j++) {
+                        for (int k = 0 ; k <= j; k++) {
+                            b[temp++] = *(ao+j*lda+k*2);
+                            b[temp++] = *(ao+j*lda+k*2+1);
+                        }
+                        for (int k = j+1; k < n_active; k++) {
+                            b[temp++] = ZERO;
+                            b[temp++] = ZERO;
+                        }
+                    }
+#endif
+                    ao += n_active * lda;
+                    b += n_active*n_active * 2;
+                    X += n_active;
+                    i += n_active;
+                }
+        } while (i < m);
+
+        posY += n_active;
+        js += n_active;
+#ifdef DOUBLE
+        pn = svwhilelt_b64(js, n);
+        n_active = svcntp_b64(svptrue_b64(), pn);
+    } while (svptest_any(svptrue_b64(), pn));
+#else
+        pn = svwhilelt_b32(js, n);
+        n_active = svcntp_b32(svptrue_b32(), pn);
+    } while (svptest_any(svptrue_b32(), pn));
+#endif
+
+    return 0;
+}