#SMINKERNEL = ../arm/min.c
#DMINKERNEL = ../arm/min.c
#
-#ISAMAXKERNEL = ../arm/iamax.c
+ISAMAXKERNEL = isamax.c
IDAMAXKERNEL = idamax.c
-#ICAMAXKERNEL = ../arm/izamax.c
-IZAMAXKERNEL = izamax.c
+ICAMAXKERNEL = icamax.c
+IZAMAXKERNEL = izamax.c
#
-#ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = idamin.c
-#ICAMINKERNEL = ../arm/izamin.c
+ISAMINKERNEL = isamin.c
+IDAMINKERNEL = idamin.c
+ICAMINKERNEL = icamin.c
IZAMINKERNEL = izamin.c
#
#ISMAXKERNEL = ../arm/imax.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
#
-#SAXPYKERNEL = ../arm/axpy.c
+SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
-#CAXPYKERNEL = ../arm/zaxpy.c
+CAXPYKERNEL = caxpy.c
ZAXPYKERNEL = zaxpy.c
#
SCOPYKERNEL = scopy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
DSDOTKERNEL = sdot.c
-#CDOTKERNEL = ../arm/zdot.c
+CDOTKERNEL = cdot.c
ZDOTKERNEL = zdot.c
#
SNRM2KERNEL = ../arm/nrm2.c
#
SROTKERNEL = srot.c
DROTKERNEL = drot.c
-#CROTKERNEL = ../arm/zrot.c
+CROTKERNEL = crot.c
ZROTKERNEL = zrot.c
#
SSCALKERNEL = sscal.c
--- /dev/null
+/*
+Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+
+#ifndef HAVE_ASM_KERNEL
+#include <altivec.h>
/* Vectorized core of caxpy: y += alpha * x over n interleaved (re,im)
 * single-precision complex elements.  The caller passes n & -16, so
 * n % 16 == 0 is assumed: each pass consumes 8 vectors = 16 complex. */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
{

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    /* Plain product: re += ar*xr - ai*xi, im += ar*xi + ai*xr.
     * The sign of alpha_i is folded into the splat so one multiply against
     * the word-swapped (im,re) vector yields both cross terms. */
    register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r};
    register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i};

#else
    /* Conjugated variant: sign pattern arranged for the conjugated product. */
    register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r};
    register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
#endif

    /* Byte permute swapping the (re,im) words inside each complex lane. */
    __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
    register __vector float *vy = (__vector float *) y;
    register __vector float *vx = (__vector float *) x;
    BLASLONG i=0;
    /* n complex elements = n/2 vectors of 4 floats; 8 vectors per pass. */
    for (; i < n/2; i += 8) {

        register __vector float vy_0 = vy[i];
        register __vector float vy_1 = vy[i + 1];
        register __vector float vy_2 = vy[i + 2];
        register __vector float vy_3 = vy[i + 3];
        register __vector float vy_4 = vy[i + 4];
        register __vector float vy_5 = vy[i + 5];
        register __vector float vy_6 = vy[i + 6];
        register __vector float vy_7 = vy[i + 7];
        register __vector float vx_0 = vx[i];
        register __vector float vx_1 = vx[i + 1];
        register __vector float vx_2 = vx[i + 2];
        register __vector float vx_3 = vx[i + 3];
        register __vector float vx_4 = vx[i + 4];
        register __vector float vx_5 = vx[i + 5];
        register __vector float vx_6 = vx[i + 6];
        register __vector float vx_7 = vx[i + 7];
        /* First term: (ar applied to both words of every lane). */
        vy_0 += vx_0*valpha_r;
        vy_1 += vx_1*valpha_r;
        vy_2 += vx_2*valpha_r;
        vy_3 += vx_3*valpha_r;
        vy_4 += vx_4*valpha_r;
        vy_5 += vx_5*valpha_r;
        vy_6 += vx_6*valpha_r;
        vy_7 += vx_7*valpha_r;
        /* Swap re/im within each lane for the cross terms. */
        vx_0 = vec_perm(vx_0, vx_0, swap_mask);
        vx_1 = vec_perm(vx_1, vx_1, swap_mask);
        vx_2 = vec_perm(vx_2, vx_2, swap_mask);
        vx_3 = vec_perm(vx_3, vx_3, swap_mask);
        vx_4 = vec_perm(vx_4, vx_4, swap_mask);
        vx_5 = vec_perm(vx_5, vx_5, swap_mask);
        vx_6 = vec_perm(vx_6, vx_6, swap_mask);
        vx_7 = vec_perm(vx_7, vx_7, swap_mask);
        /* Second term: signed ai against the swapped operand. */
        vy_0 += vx_0*valpha_i;
        vy_1 += vx_1*valpha_i;
        vy_2 += vx_2*valpha_i;
        vy_3 += vx_3*valpha_i;
        vy_4 += vx_4*valpha_i;
        vy_5 += vx_5*valpha_i;
        vy_6 += vx_6*valpha_i;
        vy_7 += vx_7*valpha_i;
        vy[i] = vy_0;
        vy[i + 1] = vy_1;
        vy[i + 2] = vy_2;
        vy[i + 3] = vy_3;
        vy[i + 4] = vy_4;
        vy[i + 5] = vy_5 ;
        vy[i + 6] = vy_6 ;
        vy[i + 7] = vy_7 ;

    }
}
+#endif
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0, iy = 0;
+ if (n <= 0) return (0);
+ if ((inc_x == 1) && (inc_y == 1)) {
+ BLASLONG n1 = n & -16;
+ if (n1) {
+ caxpy_kernel_16(n1, x, y, da_r,da_i);
+ ix = 2 * n1;
+ }
+ i = n1;
+ while (i < n) {
+#if !defined(CONJ)
+ y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
+ y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+#else
+ y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
+ y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+#endif
+ i++;
+ ix += 2;
+ }
+ return (0);
+
+ }
+ inc_x *= 2;
+ inc_y *= 2;
+ while (i < n) {
+#if !defined(CONJ)
+ y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
+ y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
+#else
+ y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
+ y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
+#endif
+ ix += inc_x;
+ iy += inc_y;
+ i++;
+ }
+ return (0);
+}
+
--- /dev/null
+/*Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "common.h"
+
+#ifndef HAVE_KERNEL_8
+#include <altivec.h>
/* Vectorized partial sums for cdot over n interleaved (re,im) complex
 * singles (caller passes n & -8, so n % 8 == 0).  On return:
 *   dot[0] = sum(xr*yr), dot[1] = sum(xi*yi),
 *   dot[2] = sum(xr*yi), dot[3] = sum(xi*yr),
 * which the caller combines per the CONJ setting. */
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
{
    /* Byte permute swapping the (re,im) words inside each complex lane. */
    __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
    register __vector float *vy = (__vector float *) y;
    register __vector float *vx = (__vector float *) x;
    BLASLONG i = 0;
    /* Four independent accumulator pairs: vd_* hold x*y products
     * (re*re / im*im slots), vdd_* hold x*swapped(y) (re*im / im*re). */
    register __vector float vd_0 = { 0 };
    register __vector float vd_1 = { 0 };
    register __vector float vd_2 = { 0 };
    register __vector float vd_3 = { 0 };
    register __vector float vdd_0 = { 0 };
    register __vector float vdd_1 = { 0 };
    register __vector float vdd_2 = { 0 };
    register __vector float vdd_3 = { 0 };
    /* n complex = n/2 vectors of 4 floats; 4 vectors (8 complex) per pass. */
    for (; i < n/2; i += 4) {

        register __vector float vyy_0 ;
        register __vector float vyy_1 ;
        register __vector float vyy_2 ;
        register __vector float vyy_3 ;

        register __vector float vy_0 = vy[i];
        register __vector float vy_1 = vy[i + 1];
        register __vector float vy_2 = vy[i + 2];
        register __vector float vy_3 = vy[i + 3];
        register __vector float vx_0= vx[i];
        register __vector float vx_1 = vx[i + 1];
        register __vector float vx_2 = vx[i + 2];
        register __vector float vx_3 = vx[i + 3];
        vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
        vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
        vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
        vyy_3 = vec_perm(vy_3, vy_3, swap_mask);

        vd_0 += vx_0 * vy_0;
        vd_1 += vx_1 * vy_1;
        vd_2 += vx_2 * vy_2;
        vd_3 += vx_3 * vy_3;

        vdd_0 += vx_0 * vyy_0;
        vdd_1 += vx_1 * vyy_1;
        vdd_2 += vx_2 * vyy_2;
        vdd_3 += vx_3 * vyy_3;


    }
    //aggregate
    vd_0 = vd_0 + vd_1 +vd_2 +vd_3;
    vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3;
    //reverse and aggregate: fold the upper doubleword onto the lower so
    //lanes 0/1 hold the totals for the even/odd float positions
    vd_1=vec_xxpermdi(vd_0,vd_0,2) ;
    vdd_1=vec_xxpermdi(vdd_0,vdd_0,2);
    vd_2=vd_0+vd_1;
    vdd_2=vdd_0+vdd_1;

    dot[0]=vd_2[0];
    dot[1]=vd_2[1];
    dot[2]=vdd_2[0];
    dot[3]=vdd_2[1];

}
+#endif
+
+
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
+ BLASLONG i = 0;
+ BLASLONG ix=0, iy=0;
+ OPENBLAS_COMPLEX_FLOAT result;
+ FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
+
+ if (n <= 0) {
+ CREAL(result) = 0.0;
+ CIMAG(result) = 0.0;
+ return (result);
+
+ }
+
+ if ((inc_x == 1) && (inc_y == 1)) {
+
+ BLASLONG n1 = n & -8;
+ BLASLONG j=0;
+
+ if (n1){
+ cdot_kernel_8(n1, x, y, dot);
+ i = n1;
+ j = n1 <<1;
+ }
+
+
+ while (i < n) {
+
+ dot[0] += x[j] * y[j];
+ dot[1] += x[j + 1] * y[j + 1];
+ dot[2] += x[j] * y[j + 1];
+ dot[3] += x[j + 1] * y[j];
+
+ j += 2;
+ i++;
+
+ }
+
+
+ } else {
+ i = 0;
+ ix = 0;
+ iy = 0;
+ inc_x <<= 1;
+ inc_y <<= 1;
+ while (i < n) {
+
+ dot[0] += x[ix] * y[iy];
+ dot[1] += x[ix + 1] * y[iy + 1];
+ dot[2] += x[ix] * y[iy + 1];
+ dot[3] += x[ix + 1] * y[iy];
+
+ ix += inc_x;
+ iy += inc_y;
+ i++;
+
+ }
+ }
+
+#if !defined(CONJ)
+ CREAL(result) = dot[0] - dot[1];
+ CIMAG(result) = dot[2] + dot[3];
+#else
+ CREAL(result) = dot[0] + dot[1];
+ CIMAG(result) = dot[2] - dot[3];
+
+#endif
+
+ return (result);
+
+}
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2018, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+ \r
+#include "common.h"\r
+ \r
+#if defined(POWER8)\r
+\r
+static void crot_kernel_8 (long n, float *x, float *y, float c, float s)\r
+{\r
+ __vector float t0;\r
+ __vector float t1;\r
+ __vector float t2;\r
+ __vector float t3;\r
+ __vector float t4;\r
+ __vector float t5;\r
+ __vector float t6;\r
+ __vector float t7;\r
+ __asm__\r
+ (\r
+ "xscvdpspn 36, %x[cos] \n\t" // load c to all words\r
+ "xxspltw 36, 36, 0 \n\t" \r
+ "xscvdpspn 37, %x[sin] \n\t" // load s to all words\r
+ "xxspltw 37, 37, 0 \n\t" \r
+ "lxvd2x 32, 0, %[x_ptr] \n\t" // load x\r
+ "lxvd2x 33, %[i16], %[x_ptr] \n\t" \r
+ "lxvd2x 34, %[i32], %[x_ptr] \n\t" \r
+ "lxvd2x 35, %[i48], %[x_ptr] \n\t" \r
+ "lxvd2x 48, 0, %[y_ptr] \n\t" // load y\r
+ "lxvd2x 49, %[i16], %[y_ptr] \n\t" \r
+ "lxvd2x 50, %[i32], %[y_ptr] \n\t" \r
+ "lxvd2x 51, %[i48], %[y_ptr] \n\t" \r
+ "addi %[x_ptr], %[x_ptr], 64 \n\t" \r
+ "addi %[y_ptr], %[y_ptr], 64 \n\t" \r
+ "addic. %[temp_n], %[temp_n], -16 \n\t" \r
+ "ble 2f \n\t" \r
+ ".p2align 5 \n\t" \r
+ "1: \n\t" \r
+ "xvmulsp 40, 32, 36 \n\t" // c * x\r
+ "xvmulsp 41, 33, 36 \n\t" \r
+ "xvmulsp 42, 34, 36 \n\t" \r
+ "xvmulsp 43, 35, 36 \n\t" \r
+ "xvmulsp %x[x0], 48, 36 \n\t" // c * y\r
+ "xvmulsp %x[x2], 49, 36 \n\t" \r
+ "xvmulsp %x[x1], 50, 36 \n\t" \r
+ "xvmulsp %x[x3], 51, 36 \n\t" \r
+ "xvmulsp 44, 32, 37 \n\t" // s * x\r
+ "xvmulsp 45, 33, 37 \n\t" \r
+ "lxvd2x 32, 0, %[x_ptr] \n\t" // load x\r
+ "lxvd2x 33, %[i16], %[x_ptr] \n\t" \r
+ "xvmulsp 46, 34, 37 \n\t" \r
+ "xvmulsp 47, 35, 37 \n\t" \r
+ "lxvd2x 34, %[i32], %[x_ptr] \n\t" \r
+ "lxvd2x 35, %[i48], %[x_ptr] \n\t" \r
+ "xvmulsp %x[x4], 48, 37 \n\t" // s * y\r
+ "xvmulsp %x[x5], 49, 37 \n\t" \r
+ "lxvd2x 48, 0, %[y_ptr] \n\t" // load y\r
+ "lxvd2x 49, %[i16], %[y_ptr] \n\t" \r
+ "xvmulsp %x[x6], 50, 37 \n\t" \r
+ "xvmulsp %x[x7], 51, 37 \n\t" \r
+ "lxvd2x 50, %[i32], %[y_ptr] \n\t" \r
+ "lxvd2x 51, %[i48], %[y_ptr] \n\t" \r
+ "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y\r
+ "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y\r
+ "addi %[x_ptr], %[x_ptr], -64 \n\t" \r
+ "addi %[y_ptr], %[y_ptr], -64 \n\t" \r
+ "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y\r
+ "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y\r
+ "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x\r
+ "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x\r
+ "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x\r
+ "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x\r
+ "stxvd2x 40, 0, %[x_ptr] \n\t" // store x\r
+ "stxvd2x 41, %[i16], %[x_ptr] \n\t" \r
+ "stxvd2x 42, %[i32], %[x_ptr] \n\t" \r
+ "stxvd2x 43, %[i48], %[x_ptr] \n\t" \r
+ "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y\r
+ "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" \r
+ "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" \r
+ "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" \r
+ "addi %[x_ptr], %[x_ptr], 128 \n\t" \r
+ "addi %[y_ptr], %[y_ptr], 128 \n\t" \r
+ "addic. %[temp_n], %[temp_n], -16 \n\t" \r
+ "bgt 1b \n\t" \r
+ "2: \n\t" \r
+ "xvmulsp 40, 32, 36 \n\t" // c * x\r
+ "xvmulsp 41, 33, 36 \n\t" \r
+ "xvmulsp 42, 34, 36 \n\t" \r
+ "xvmulsp 43, 35, 36 \n\t" \r
+ "xvmulsp %x[x0], 48, 36 \n\t" // c * y\r
+ "xvmulsp %x[x2], 49, 36 \n\t" \r
+ "xvmulsp %x[x1], 50, 36 \n\t" \r
+ "xvmulsp %x[x3], 51, 36 \n\t" \r
+ "xvmulsp 44, 32, 37 \n\t" // s * x\r
+ "xvmulsp 45, 33, 37 \n\t" \r
+ "xvmulsp 46, 34, 37 \n\t" \r
+ "xvmulsp 47, 35, 37 \n\t" \r
+ "xvmulsp %x[x4], 48, 37 \n\t" // s * y\r
+ "xvmulsp %x[x5], 49, 37 \n\t" \r
+ "xvmulsp %x[x6], 50, 37 \n\t" \r
+ "xvmulsp %x[x7], 51, 37 \n\t" \r
+ "addi %[x_ptr], %[x_ptr], -64 \n\t" \r
+ "addi %[y_ptr], %[y_ptr], -64 \n\t" \r
+ "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y\r
+ "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y\r
+ "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y\r
+ "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y\r
+ "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x\r
+ "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x\r
+ "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x\r
+ "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x\r
+ "stxvd2x 40, 0, %[x_ptr] \n\t" // store x\r
+ "stxvd2x 41, %[i16], %[x_ptr] \n\t" \r
+ "stxvd2x 42, %[i32], %[x_ptr] \n\t" \r
+ "stxvd2x 43, %[i48], %[x_ptr] \n\t" \r
+ "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y\r
+ "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" \r
+ "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" \r
+ "stxvd2x %x[x3], %[i48], %[y_ptr] "\r
+ :\r
+ [mem_x] "+m" (*(float (*)[2*n])x),\r
+ [mem_y] "+m" (*(float (*)[2*n])y),\r
+ [temp_n] "+r" (n),\r
+ [x_ptr] "+&b" (x),\r
+ [y_ptr] "+&b" (y),\r
+ [x0] "=wa" (t0),\r
+ [x1] "=wa" (t2),\r
+ [x2] "=wa" (t1),\r
+ [x3] "=wa" (t3),\r
+ [x4] "=wa" (t4),\r
+ [x5] "=wa" (t5),\r
+ [x6] "=wa" (t6),\r
+ [x7] "=wa" (t7) \r
+ : \r
+ [cos] "f" (c),\r
+ [sin] "f" (s),\r
+ [i16] "b" (16),\r
+ [i32] "b" (32),\r
+ [i48] "b" (48) \r
+ :\r
+ "cr0",\r
+ "vs32","vs33","vs34","vs35","vs36","vs37",\r
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",\r
+ "vs48","vs49","vs50","vs51"\r
+ );\r
+}\r
+ \r
+#endif\r
+\r
+\r
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)\r
+{\r
+ BLASLONG i=0;\r
+ BLASLONG ix=0,iy=0;\r
+ FLOAT *x1=x;\r
+ FLOAT *y1=y;\r
+ FLOAT temp;\r
+ if ( n <= 0 ) return(0);\r
+ if ( (inc_x == 1) && (inc_y == 1) )\r
+ {\r
+ BLASLONG n1 = n & -8;\r
+ if ( n1 > 0 )\r
+ {\r
+ crot_kernel_8(n1, x1, y1, c, s);\r
+ i=n1;\r
+ }\r
+ while(i < n)\r
+ {\r
+ temp = c*x[i] + s*y[i] ;\r
+ y[i] = c*y[i] - s*x[i] ;\r
+ x[i] = temp ;\r
+ i++ ;\r
+ }\r
+\r
+ }\r
+ else\r
+ {\r
+ while(i < n)\r
+ {\r
+ temp = c*x[ix] + s*y[iy] ;\r
+ y[iy] = c*y[iy] - s*x[ix] ;\r
+ x[ix] = temp ;\r
+ ix += inc_x ;\r
+ iy += inc_y ;\r
+ i++ ;\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+ \r
+\r
+#include "common.h"\r
+#include <math.h>\r
+#include <altivec.h>\r
+#if defined(DOUBLE)\r
+ #define ABS fabs\r
+#else\r
+ #define ABS fabsf\r
+#endif\r
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])\r
+\r
+\r
+\r
+ \r
+/**\r
+ * Find maximum index \r
+ * Warning: requirements n>0 and n % 32 == 0\r
+ * @param n \r
+ * @param x pointer to the vector\r
+ * @param maxf (out) maximum absolute value .( only for output )\r
+ * @return index \r
+ */\r
+static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { \r
+\r
+ BLASLONG index;\r
+ BLASLONG i;\r
+ register __vector unsigned int static_index0 = {0,1,2,3};\r
+ register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register\r
+ register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}\r
+ register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};\r
+ register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};\r
+ register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};\r
+ temp0=vec_xor(temp0,temp0);\r
+ temp1=temp1 <<1 ; //{16,16,16,16}\r
+ register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}\r
+ register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}\r
+ register __vector float quadruple_values={0,0,0,0};\r
+\r
+ register __vector float * v_ptrx=(__vector float *)x;\r
+ register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; \r
+ register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; \r
+ for(; i<n; i+=32){\r
+ //absolute temporary complex vectors\r
+ register __vector float v0=vec_abs(v_ptrx[0]);\r
+ register __vector float v1=vec_abs(v_ptrx[1]);\r
+ register __vector float v2=vec_abs(v_ptrx[2]);\r
+ register __vector float v3=vec_abs(v_ptrx[3]);\r
+ register __vector float v4=vec_abs(v_ptrx[4]);\r
+ register __vector float v5=vec_abs(v_ptrx[5]);\r
+ register __vector float v6=vec_abs(v_ptrx[6]); \r
+ register __vector float v7=vec_abs(v_ptrx[7]);\r
+\r
+ //pack complex real and imaginary parts together to sum real+image\r
+ register __vector float t1=vec_perm(v0,v1,real_pack_mask);\r
+ register __vector float ti=vec_perm(v0,v1,image_pack_mask); \r
+ v0=t1+ti; //sum quadruple real with quadruple image\r
+ register __vector float t2=vec_perm(v2,v3,real_pack_mask);\r
+ register __vector float ti2=vec_perm(v2,v3,image_pack_mask); \r
+ v1=t2+ti2;\r
+ t1=vec_perm(v4,v5,real_pack_mask);\r
+ ti=vec_perm(v4,v5,image_pack_mask); \r
+ v2=t1+ti; //sum\r
+ t2=vec_perm(v6,v7,real_pack_mask);\r
+ ti2=vec_perm(v6,v7,image_pack_mask); \r
+ v3=t2+ti2;\r
+ // now we have 16 summed elements . lets compare them\r
+ v_ptrx+=8;\r
+ register __vector bool int r1=vec_cmpgt(v1,v0);\r
+ register __vector bool int r2=vec_cmpgt(v3,v2);\r
+ register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);\r
+ v0=vec_sel(v0,v1,r1); \r
+ register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);\r
+ v1=vec_sel(v2,v3,r2);\r
+ //final cmp and select index and value for first 16 values\r
+ r1=vec_cmpgt(v1,v0);\r
+ register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);\r
+ register __vector float vf0= vec_sel(v0,v1,r1); \r
+\r
+ //absolute temporary complex vectors\r
+ v0=vec_abs(v_ptrx[0]);\r
+ v1=vec_abs(v_ptrx[1]);\r
+ v2=vec_abs(v_ptrx[2]);\r
+ v3=vec_abs(v_ptrx[3]);\r
+ v4=vec_abs(v_ptrx[4]);\r
+ v5=vec_abs(v_ptrx[5]);\r
+ v6=vec_abs(v_ptrx[6]); \r
+ v7=vec_abs(v_ptrx[7]);\r
+\r
+ //pack complex real and imaginary parts together to sum real+image\r
+ t1=vec_perm(v0,v1,real_pack_mask);\r
+ ti=vec_perm(v0,v1,image_pack_mask); \r
+ v0=t1+ti; //sum quadruple real with quadruple image\r
+ t2=vec_perm(v2,v3,real_pack_mask);\r
+ ti2=vec_perm(v2,v3,image_pack_mask); \r
+ v1=t2+ti2;\r
+ t1=vec_perm(v4,v5,real_pack_mask);\r
+ ti=vec_perm(v4,v5,image_pack_mask); \r
+ v2=t1+ti; //sum\r
+ t2=vec_perm(v6,v7,real_pack_mask);\r
+ ti2=vec_perm(v6,v7,image_pack_mask); \r
+ v3=t2+ti2;\r
+ // now we have 16 summed elements {from 16 to 31} . lets compare them\r
+ v_ptrx+=8;\r
+ r1=vec_cmpgt(v1,v0);\r
+ r2=vec_cmpgt(v3,v2);\r
+ ind2= vec_sel(static_index0,static_index1,r1);\r
+ v0=vec_sel(v0,v1,r1); \r
+ ind3= vec_sel(static_index2,static_index3,r2);\r
+ v1=vec_sel(v2,v3,r2);\r
+ //final cmp and select index and value for the second 16 values\r
+ r1=vec_cmpgt(v1,v0);\r
+ register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);\r
+ register __vector float vv0= vec_sel(v0,v1,r1); \r
+ indv0+=temp1; //make index from 16->31\r
+\r
+ //find final quadruple from 32 elements\r
+ r2=vec_cmpgt(vv0,vf0);\r
+ ind2 = vec_sel( indf0,indv0,r2);\r
+ vv0= vec_sel(vf0,vv0,r2); \r
+ //get asbolute index\r
+ ind2+=temp0;\r
+ //compare with old quadruple and update \r
+ r1=vec_cmpgt(vv0,quadruple_values);\r
+ quadruple_indices = vec_sel( quadruple_indices,ind2,r1);\r
+ quadruple_values= vec_sel(quadruple_values,vv0,r1); \r
+\r
+ temp0+=temp_add; \r
+ }\r
+\r
+ //now we have to chose from 4 values and 4 different indices\r
+ // we will compare pairwise if pairs are exactly the same we will choose minimum between index\r
+ // otherwise we will assign index of the maximum value\r
+ float a1,a2,a3,a4;\r
+ unsigned int i1,i2,i3,i4;\r
+ a1=vec_extract(quadruple_values,0);\r
+ a2=vec_extract(quadruple_values,1);\r
+ a3=vec_extract(quadruple_values,2);\r
+ a4=vec_extract(quadruple_values,3);\r
+ i1=vec_extract(quadruple_indices,0);\r
+ i2=vec_extract(quadruple_indices,1);\r
+ i3=vec_extract(quadruple_indices,2);\r
+ i4=vec_extract(quadruple_indices,3);\r
+ if(a1==a2){\r
+ index=i1>i2?i2:i1;\r
+ }else if(a2>a1){\r
+ index=i2;\r
+ a1=a2;\r
+ }else{\r
+ index= i1;\r
+ }\r
+\r
+ if(a4==a3){\r
+ i1=i3>i4?i4:i3;\r
+ }else if(a4>a3){\r
+ i1=i4;\r
+ a3=a4;\r
+ }else{\r
+ i1= i3;\r
+ }\r
+\r
+ if(a1==a3){\r
+ index=i1>index?index:i1;\r
+ *maxf=a1; \r
+ }else if(a3>a1){\r
+ index=i1;\r
+ *maxf=a3;\r
+ }else{ \r
+ *maxf=a1;\r
+ }\r
+ return index; \r
+\r
+}\r
+ \r
+ \r
+\r
+ \r
+ \r
+\r
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)\r
+{\r
+ BLASLONG i = 0;\r
+ BLASLONG ix = 0;\r
+ FLOAT maxf = 0;\r
+ BLASLONG max = 0;\r
+ BLASLONG inc_x2;\r
+\r
+ if (n <= 0 || inc_x <= 0) return(max);\r
+ \r
+ if (inc_x == 1) {\r
+\r
+ BLASLONG n1 = n & -32;\r
+ if (n1 > 0) {\r
+\r
+ max = ciamax_kernel_32(n1, x, &maxf); \r
+ i = n1;\r
+ ix = n1 << 1;\r
+ }\r
+\r
+ while(i < n)\r
+ {\r
+ if( CABS1(x,ix) > maxf )\r
+ {\r
+ max = i;\r
+ maxf = CABS1(x,ix);\r
+ }\r
+ ix += 2;\r
+ i++;\r
+ }\r
+ return (max + 1);\r
+\r
+ } else {\r
+ \r
+ inc_x2 = 2 * inc_x;\r
+\r
+ maxf = CABS1(x,0);\r
+ ix += inc_x2;\r
+ i++;\r
+\r
+ while(i < n)\r
+ {\r
+ if( CABS1(x,ix) > maxf )\r
+ {\r
+ max = i;\r
+ maxf = CABS1(x,ix);\r
+ }\r
+ ix += inc_x2;\r
+ i++;\r
+ }\r
+ return (max + 1);\r
+ }\r
+ \r
+}\r
+\r
+\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+ \r
+\r
+#include "common.h"\r
+#include <math.h>\r
+#include <altivec.h>\r
+#if defined(DOUBLE)\r
+ #define ABS fabs\r
+#else\r
+ #define ABS fabsf\r
+#endif\r
+#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])\r
+\r
+\r
+\r
+ \r
+/**\r
+ * Find minimum index \r
+ * Warning: requirements n>0 and n % 32 == 0\r
+ * @param n \r
+ * @param x pointer to the vector\r
+ * @param minf (out) minimum absolute value .( only for output )\r
+ * @return index \r
+ */\r
+static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { \r
+\r
+ BLASLONG index;\r
+ BLASLONG i;\r
+ register __vector unsigned int static_index0 = {0,1,2,3};\r
+ register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register\r
+ register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}\r
+ register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};\r
+ register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};\r
+ register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};\r
+ temp0=vec_xor(temp0,temp0);\r
+ temp1=temp1 <<1 ; //{16,16,16,16}\r
+ register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}\r
+ register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}\r
+ float first_min=CABS1(x,0);\r
+ register __vector float quadruple_values={first_min,first_min,first_min,first_min};\r
+\r
+ register __vector float * v_ptrx=(__vector float *)x;\r
+ register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; \r
+ register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; \r
+ for(; i<n; i+=32){\r
+ //absolute temporary complex vectors\r
+ register __vector float v0=vec_abs(v_ptrx[0]);\r
+ register __vector float v1=vec_abs(v_ptrx[1]);\r
+ register __vector float v2=vec_abs(v_ptrx[2]);\r
+ register __vector float v3=vec_abs(v_ptrx[3]);\r
+ register __vector float v4=vec_abs(v_ptrx[4]);\r
+ register __vector float v5=vec_abs(v_ptrx[5]);\r
+ register __vector float v6=vec_abs(v_ptrx[6]); \r
+ register __vector float v7=vec_abs(v_ptrx[7]);\r
+\r
+ //pack complex real and imaginary parts together to sum real+image\r
+ register __vector float t1=vec_perm(v0,v1,real_pack_mask);\r
+ register __vector float ti=vec_perm(v0,v1,image_pack_mask); \r
+ v0=t1+ti; //sum quadruple real with quadruple image\r
+ register __vector float t2=vec_perm(v2,v3,real_pack_mask);\r
+ register __vector float ti2=vec_perm(v2,v3,image_pack_mask); \r
+ v1=t2+ti2;\r
+ t1=vec_perm(v4,v5,real_pack_mask);\r
+ ti=vec_perm(v4,v5,image_pack_mask); \r
+ v2=t1+ti; //sum\r
+ t2=vec_perm(v6,v7,real_pack_mask);\r
+ ti2=vec_perm(v6,v7,image_pack_mask); \r
+ v3=t2+ti2;\r
+ // now we have 16 summed elements . lets compare them\r
+ v_ptrx+=8;\r
+ register __vector bool int r1=vec_cmpgt(v0,v1);\r
+ register __vector bool int r2=vec_cmpgt(v2,v3);\r
+ register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1);\r
+ v0=vec_sel(v0,v1,r1); \r
+ register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2);\r
+ v1=vec_sel(v2,v3,r2);\r
+ //final cmp and select index and value for first 16 values\r
+ r1=vec_cmpgt(v0,v1);\r
+ register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1);\r
+ register __vector float vf0= vec_sel(v0,v1,r1); \r
+\r
+ //absolute temporary complex vectors\r
+ v0=vec_abs(v_ptrx[0]);\r
+ v1=vec_abs(v_ptrx[1]);\r
+ v2=vec_abs(v_ptrx[2]);\r
+ v3=vec_abs(v_ptrx[3]);\r
+ v4=vec_abs(v_ptrx[4]);\r
+ v5=vec_abs(v_ptrx[5]);\r
+ v6=vec_abs(v_ptrx[6]); \r
+ v7=vec_abs(v_ptrx[7]);\r
+\r
+ //pack complex real and imaginary parts together to sum real+image\r
+ t1=vec_perm(v0,v1,real_pack_mask);\r
+ ti=vec_perm(v0,v1,image_pack_mask); \r
+ v0=t1+ti; //sum quadruple real with quadruple image\r
+ t2=vec_perm(v2,v3,real_pack_mask);\r
+ ti2=vec_perm(v2,v3,image_pack_mask); \r
+ v1=t2+ti2;\r
+ t1=vec_perm(v4,v5,real_pack_mask);\r
+ ti=vec_perm(v4,v5,image_pack_mask); \r
+ v2=t1+ti; //sum\r
+ t2=vec_perm(v6,v7,real_pack_mask);\r
+ ti2=vec_perm(v6,v7,image_pack_mask); \r
+ v3=t2+ti2;\r
+ // now we have 16 summed elements {from 16 to 31} . lets compare them\r
+ v_ptrx+=8;\r
+ r1=vec_cmpgt(v0,v1);\r
+ r2=vec_cmpgt(v2,v3);\r
+ ind2= vec_sel(static_index0,static_index1,r1);\r
+ v0=vec_sel(v0,v1,r1); \r
+ ind3= vec_sel(static_index2,static_index3,r2);\r
+ v1=vec_sel(v2,v3,r2);\r
+ //final cmp and select index and value for the second 16 values\r
+ r1=vec_cmpgt(v0,v1);\r
+ register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1);\r
+ register __vector float vv0= vec_sel(v0,v1,r1); \r
+ indv0+=temp1; //make index from 16->31\r
+\r
+ //find final quadruple from 32 elements\r
+ r2=vec_cmpgt(vf0,vv0);\r
+ ind2 = vec_sel( indf0,indv0,r2);\r
+ vv0= vec_sel(vf0,vv0,r2); \r
+ //get asbolute index\r
+ ind2+=temp0;\r
+ //compare with old quadruple and update \r
+ r1=vec_cmpgt(quadruple_values,vv0);\r
+ quadruple_indices = vec_sel( quadruple_indices,ind2,r1);\r
+ quadruple_values= vec_sel(quadruple_values,vv0,r1); \r
+\r
+ temp0+=temp_add; \r
+ }\r
+\r
+ //now we have to chose from 4 values and 4 different indices\r
+ // we will compare pairwise if pairs are exactly the same we will choose minimum between index\r
+ // otherwise we will assign index of the minimum value\r
+ float a1,a2,a3,a4;\r
+ unsigned int i1,i2,i3,i4;\r
+ a1=vec_extract(quadruple_values,0);\r
+ a2=vec_extract(quadruple_values,1);\r
+ a3=vec_extract(quadruple_values,2);\r
+ a4=vec_extract(quadruple_values,3);\r
+ i1=vec_extract(quadruple_indices,0);\r
+ i2=vec_extract(quadruple_indices,1);\r
+ i3=vec_extract(quadruple_indices,2);\r
+ i4=vec_extract(quadruple_indices,3);\r
+ if(a1==a2){\r
+ index=i1>i2?i2:i1;\r
+ }else if(a2<a1){\r
+ index=i2;\r
+ a1=a2;\r
+ }else{\r
+ index= i1;\r
+ }\r
+\r
+ if(a4==a3){\r
+ i1=i3>i4?i4:i3;\r
+ }else if(a4<a3){\r
+ i1=i4;\r
+ a3=a4;\r
+ }else{\r
+ i1= i3;\r
+ }\r
+\r
+ if(a1==a3){\r
+ index=i1>index?index:i1;\r
+ *minf=a1; \r
+ }else if(a3<a1){\r
+ index=i1;\r
+ *minf=a3;\r
+ }else{ \r
+ *minf=a1;\r
+ }\r
+ return index;\r
+\r
+}\r
+ \r
+ \r
+\r
+ \r
+\r
+ \r
+ \r
+\r
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)\r
+{\r
+ BLASLONG i=0;\r
+ BLASLONG ix=0;\r
+ FLOAT minf;\r
+ BLASLONG min=0;\r
+ BLASLONG inc_x2;\r
+\r
+ if (n <= 0 || inc_x <= 0) return(min);\r
+ \r
+\r
+ if (inc_x == 1) {\r
+ minf = CABS1(x,0); //index will not be incremented\r
+ BLASLONG n1 = n & -32;\r
+ if (n1 > 0) {\r
+\r
+ min = ciamin_kernel_32(n1, x, &minf);\r
+ i = n1;\r
+ ix = n1 << 1;\r
+ }\r
+ \r
+\r
+ while(i < n)\r
+ {\r
+ if( CABS1(x,ix) < minf )\r
+ {\r
+ min = i;\r
+ minf = CABS1(x,ix);\r
+ }\r
+ ix += 2;\r
+ i++;\r
+ }\r
+ return (min + 1);\r
+\r
+ } else {\r
+ \r
+ inc_x2 = 2 * inc_x;\r
+\r
+ minf = CABS1(x,0);\r
+ ix += inc_x2;\r
+ i++;\r
+\r
+ while(i < n)\r
+ {\r
+ if( CABS1(x,ix) < minf )\r
+ {\r
+ min = i;\r
+ minf = CABS1(x,ix);\r
+ }\r
+ ix += inc_x2;\r
+ i++;\r
+ }\r
+ return (min + 1);\r
+ }\r
+ \r
+}\r
+\r
+\r
".p2align 5 \n\t"
"1: \n\t"
- "xvcmpgedp 2,44,45 \n\t "
- "xvcmpgedp 3,46,47 \n\t "
- "xvcmpgedp 4,48,49 \n\t "
- "xvcmpgedp 5,50,51 \n\t"
+ "xvcmpgtdp 2,44,45 \n\t "
+ "xvcmpgtdp 3,46,47 \n\t "
+ "xvcmpgtdp 4,48,49 \n\t "
+ "xvcmpgtdp 5,50,51 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
- "xvcmpgedp 2,0, 1 \n\t"
- "xvcmpgedp 3, 45,47 \n\t"
+ "xvcmpgtdp 2,0, 1 \n\t"
+ "xvcmpgtdp 3, 45,47 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//choose smaller from first and second part
- "xvcmpgedp 4, 0,5 \n\t"
+ "xvcmpgtdp 4, 0,5 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
- "xvcmpgedp 2,39, 3 \n\t"
+ "xvcmpgtdp 2,39, 3 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//<-----------jump here from first load
"2: \n\t"
- "xvcmpgedp 2,44,45 \n\t "
- "xvcmpgedp 3,46,47 \n\t "
- "xvcmpgedp 4,48,49 \n\t "
- "xvcmpgedp 5,50,51 \n\t"
+ "xvcmpgtdp 2,44,45 \n\t "
+ "xvcmpgtdp 3,46,47 \n\t "
+ "xvcmpgtdp 4,48,49 \n\t "
+ "xvcmpgtdp 5,50,51 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
- "xvcmpgedp 2,0, 1 \n\t"
- "xvcmpgedp 3, 45,47 \n\t"
+ "xvcmpgtdp 2,0, 1 \n\t"
+ "xvcmpgtdp 3, 45,47 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//choose smaller from first and second part
- "xvcmpgedp 4, 0,5 \n\t"
+ "xvcmpgtdp 4, 0,5 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
- "xvcmpgedp 2,39, 3 \n\t"
+ "xvcmpgtdp 2,39, 3 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//==============================================================================
- "xvcmpgedp 2,44,45 \n\t "
- "xvcmpgedp 3,46,47 \n\t "
- "xvcmpgedp 4,48,49 \n\t "
- "xvcmpgedp 5,50,51 \n\t"
+ "xvcmpgtdp 2,44,45 \n\t "
+ "xvcmpgtdp 3,46,47 \n\t "
+ "xvcmpgtdp 4,48,49 \n\t "
+ "xvcmpgtdp 5,50,51 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
- "xvcmpgedp 2,0, 1 \n\t"
- "xvcmpgedp 3, 45,47 \n\t"
+ "xvcmpgtdp 2,0, 1 \n\t"
+ "xvcmpgtdp 3, 45,47 \n\t"
"xxsel 32,32,33,2 \n\t"
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
//choose smaller from first and second part
- "xvcmpgedp 4, 0,5 \n\t"
+ "xvcmpgtdp 4, 0,5 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first smaller
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
- "xvcmpgedp 2,39, 3 \n\t"
+ "xvcmpgtdp 2,39, 3 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
- "xvcmpgedp 4,39, 40 \n\t"
+ "xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+#include "common.h"\r
+#include <math.h>\r
+#include <altivec.h>\r
+\r
+\r
+#if defined(DOUBLE)\r
+ #define ABS fabs\r
+#else\r
+ #define ABS fabsf\r
+#endif\r
+\r
+/**\r
+ * Find maximum index \r
+ * Warning: requirements n>0 and n % 64 == 0\r
+ * @param n \r
+ * @param x pointer to the vector\r
+ * @param maxf (out) maximum absolute value .( only for output )\r
+ * @return index \r
+ */\r
+/* Vectorized core of ISAMAX.  Scans n floats (caller guarantees n>0 and\r
+ n%64==0) in groups of 64, keeping per-lane running maxima and their\r
+ indices; the scalar epilogue reduces the final 4 lanes.  Stores the\r
+ largest |x[i]| in *maxf and returns its 0-based index.  Strict '>'\r
+ comparisons throughout keep the earliest index on ties. */\r
+static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) {\r
+ BLASLONG index;\r
+ BLASLONG i=0;\r
+ // static_index0..3 are the lane offsets 0..15 inside one 16-float group\r
+ register __vector unsigned int static_index0 = {0,1,2,3};\r
+ register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register\r
+ register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}\r
+ register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};\r
+ register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};\r
+ register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};\r
+ temp0=vec_xor(temp0,temp0);\r
+ temp1=temp1 <<1 ; //{16,16,16,16}; temp0 now tracks the running base offset\r
+ register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}\r
+ register __vector float quadruple_values={0,0,0,0};\r
+ register __vector float * v_ptrx=(__vector float *)x;\r
+ for(; i<n; i+=64){\r
+ //absolute temporary vectors\r
+ register __vector float v0=vec_abs(v_ptrx[0]);\r
+ register __vector float v1=vec_abs(v_ptrx[1]);\r
+ register __vector float v2=vec_abs(v_ptrx[2]);\r
+ register __vector float v3=vec_abs(v_ptrx[3]);\r
+ register __vector float v4=vec_abs(v_ptrx[4]);\r
+ register __vector float v5=vec_abs(v_ptrx[5]);\r
+ register __vector float v6=vec_abs(v_ptrx[6]); \r
+ register __vector float v7=vec_abs(v_ptrx[7]);\r
+ //cmp quadruple pairs\r
+ register __vector bool int r1=vec_cmpgt(v1,v0);\r
+ register __vector bool int r2=vec_cmpgt(v3,v2);\r
+ register __vector bool int r3=vec_cmpgt(v5,v4);\r
+ register __vector bool int r4=vec_cmpgt(v7,v6);\r
+ \r
+ //select\r
+ register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);\r
+ register __vector float vf0= vec_sel(v0,v1,r1);\r
+\r
+ register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);\r
+ register __vector float vf1= vec_sel(v2,v3,r2);\r
+\r
+ register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);\r
+ v0=vec_sel(v4,v5,r3);\r
+\r
+ register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);\r
+ v1=vec_sel(v6,v7,r4);\r
+\r
+ // cmp selected\r
+ r1=vec_cmpgt(vf1,vf0);\r
+ r2=vec_cmpgt(v1,v0);\r
+\r
+ v_ptrx+=8;\r
+ //select from above \r
+ ind0_first= vec_sel(ind0_first,ind1,r1);\r
+ vf0= vec_sel(vf0,vf1,r1) ;\r
+\r
+ ind2= vec_sel(ind2,ind3,r2);\r
+ vf1= vec_sel(v0,v1,r2);\r
+\r
+ //second indices actually should be within [16,31] so ind2+16\r
+ ind2 +=temp1;\r
+ \r
+ //final cmp and select index and value for the first 32 values\r
+ r1=vec_cmpgt(vf1,vf0);\r
+ ind0_first = vec_sel(ind0_first,ind2,r1);\r
+ vf0= vec_sel(vf0,vf1,r1);\r
+ \r
+ ind0_first+=temp0; //get absolute index\r
+\r
+ temp0+=temp1;\r
+ temp0+=temp1; //temp0+32\r
+ //second part of 32\r
+ // absolute temporary vectors\r
+ v0=vec_abs(v_ptrx[0]);\r
+ v1=vec_abs(v_ptrx[1]);\r
+ v2=vec_abs(v_ptrx[2]);\r
+ v3=vec_abs(v_ptrx[3]);\r
+ v4=vec_abs(v_ptrx[4]);\r
+ v5=vec_abs(v_ptrx[5]);\r
+ v6=vec_abs(v_ptrx[6]); \r
+ v7=vec_abs(v_ptrx[7]);\r
+ //cmp quadruple pairs\r
+ r1=vec_cmpgt(v1,v0);\r
+ r2=vec_cmpgt(v3,v2);\r
+ r3=vec_cmpgt(v5,v4);\r
+ r4=vec_cmpgt(v7,v6);\r
+ //select\r
+ register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);\r
+ register __vector float vv0= vec_sel(v0,v1,r1);\r
+\r
+ ind1= vec_sel(static_index2,static_index3,r2);\r
+ register __vector float vv1= vec_sel(v2,v3,r2);\r
+\r
+ ind2= vec_sel(static_index0,static_index1,r3);\r
+ v0=vec_sel(v4,v5,r3);\r
+\r
+ ind3= vec_sel(static_index2,static_index3,r4);\r
+ v1=vec_sel(v6,v7,r4);\r
+\r
+ // cmp selected\r
+ r1=vec_cmpgt(vv1,vv0);\r
+ r2=vec_cmpgt(v1,v0);\r
+\r
+ v_ptrx+=8;\r
+ //select from above \r
+ ind0_second= vec_sel(ind0_second,ind1,r1);\r
+ vv0= vec_sel(vv0,vv1,r1) ;\r
+\r
+ ind2= vec_sel(ind2,ind3,r2);\r
+ vv1= vec_sel(v0,v1,r2) ; \r
+\r
+ //second indices actually should be within [16,31] so ind2+16\r
+ ind2 +=temp1;\r
+ \r
+ //final cmp and select index and value for the second 32 values\r
+ r1=vec_cmpgt(vv1,vv0);\r
+ ind0_second = vec_sel(ind0_second,ind2,r1);\r
+ vv0= vec_sel(vv0,vv1,r1);\r
+\r
+ ind0_second+=temp0; //get absolute index\r
+ \r
+ //find final quadruple from 64 elements\r
+ r2=vec_cmpgt(vv0,vf0);\r
+ ind2 = vec_sel( ind0_first,ind0_second,r2);\r
+ vv0= vec_sel(vf0,vv0,r2); \r
+\r
+ //compare with old quadruple and update (strict '>' keeps earliest index on ties)\r
+ r3=vec_cmpgt(vv0,quadruple_values);\r
+ quadruple_indices = vec_sel( quadruple_indices,ind2,r3);\r
+ quadruple_values= vec_sel(quadruple_values,vv0,r3); \r
+\r
+ temp0+=temp1;\r
+ temp0+=temp1; //temp0+32\r
+ \r
+ }\r
+\r
+ //now we have to choose from 4 values and 4 different indices\r
+ // compare pairwise: if a pair holds exactly the same value keep the smaller index,\r
+ // otherwise take the index of the maximum value\r
+ float a1,a2,a3,a4;\r
+ unsigned int i1,i2,i3,i4;\r
+ a1=vec_extract(quadruple_values,0);\r
+ a2=vec_extract(quadruple_values,1);\r
+ a3=vec_extract(quadruple_values,2);\r
+ a4=vec_extract(quadruple_values,3);\r
+ i1=vec_extract(quadruple_indices,0);\r
+ i2=vec_extract(quadruple_indices,1);\r
+ i3=vec_extract(quadruple_indices,2);\r
+ i4=vec_extract(quadruple_indices,3);\r
+ if(a1==a2){\r
+ index=i1>i2?i2:i1;\r
+ }else if(a2>a1){\r
+ index=i2;\r
+ a1=a2;\r
+ }else{\r
+ index= i1;\r
+ }\r
+\r
+ if(a4==a3){\r
+ i1=i3>i4?i4:i3;\r
+ }else if(a4>a3){\r
+ i1=i4;\r
+ a3=a4;\r
+ }else{\r
+ i1= i3;\r
+ }\r
+\r
+ if(a1==a3){\r
+ index=i1>index?index:i1;\r
+ *maxf=a1; \r
+ }else if(a3>a1){\r
+ index=i1;\r
+ *maxf=a3;\r
+ }else{ \r
+ *maxf=a1;\r
+ }\r
+ return index;\r
+\r
+}\r
+\r
+/**\r
+ * ISAMAX: return the 1-based index of the element with the largest\r
+ * absolute value, or 0 when n<=0 or inc_x<=0.  Strict '>' comparisons\r
+ * keep the earliest index on ties.\r
+ */\r
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {\r
+ BLASLONG i = 0;\r
+ BLASLONG j = 0;\r
+ FLOAT maxf = 0.0;\r
+ BLASLONG max = 0;\r
+\r
+ if (n <= 0 || inc_x <= 0) return (max);\r
+\r
+ if (inc_x == 1) {\r
+\r
+ // vectorized kernel covers the largest multiple of 64 elements\r
+ BLASLONG n1 = n & -64;\r
+ if (n1 > 0) {\r
+\r
+ max = siamax_kernel_64(n1, x, &maxf);\r
+\r
+ i = n1;\r
+ }\r
+\r
+ // scalar tail for the remaining n - n1 elements\r
+ while (i < n) {\r
+ if (ABS(x[i]) > maxf) {\r
+ max = i;\r
+ maxf = ABS(x[i]);\r
+ }\r
+ i++;\r
+ }\r
+ return (max + 1); //BLAS result is 1-based\r
+\r
+ } else {\r
+\r
+ // strided path, unrolled by 4; i walks the raw offsets, j the element count\r
+ BLASLONG n1 = n & -4;\r
+ while (j < n1) {\r
+\r
+ if (ABS(x[i]) > maxf) {\r
+ max = j;\r
+ maxf = ABS(x[i]);\r
+ }\r
+ if (ABS(x[i + inc_x]) > maxf) {\r
+ max = j + 1;\r
+ maxf = ABS(x[i + inc_x]);\r
+ }\r
+ if (ABS(x[i + 2 * inc_x]) > maxf) {\r
+ max = j + 2;\r
+ maxf = ABS(x[i + 2 * inc_x]);\r
+ }\r
+ if (ABS(x[i + 3 * inc_x]) > maxf) {\r
+ max = j + 3;\r
+ maxf = ABS(x[i + 3 * inc_x]);\r
+ }\r
+\r
+ i += inc_x * 4;\r
+\r
+ j += 4;\r
+\r
+ }\r
+\r
+\r
+ while (j < n) {\r
+ if (ABS(x[i]) > maxf) {\r
+ max = j;\r
+ maxf = ABS(x[i]);\r
+ }\r
+ i += inc_x;\r
+ j++;\r
+ }\r
+ return (max + 1);\r
+ }\r
+}\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+#include "common.h"\r
+#include <math.h>\r
+#include <altivec.h>\r
+#if defined(DOUBLE)\r
+ #define ABS fabs\r
+#else\r
+ #define ABS fabsf\r
+#endif\r
+/**\r
+ * Find minimum index \r
+ * Warning: requirements n>0 and n % 64 == 0\r
+ * @param n \r
+ * @param x pointer to the vector\r
+ * @param minf (out) minimum absolute value .( only for output )\r
+ * @return index \r
+ */\r
+/* Vectorized core of ISAMIN.  Same structure as siamax_kernel_64 but with\r
+ the comparison operands swapped so the smaller value wins, and with\r
+ quadruple_values seeded from the first four elements (a minimum cannot\r
+ start from zero).  Caller guarantees n>0 and n%64==0.  Stores the\r
+ smallest |x[i]| in *minf and returns its 0-based index. */\r
+static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) {\r
+ BLASLONG index;\r
+ BLASLONG i=0;\r
+ // static_index0..3 are the lane offsets 0..15 inside one 16-float group\r
+ register __vector unsigned int static_index0 = {0,1,2,3};\r
+ register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register\r
+ register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}\r
+ register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};\r
+ register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};\r
+ register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};\r
+ temp0=vec_xor(temp0,temp0);\r
+ temp1=temp1 <<1 ; //{16,16,16,16}; temp0 now tracks the running base offset\r
+ register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3};\r
+ register __vector float * v_ptrx=(__vector float *)x;\r
+ register __vector float quadruple_values=vec_abs(v_ptrx[0]);\r
+ for(; i<n; i+=64){\r
+ //absolute temporary vectors\r
+ register __vector float v0=vec_abs(v_ptrx[0]);\r
+ register __vector float v1=vec_abs(v_ptrx[1]);\r
+ register __vector float v2=vec_abs(v_ptrx[2]);\r
+ register __vector float v3=vec_abs(v_ptrx[3]);\r
+ register __vector float v4=vec_abs(v_ptrx[4]);\r
+ register __vector float v5=vec_abs(v_ptrx[5]);\r
+ register __vector float v6=vec_abs(v_ptrx[6]); \r
+ register __vector float v7=vec_abs(v_ptrx[7]);\r
+ //cmp quadruple pairs (operands swapped vs. iamax: true selects the smaller)\r
+ register __vector bool int r1=vec_cmpgt(v0,v1);\r
+ register __vector bool int r2=vec_cmpgt(v2,v3);\r
+ register __vector bool int r3=vec_cmpgt(v4,v5);\r
+ register __vector bool int r4=vec_cmpgt(v6,v7);\r
+ \r
+ //select\r
+ register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1);\r
+ register __vector float vf0= vec_sel(v0,v1,r1);\r
+\r
+ register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2);\r
+ register __vector float vf1= vec_sel(v2,v3,r2);\r
+\r
+ register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3);\r
+ v0=vec_sel(v4,v5,r3);\r
+\r
+ register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4);\r
+ v1=vec_sel(v6,v7,r4);\r
+\r
+ // cmp selected\r
+ r1=vec_cmpgt(vf0,vf1);\r
+ r2=vec_cmpgt(v0,v1);\r
+\r
+ v_ptrx+=8;\r
+ //select from above \r
+ ind0_first= vec_sel(ind0_first,ind1,r1);\r
+ vf0= vec_sel(vf0,vf1,r1) ;\r
+\r
+ ind2= vec_sel(ind2,ind3,r2);\r
+ vf1= vec_sel(v0,v1,r2);\r
+\r
+ //second indices actually should be within [16,31] so ind2+16\r
+ ind2 +=temp1;\r
+ \r
+ //final cmp and select index and value for the first 32 values\r
+ r1=vec_cmpgt(vf0,vf1);\r
+ ind0_first = vec_sel(ind0_first,ind2,r1);\r
+ vf0= vec_sel(vf0,vf1,r1);\r
+ \r
+ ind0_first+=temp0; //get absolute index\r
+ \r
+ temp0+=temp1;\r
+ temp0+=temp1; //temp0+32\r
+ //second part of 32\r
+ // absolute temporary vectors\r
+ v0=vec_abs(v_ptrx[0]);\r
+ v1=vec_abs(v_ptrx[1]);\r
+ v2=vec_abs(v_ptrx[2]);\r
+ v3=vec_abs(v_ptrx[3]);\r
+ v4=vec_abs(v_ptrx[4]);\r
+ v5=vec_abs(v_ptrx[5]);\r
+ v6=vec_abs(v_ptrx[6]); \r
+ v7=vec_abs(v_ptrx[7]);\r
+ //cmp quadruple pairs\r
+ r1=vec_cmpgt(v0,v1);\r
+ r2=vec_cmpgt(v2,v3);\r
+ r3=vec_cmpgt(v4,v5);\r
+ r4=vec_cmpgt(v6,v7);\r
+ //select\r
+ register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1);\r
+ register __vector float vv0= vec_sel(v0,v1,r1);\r
+\r
+ ind1= vec_sel(static_index2,static_index3,r2);\r
+ register __vector float vv1= vec_sel(v2,v3,r2);\r
+\r
+ ind2= vec_sel(static_index0,static_index1,r3);\r
+ v0=vec_sel(v4,v5,r3);\r
+\r
+ ind3= vec_sel(static_index2,static_index3,r4);\r
+ v1=vec_sel(v6,v7,r4);\r
+\r
+ // cmp selected\r
+ r1=vec_cmpgt(vv0,vv1);\r
+ r2=vec_cmpgt(v0,v1);\r
+\r
+ v_ptrx+=8;\r
+ //select from above \r
+ ind0_second= vec_sel(ind0_second,ind1,r1);\r
+ vv0= vec_sel(vv0,vv1,r1) ;\r
+\r
+ ind2= vec_sel(ind2,ind3,r2);\r
+ vv1= vec_sel(v0,v1,r2) ; \r
+\r
+ //second indices actually should be within [16,31] so ind2+16\r
+ ind2 +=temp1;\r
+ \r
+ //final cmp and select index and value for the second 32 values\r
+ r1=vec_cmpgt(vv0,vv1);\r
+ ind0_second = vec_sel(ind0_second,ind2,r1);\r
+ vv0= vec_sel(vv0,vv1,r1);\r
+\r
+ ind0_second+=temp0; //get absolute index\r
+ \r
+ //find final quadruple from 64 elements\r
+ r2=vec_cmpgt(vf0,vv0);\r
+ ind2 = vec_sel( ind0_first,ind0_second,r2);\r
+ vv0= vec_sel(vf0,vv0,r2); \r
+ \r
+ //compare with old quadruple and update (strict '>' keeps earliest index on ties)\r
+ r3=vec_cmpgt( quadruple_values,vv0);\r
+ quadruple_indices = vec_sel( quadruple_indices,ind2,r3);\r
+ quadruple_values= vec_sel(quadruple_values,vv0,r3); \r
+ \r
+ temp0+=temp1;\r
+ temp0+=temp1; //temp0+32\r
+ \r
+ \r
+ }\r
+\r
+ //now we have to choose from 4 values and 4 different indices\r
+ // compare pairwise: if a pair holds exactly the same value keep the smaller index,\r
+ // otherwise take the index of the minimum value\r
+ float a1,a2,a3,a4;\r
+ unsigned int i1,i2,i3,i4;\r
+ a1=vec_extract(quadruple_values,0);\r
+ a2=vec_extract(quadruple_values,1);\r
+ a3=vec_extract(quadruple_values,2);\r
+ a4=vec_extract(quadruple_values,3);\r
+ i1=vec_extract(quadruple_indices,0);\r
+ i2=vec_extract(quadruple_indices,1);\r
+ i3=vec_extract(quadruple_indices,2);\r
+ i4=vec_extract(quadruple_indices,3);\r
+ if(a1==a2){\r
+ index=i1>i2?i2:i1;\r
+ }else if(a2<a1){\r
+ index=i2;\r
+ a1=a2;\r
+ }else{\r
+ index= i1;\r
+ }\r
+\r
+ if(a4==a3){\r
+ i1=i3>i4?i4:i3;\r
+ }else if(a4<a3){\r
+ i1=i4;\r
+ a3=a4;\r
+ }else{\r
+ i1= i3;\r
+ }\r
+\r
+ if(a1==a3){\r
+ index=i1>index?index:i1;\r
+ *minf=a1; \r
+ }else if(a3<a1){\r
+ index=i1;\r
+ *minf=a3;\r
+ }else{ \r
+ *minf=a1;\r
+ }\r
+ return index;\r
+\r
+}\r
+\r
+\r
+\r
+\r
+/**\r
+ * ISAMIN: return the 1-based index of the element with the smallest\r
+ * absolute value, or 0 when n<=0 or inc_x<=0.  Strict '<' comparisons\r
+ * keep the earliest index on ties.\r
+ */\r
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {\r
+ BLASLONG i = 0;\r
+ BLASLONG j = 0; \r
+ BLASLONG min = 0;\r
+ FLOAT minf = 0.0;\r
+ \r
+ if (n <= 0 || inc_x <= 0) return (min);\r
+ minf = ABS(x[0]); //seed with element 0; min stays 0 unless beaten\r
+ if (inc_x == 1) {\r
+\r
+ // vectorized kernel covers the largest multiple of 64 elements\r
+ BLASLONG n1 = n & -64;\r
+ if (n1 > 0) {\r
+\r
+ min = siamin_kernel_64(n1, x, &minf);\r
+ i = n1;\r
+ }\r
+\r
+ // scalar tail for the remaining n - n1 elements\r
+ while (i < n) {\r
+ if (ABS(x[i]) < minf) {\r
+ min = i;\r
+ minf = ABS(x[i]);\r
+ }\r
+ i++;\r
+ }\r
+ return (min + 1); //BLAS result is 1-based\r
+\r
+ } else {\r
+\r
+ // strided path, unrolled by 4; i walks the raw offsets, j the element count\r
+ BLASLONG n1 = n & -4;\r
+ while (j < n1) {\r
+\r
+ if (ABS(x[i]) < minf) {\r
+ min = j;\r
+ minf = ABS(x[i]);\r
+ }\r
+ if (ABS(x[i + inc_x]) < minf) {\r
+ min = j + 1;\r
+ minf = ABS(x[i + inc_x]);\r
+ }\r
+ if (ABS(x[i + 2 * inc_x]) < minf) {\r
+ min = j + 2;\r
+ minf = ABS(x[i + 2 * inc_x]);\r
+ }\r
+ if (ABS(x[i + 3 * inc_x]) < minf) {\r
+ min = j + 3;\r
+ minf = ABS(x[i + 3 * inc_x]);\r
+ }\r
+\r
+ i += inc_x * 4;\r
+\r
+ j += 4;\r
+\r
+ }\r
+\r
+\r
+ while (j < n) {\r
+ if (ABS(x[i]) < minf) {\r
+ min = j;\r
+ minf = ABS(x[i]);\r
+ }\r
+ i += inc_x;\r
+ j++;\r
+ }\r
+ return (min + 1);\r
+ }\r
+}\r
- "xvcmpgedp 50,46,47 \n\t "
- "xvcmpgedp 51,48,49 \n\t "
+ "xvcmpgtdp 50,46,47 \n\t "
+ "xvcmpgtdp 51,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
- "xvcmpgedp 2,0,1 \n\t "
+ "xvcmpgtdp 2,0,1 \n\t "
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//cmp with previous
- "xvcmpgedp 4,39,3 \n\t "
+ "xvcmpgtdp 4,39,3 \n\t "
"vaddudm 5,5,4 \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
- "xvcmpgedp 50,46,47 \n\t "
- "xvcmpgedp 51,48,49 \n\t "
+ "xvcmpgtdp 50,46,47 \n\t "
+ "xvcmpgtdp 51,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
- "xvcmpgedp 2,0,1 \n\t "
+ "xvcmpgtdp 2,0,1 \n\t "
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//cmp with previous
- "xvcmpgedp 4,39,3 \n\t "
+ "xvcmpgtdp 4,39,3 \n\t "
"vaddudm 5,5,4 \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
- "xvcmpgedp 50,46,47 \n\t "
- "xvcmpgedp 51,48,49 \n\t "
+ "xvcmpgtdp 50,46,47 \n\t "
+ "xvcmpgtdp 51,48,49 \n\t "
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
- "xvcmpgedp 2,0,1 \n\t "
+ "xvcmpgtdp 2,0,1 \n\t "
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
//cmp with previous
- "xvcmpgedp 4,39,3 \n\t "
+ "xvcmpgtdp 4,39,3 \n\t "
"vaddudm 5,5,4 \n\t"
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
- "xvcmpgedp 4,39, 40 \n\t"
+ "xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+
+
+
+#ifndef HAVE_KERNEL_8
+#include <altivec.h>
+
+/* Fallback vector kernel: y += alpha * x for n contiguous floats.
+ * The loop walks n/4 four-float vectors, 16 per iteration (64 floats),
+ * so it is only fully correct when n is a multiple of 64 — the caller
+ * guarantees this via n & -64. */
+static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
+{
+ BLASLONG i = 0;
+ __vector float v_a = {alpha,alpha,alpha,alpha}; //alpha splatted to all lanes
+ __vector float * v_y=(__vector float *)y;
+ __vector float * v_x=(__vector float *)x;
+
+ for(; i<n/4; i+=16){
+
+ v_y[i] += v_a * v_x[i];
+ v_y[i+1] += v_a * v_x[i+1];
+ v_y[i+2] += v_a * v_x[i+2];
+ v_y[i+3] += v_a * v_x[i+3];
+ v_y[i+4] += v_a * v_x[i+4];
+ v_y[i+5] += v_a * v_x[i+5];
+ v_y[i+6] += v_a * v_x[i+6];
+ v_y[i+7] += v_a * v_x[i+7];
+ v_y[i+8] += v_a * v_x[i+8];
+ v_y[i+9] += v_a * v_x[i+9];
+ v_y[i+10] += v_a * v_x[i+10];
+ v_y[i+11] += v_a * v_x[i+11];
+ v_y[i+12] += v_a * v_x[i+12];
+ v_y[i+13] += v_a * v_x[i+13];
+ v_y[i+14] += v_a * v_x[i+14];
+ v_y[i+15] += v_a * v_x[i+15];
+ }
+}
+#endif
+
+/**
+ * SAXPY: y := da * x + y over n elements with strides inc_x / inc_y.
+ * The dummy* parameters only exist to match the generic BLAS kernel
+ * signature and are unused.  Always returns 0.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+
+ // contiguous fast path: vector kernel on the largest multiple of 64
+ BLASLONG n1 = n & -64;
+
+ if ( n1 )
+ saxpy_kernel_64(n1, x, y, da);
+
+ // scalar tail for the remaining n - n1 elements
+ i = n1;
+ while(i < n)
+ {
+
+ y[i] += da * x[i] ;
+ i++ ;
+
+ }
+ return(0);
+
+
+ }
+
+ // strided path, unrolled by 4
+ BLASLONG n1 = n & -4;
+
+ while(i < n1)
+ {
+
+ FLOAT m1 = da * x[ix] ;
+ FLOAT m2 = da * x[ix+inc_x] ;
+ FLOAT m3 = da * x[ix+2*inc_x] ;
+ FLOAT m4 = da * x[ix+3*inc_x] ;
+
+ y[iy] += m1 ;
+ y[iy+inc_y] += m2 ;
+ y[iy+2*inc_y] += m3 ;
+ y[iy+3*inc_y] += m4 ;
+
+ ix += inc_x*4 ;
+ iy += inc_y*4 ;
+ i+=4 ;
+
+ }
+
+ while(i < n)
+ {
+
+ y[iy] += da * x[ix] ;
+ ix += inc_x ;
+ iy += inc_y ;
+ i++ ;
+
+ }
+ return(0);
+
+}
+
+
+