USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
+/*
+ * Avoid contraction of floating point operations, specifically fused
+ * multiply-add, because they can cause unexpected results in complex
+ * multiplication.
+ */
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC optimize ("fp-contract=off")
+#endif
+
+#if defined(__clang__)
+#pragma clang fp contract(off)
+#endif
+
#include "common.h"
+#include "vector-common.h"
-static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
- __asm__("vlrepf %%v0,0(%[alpha])\n\t"
- "vlef %%v1,4(%[alpha]),0\n\t"
- "vlef %%v1,4(%[alpha]),2\n\t"
- "vflcsb %%v1,%%v1\n\t"
- "vlef %%v1,4(%[alpha]),1\n\t"
- "vlef %%v1,4(%[alpha]),3\n\t"
- "srlg %[n],%[n],4\n\t"
- "xgr %%r1,%%r1\n\t"
- "0:\n\t"
- "pfd 2, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
- "verllg %%v24,%%v16,32\n\t"
- "verllg %%v25,%%v17,32\n\t"
- "verllg %%v26,%%v18,32\n\t"
- "verllg %%v27,%%v19,32\n\t"
- "verllg %%v28,%%v20,32\n\t"
- "verllg %%v29,%%v21,32\n\t"
- "verllg %%v30,%%v22,32\n\t"
- "verllg %%v31,%%v23,32\n\t"
- "vfmsb %%v16,%%v16,%%v0\n\t"
- "vfmsb %%v17,%%v17,%%v0\n\t"
- "vfmsb %%v18,%%v18,%%v0\n\t"
- "vfmsb %%v19,%%v19,%%v0\n\t"
- "vfmsb %%v20,%%v20,%%v0\n\t"
- "vfmsb %%v21,%%v21,%%v0\n\t"
- "vfmsb %%v22,%%v22,%%v0\n\t"
- "vfmsb %%v23,%%v23,%%v0\n\t"
- "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
- "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
- "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
- "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
- "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
- "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
- "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
- "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
- "vst %%v16,0(%%r1,%[x])\n\t"
- "vst %%v17,16(%%r1,%[x])\n\t"
- "vst %%v18,32(%%r1,%[x])\n\t"
- "vst %%v19,48(%%r1,%[x])\n\t"
- "vst %%v20,64(%%r1,%[x])\n\t"
- "vst %%v21,80(%%r1,%[x])\n\t"
- "vst %%v22,96(%%r1,%[x])\n\t"
- "vst %%v23,112(%%r1,%[x])\n\t"
- "agfi %%r1,128\n\t"
- "brctg %[n],0b"
- : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
- : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
- [alpha] "a"(alpha)
- : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
- "v31");
+static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
+ vector_float da_r_vec = vec_splats(da_r);
+ vector_float da_i_vec = { -da_i, da_i, -da_i, da_i };
+
+ vector_float *x_vec_ptr = (vector_float *)x;
+
+#pragma GCC unroll 16
+ for (size_t i = 0; i < n/2; i++) {
+ vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
+ vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]};
+
+ x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
+ }
}
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
: "cc", "r1", "v0");
}
-static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
+static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
BLASLONG inc_x) {
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
- FLOAT da_r = alpha[0];
- FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1];
BLASLONG n1 = n & -8;
if (n1 > 0) {
- alpha[0] = da_r;
- alpha[1] = da_i;
- cscal_kernel_inc_8(n1, alpha, x, inc_x);
+ cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
j = n1;
i = n1 * inc_x;
}
else if (da_i == 0)
cscal_kernel_16_zero_i(n1, alpha, x);
else
- cscal_kernel_16(n1, alpha, x);
+ cscal_kernel_16(n1, da_r, da_i, x);
i = n1 << 1;
j = n1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
+/*
+ * Avoid contraction of floating point operations, specifically fused
+ * multiply-add, because they can cause unexpected results in complex
+ * multiplication.
+ */
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC optimize ("fp-contract=off")
+#endif
+
+#if defined(__clang__)
+#pragma clang fp contract(off)
+#endif
+
#include "common.h"
+#include "vector-common.h"
-static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) {
- __asm__("vlrepg %%v0,0(%[alpha])\n\t"
- "vleg %%v1,8(%[alpha]),0\n\t"
- "wflcdb %%v1,%%v1\n\t"
- "vleg %%v1,8(%[alpha]),1\n\t"
- "srlg %[n],%[n],3\n\t"
- "xgr %%r1,%%r1\n\t"
- "0:\n\t"
- "pfd 2, 1024(%%r1,%[x])\n\t"
- "vl %%v16,0(%%r1,%[x])\n\t"
- "vl %%v17,16(%%r1,%[x])\n\t"
- "vl %%v18,32(%%r1,%[x])\n\t"
- "vl %%v19,48(%%r1,%[x])\n\t"
- "vl %%v20,64(%%r1,%[x])\n\t"
- "vl %%v21,80(%%r1,%[x])\n\t"
- "vl %%v22,96(%%r1,%[x])\n\t"
- "vl %%v23,112(%%r1,%[x])\n\t"
- "vpdi %%v24,%%v16,%%v16,4\n\t"
- "vpdi %%v25,%%v17,%%v17,4\n\t"
- "vpdi %%v26,%%v18,%%v18,4\n\t"
- "vpdi %%v27,%%v19,%%v19,4\n\t"
- "vpdi %%v28,%%v20,%%v20,4\n\t"
- "vpdi %%v29,%%v21,%%v21,4\n\t"
- "vpdi %%v30,%%v22,%%v22,4\n\t"
- "vpdi %%v31,%%v23,%%v23,4\n\t"
- "vfmdb %%v16,%%v16,%%v0\n\t"
- "vfmdb %%v17,%%v17,%%v0\n\t"
- "vfmdb %%v18,%%v18,%%v0\n\t"
- "vfmdb %%v19,%%v19,%%v0\n\t"
- "vfmdb %%v20,%%v20,%%v0\n\t"
- "vfmdb %%v21,%%v21,%%v0\n\t"
- "vfmdb %%v22,%%v22,%%v0\n\t"
- "vfmdb %%v23,%%v23,%%v0\n\t"
- "vfmadb %%v16,%%v24,%%v1,%%v16\n\t"
- "vfmadb %%v17,%%v25,%%v1,%%v17\n\t"
- "vfmadb %%v18,%%v26,%%v1,%%v18\n\t"
- "vfmadb %%v19,%%v27,%%v1,%%v19\n\t"
- "vfmadb %%v20,%%v28,%%v1,%%v20\n\t"
- "vfmadb %%v21,%%v29,%%v1,%%v21\n\t"
- "vfmadb %%v22,%%v30,%%v1,%%v22\n\t"
- "vfmadb %%v23,%%v31,%%v1,%%v23\n\t"
- "vst %%v16,0(%%r1,%[x])\n\t"
- "vst %%v17,16(%%r1,%[x])\n\t"
- "vst %%v18,32(%%r1,%[x])\n\t"
- "vst %%v19,48(%%r1,%[x])\n\t"
- "vst %%v20,64(%%r1,%[x])\n\t"
- "vst %%v21,80(%%r1,%[x])\n\t"
- "vst %%v22,96(%%r1,%[x])\n\t"
- "vst %%v23,112(%%r1,%[x])\n\t"
- "agfi %%r1,128\n\t"
- "brctg %[n],0b"
- : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
- : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
- [alpha] "a"(alpha)
- : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
- "v31");
+static void zscal_kernel_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
+ vector_float da_r_vec = vec_splats(da_r);
+ vector_float da_i_vec = { -da_i, da_i };
+
+ vector_float * x_vec_ptr = (vector_float *)x;
+
+#pragma GCC unroll 16
+ for (size_t i = 0; i < n; i++) {
+ vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
+ vector_float x_swapped = {x_vec[1], x_vec[0]};
+
+ x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
+ }
}
static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
: "cc", "r1", "v0");
}
-static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
+static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
BLASLONG inc_x) {
BLASLONG i;
BLASLONG inc_x2 = 2 * inc_x;
BLASLONG inc_x3 = inc_x2 + inc_x;
FLOAT t0, t1, t2, t3;
- FLOAT da_r = alpha[0];
- FLOAT da_i = alpha[1];
for (i = 0; i < n; i += 4) {
t0 = da_r * x[0] - da_i * x[1];
BLASLONG n1 = n & -8;
if (n1 > 0) {
- alpha[0] = da_r;
- alpha[1] = da_i;
- zscal_kernel_inc_8(n1, alpha, x, inc_x);
+ zscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
j = n1;
i = n1 * inc_x;
}
else if (da_i == 0)
zscal_kernel_8_zero_i(n1, alpha, x);
else
- zscal_kernel_8(n1, alpha, x);
+ zscal_kernel_8(n1, da_r, da_i, x);
i = n1 << 1;
j = n1;