endif
ifeq ($(TARGET), C910V)
-TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v
+TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
endif
all: getarch_2nd
ifeq ($(CORE), C910V)
-CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v
-FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static
+CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
+FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
endif
#define SEEK_ADDRESS
#if defined(C910V)
-#include <riscv-vector.h>
+#include <riscv_vector.h>
#endif
#endif
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
j += gvl*2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
+ //maxf = v_res[0];
}
for(;j<n;){
gvl = VSETVL(n-j);
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
- if(v_res[0] > maxf)
- maxf = v_res[0];
+ if(*((FLOAT*)&v_res) > maxf)
+ maxf = *((FLOAT*)&v_res);
j += gvl;
}
}else{
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
ix += inc_xv*2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
- if(v_res[0] > maxf)
- maxf = v_res[0];
+ if(*((FLOAT*)&v_res) > maxf)
+ maxf = *((FLOAT*)&v_res);
j += gvl;
}
}
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
j += gvl*2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
- if(v_res[0] < minf)
- minf = v_res[0];
+ if(*((FLOAT*)&v_res) < minf)
+ minf = *((FLOAT*)&v_res);
j += gvl;
}
}else{
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_min = VFMINVV_FLOAT(v_min, v0, gvl);
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v1)
- :"v"(mask1), "f"(zero), "r"(gvl)
+ :"+vd"(v1)
+ :"vd"(mask1), "f"(zero), "r"(gvl)
:"v0");
#endif
idx += inc_xv*2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e64,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#else
asm volatile(
"vor.vv v0, %1, %1\n\t"
"vsetvli x0, %3, e32,m8 \n\t"
"vfrsub.vf %0, %0, %2, v0.t \n\t"
- :"+v"(v0)
- :"v"(mask0), "f"(zero), "r"(gvl)
+ :"+vd"(v0)
+ :"vd"(mask0), "f"(zero), "r"(gvl)
:"v0");
#endif
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
- if(v_res[0] < minf)
- minf = v_res[0];
+ if(*((FLOAT*)&v_res) < minf)
+ minf = *((FLOAT*)&v_res);
j += gvl;
}
}
#define FLOAT_V_T_M1 vfloat32m1_t
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1
+#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define FLOAT_V_T_M1 vfloat64m1_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
- asumf += v_res[0];
+ asumf += *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
- asumf += v_res[0];
+ asumf += *((FLOAT*)&v_res);
j += gvl;
}
}else{
inc_xv += inc_xv * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
- asumf += v_res[0];
+ asumf += *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
- asumf += v_res[0];
+ asumf += *((FLOAT*)&v_res);
j += gvl;
}
}
#define VSEV_FLOAT vse_v_f32m8
#define VSSEV_FLOAT vsse_v_f32m8
#else
-#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL(n) vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
#include "common.h"
-#include <riscv-vector.h>
+#include <riscv_vector.h>
#define KERNEL8x4_I \
"addi t1, %[PB], 1*8 \n\t"\
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
}
//tail
if(j < n){
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
}
}else if(inc_y == 1){
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
- unsigned int stride_x = inc_x * sizeof(FLOAT);
+ int stride_x = inc_x * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
vy = VLEV_FLOAT(&y[j], gvl);
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
+
}
//tail
if(j < n){
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
+
}
}else if(inc_x == 1){
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
- unsigned int stride_y = inc_y * sizeof(FLOAT);
+ int stride_y = inc_y * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = VLEV_FLOAT(&x[j], gvl);
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
+
}
//tail
if(j < n){
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
+
}
}else{
gvl = VSETVL(n);
vr = VFMVVF_FLOAT(0, gvl);
- unsigned int stride_x = inc_x * sizeof(FLOAT);
- unsigned int stride_y = inc_y * sizeof(FLOAT);
+ int stride_x = inc_x * sizeof(FLOAT);
+ int stride_y = inc_y * sizeof(FLOAT);
for(i=0,j=0; i<n/gvl; i++){
vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
}
if(j > 0){
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
+
}
//tail
if(j < n){
//vr = VFDOTVV_FLOAT(vx, vy, gvl);
vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- dot += v_res[0];
+ dot += (double)VFMVFS_FLOAT(v_res);
+
}
}
return(dot);
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
j += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp = v_res[0];
+ temp = (FLOAT)VFMVFS_FLOAT(v_res);
if(j < m){
gvl = VSETVL(m-j);
va = VLEV_FLOAT(&a_ptr[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp += v_res[0];
+ temp += (FLOAT)VFMVFS_FLOAT(v_res);
}
y[iy] += alpha * temp;
iy += inc_y;
}
}else{
BLASLONG stride_x = inc_x * sizeof(FLOAT);
- BLASLONG inc_xv = inc_x * gvl;
+
for(i = 0; i < n; i++){
gvl = VSETVL(m);
+ BLASLONG inc_xv = inc_x * gvl;
j = 0;
ix = 0;
vr = VFMVVF_FLOAT(0, gvl);
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp = v_res[0];
+ temp = (FLOAT)VFMVFS_FLOAT(v_res);
if(j < m){
gvl = VSETVL(m-j);
va = VLEV_FLOAT(&a_ptr[j], gvl);
vr = VFMULVV_FLOAT(va, vx, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp += v_res[0];
+ temp += (FLOAT)VFMVFS_FLOAT(v_res);
}
y[iy] += alpha * temp;
iy += inc_y;
j += gvl;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
if(j < n){
gvl = VSETVL(n-j);
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- FLOAT cur_maxf = v_res[0];
+ FLOAT cur_maxf = *((FLOAT*)&v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
}
}
}else{
idx += inc_v;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
if(j < n){
gvl = VSETVL(n-j);
v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- FLOAT cur_maxf = v_res[0];
+ FLOAT cur_maxf = *((FLOAT*)&v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
}
}
}
j += gvl;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
if(j < n){
gvl = VSETVL(n-j);
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- FLOAT cur_minf = v_res[0];
+ FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
}
}
}else{
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
if(j < n){
gvl = VSETVL(n-j);
v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- FLOAT cur_minf = v_res[0];
+ FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
}
}
}
j += gvl;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
if(j < n){
gvl = VSETVL(n-j);
v_max = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
- FLOAT cur_maxf = v_res[0];
+ FLOAT cur_maxf = *((FLOAT*)&v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
}
}
}else{
idx += inc_v;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
if(j < n){
gvl = VSETVL(n-j);
v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
- FLOAT cur_maxf = v_res[0];
+ FLOAT cur_maxf = *((FLOAT*)&v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask,gvl);
- max_index = v_max_index[max_index];
+ max_index = *((unsigned int*)&v_max_index+max_index);
}
}
}
j += gvl;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
if(j < n){
gvl = VSETVL(n-j);
v_min = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- FLOAT cur_minf = v_res[0];
+ FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
}
}
}else{
idx += inc_v;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
mask = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
if(j < n){
gvl = VSETVL(n-j);
v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- FLOAT cur_minf = v_res[0];
+ FLOAT cur_minf = *((FLOAT*)&v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
v_min_index = VADDVX_UINT(v_min_index, j, gvl);
mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask,gvl);
- min_index = v_min_index[min_index];
+ min_index = *((unsigned int*)&v_min_index+min_index);
}
}
}
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define UINT_V_T vuint64m8_t
+#define VSEVU_UINT vse64_v_u64m8
+#define UINT_T long unsigned int
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define UINT_V_T vuint32m8_t
+#define UINT_T unsigned int
+#define VSEVU_UINT vse32_v_u32m8
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
v_z0 = VFMVVF_FLOAT_M1(0, gvl);
gvl = VSETVL(n);
+ UINT_T temp_uint[gvl];
v_max_index = VMVVX_UINT(0, gvl);
v_max = VFMVVF_FLOAT(-1, gvl);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
}
vx0 = VFMVVF_FLOAT(0, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- maxf = v_res[0];
+ maxf = VFMVFS_FLOAT(v_res);
mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
max_index = VMFIRSTM(mask0,gvl);
- max_index = v_max_index[max_index];
+ VSEVU_UINT(temp_uint,v_max_index,gvl);
+ max_index = temp_uint[max_index];
+
if(j < n){
gvl = VSETVL(n-j);
*/
v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- FLOAT cur_maxf = v_res[0];
+ FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
if(cur_maxf > maxf){
//tail index
v_max_index = VIDV_UINT(gvl);
mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
max_index = VMFIRSTM(mask0,gvl);
- max_index = v_max_index[max_index];
+ VSEVU_UINT(temp_uint,v_max_index,gvl);
+ max_index = temp_uint[max_index];
+
}
}
return(max_index+1);
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
#define VMFIRSTM vmfirst_m_b8
#define UINT_V_T vuint64m8_t
+#define VSEVU_UINT vse64_v_u64m8
+#define UINT_T long unsigned int
#define VIDV_MASK_UINT vid_v_u64m8_m
#define VIDV_UINT vid_v_u64m8
#define VADDVX_MASK_UINT vadd_vx_u64m8_m
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
#define VMFIRSTM vmfirst_m_b4
#define UINT_V_T vuint32m8_t
+#define UINT_T unsigned int
+#define VSEVU_UINT vse32_v_u32m8
#define VIDV_MASK_UINT vid_v_u32m8_m
#define VIDV_UINT vid_v_u32m8
#define VADDVX_MASK_UINT vadd_vx_u32m8_m
v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
gvl = VSETVL(n);
+ UINT_T temp_uint[gvl];
v_min_index = VMVVX_UINT(0, gvl);
v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
ix += inc_xv;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = VFMVFS_FLOAT(v_res);
mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
min_index = VMFIRSTM(mask0,gvl);
- min_index = v_min_index[min_index];
+ VSEVU_UINT(temp_uint,v_min_index,gvl);
+ min_index = temp_uint[min_index];
if(j < n){
gvl = VSETVL(n-j);
*/
v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- FLOAT cur_minf = v_res[0];
+ FLOAT cur_minf = VFMVFS_FLOAT(v_res);
if(cur_minf < minf){
//tail index
v_min_index = VIDV_UINT(gvl);
mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
min_index = VMFIRSTM(mask0,gvl);
- min_index = v_min_index[min_index];
+ VSEVU_UINT(temp_uint,v_min_index,gvl);
+ min_index = temp_uint[min_index];
+
}
}
return(min_index+1);
j += gvl * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
- if(v_res[0] > maxf)
- maxf = v_res[0];
+ if(*((FLOAT*)&v_res) > maxf)
+ maxf = *((FLOAT*)&v_res);
j += gvl;
}
}else{
idx += inc_xv * 2;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
- maxf = v_res[0];
+ maxf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
- if(v_res[0] > maxf)
- maxf = v_res[0];
+ if(*((FLOAT*)&v_res) > maxf)
+ maxf = *((FLOAT*)&v_res);
j += gvl;
}
}
j += gvl * 2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLEV_FLOAT(&x[j], gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
- if(v_res[0] < minf)
- minf = v_res[0];
+ if(*((FLOAT*)&v_res) < minf)
+ minf = *((FLOAT*)&v_res);
j += gvl;
}
}else{
idx += inc_xv * 2;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = *((FLOAT*)&v_res);
}
for(;j<n;){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
- if(v_res[0] < minf)
- minf = v_res[0];
+ if(*((FLOAT*)&v_res) < minf)
+ minf = *((FLOAT*)&v_res);
j += gvl;
}
}
#define VSETVL(n) vsetvl_e32m4(n)
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
+#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VSETVL(n) vsetvl_e64m4(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
+#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//tail
if(j < n){
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
}
}else{
gvl = VSETVL(n);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//tail
if(j < n){
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = vr[0];
+ scale = VFMVFS_FLOATM4(vr);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
}
}
return(scale * sqrt(ssq));
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDSUM_FLOAT vfredsum_vs_f32m8_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m8
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDSUM_FLOAT vfredsum_vs_f64m8_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
j += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- len += v_res[0];
+ len += VFMVFS_FLOAT(v_res);
}
//tail
for(;j < n;){
//vr = VFDOTVV_FLOAT(v0, v0, gvl);
vr = VFMACCVV_FLOAT(v1, v0, v0, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- len += v_res[0];
+ len += VFMVFS_FLOAT(v_res);
j += gvl;
}
j += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- len += v_res[0];
+ len += VFMVFS_FLOAT(v_res);
}
//tail
for(;j < n;){
//vr = VFDOTVV_FLOAT(v0, v0, gvl);
vr = VFMACCVV_FLOAT(v1, v0, v0, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- len += v_res[0];
+ len += VFMVFS_FLOAT(v_res);
j += gvl;
}
}else{
if(da == 0.0){
gvl = VSETVL(n);
+ BLASLONG stride_x = inc_x * sizeof(FLOAT);
+ BLASLONG ix = 0;
if(gvl <= n / 2){
+ long int inc_xv = gvl * inc_x;
v0 = VFMVVF_FLOAT(0, gvl);
for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
- VSEV_FLOAT(&x[j], v0, gvl);
- VSEV_FLOAT(&x[j+gvl], v0, gvl);
+                        VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                        VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl);
+ ix += inc_xv * 2;
}
}
//tail
for(; j <n; ){
gvl = VSETVL(n-j);
v0 = VFMVVF_FLOAT(0, gvl);
- VSEV_FLOAT(&x[j], v0, gvl);
+ VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
j += gvl;
+ ix += inc_x * gvl;
}
}else{
gvl = VSETVL(n);
#include "common.h"
-#include <riscv-vector.h>
+#include <riscv_vector.h>
#define KERNEL16x4_I \
"addi t1, %[PB], 1*4 \n\t"\
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
i += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLEV_FLOAT(&y[i], gvl);
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[j] += alpha * temp2;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[jy] += alpha * temp2;
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLEV_FLOAT(&y[i], gvl);
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[j] += alpha * temp2;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[jy] += alpha * temp2;
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
#define VSEV_FLOAT vse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
#define VSEV_FLOAT vse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
i += gvl;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLEV_FLOAT(&y[i], gvl);
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[j] += temp1 * a_ptr[j] + alpha * temp2;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vx = VLEV_FLOAT(&x[i], gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[jy] += temp1 * a_ptr[j] + alpha * temp2;
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLEV_FLOAT(&y[i], gvl);
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[j] += temp1 * a_ptr[j] + alpha * temp2;
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 = v_res[0];
+ temp2 = VFMVFS_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
vr = VFMULVV_FLOAT(vx, va, gvl);
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp2 += v_res[0];
+ temp2 += VFMVFS_FLOAT(v_res);
}
}
y[jy] += temp1 * a_ptr[j] + alpha * temp2;
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
#define VFMAXVV_FLOAT vfmax_vv_f32m8
#define VFADDVV_FLOAT vfadd_vv_f32m8
+
#else
#define VSETVL(n) vsetvl_e64m8(n)
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
#define VFMAXVV_FLOAT vfmax_vv_f64m8
#define VFADDVV_FLOAT vfadd_vv_f64m8
+
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
ix += inc_xv;
}
v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
- maxf = v_res[0];
+ maxf = VFMVFS_FLOAT(v_res);
if(j<n){
gvl = VSETVL(n-j);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
- if(v_res[0] > maxf)
- maxf = v_res[0];
+
+ if(VFMVFS_FLOAT(v_res)> maxf)
+ maxf = VFMVFS_FLOAT(v_res);
}
return(maxf);
}
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m8
#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m8
#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
#define MASK_T vbool8_t
ix += inc_xv;
}
v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
- minf = v_res[0];
+ minf = VFMVFS_FLOAT(v_res);
if(j<n){
gvl = VSETVL(n-j);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v1 = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
- if(v_res[0] < minf)
- minf = v_res[0];
+ if(VFMVFS_FLOAT(v_res) < minf)
+ minf = VFMVFS_FLOAT(v_res);
}
return(minf);
}
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m8
#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
#define MASK_T vbool4_t
#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
#define VFMVVF_FLOAT vfmv_v_f_f32m8
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m8
#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
#define VFMVVF_FLOAT vfmv_v_f_f64m8
j += gvl * 2;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
- asumf += v_res[0];
+ asumf += VFFMVFS_FLOAT(v_res);
}
for(;j<n2;){
gvl = VSETVL(n2-j);
mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
- asumf += v_res[0];
+ asumf += VFFMVFS_FLOAT(v_res);
j += gvl;
}
}else{
ix += inc_xv;
}
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
- asumf += v_res[0];
+ asumf += VFFMVFS_FLOAT(v_res);
if(j<n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
v_sum = VFADDVV_FLOAT(v0, v1, gvl);
v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
- asumf += v_res[0];
+ asumf += VFFMVFS_FLOAT(v_res);
}
}
return(asumf);
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
iy += inc_yv;
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
- dot[0] += v_res[0];
+ dot[0] += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
- dot[1] += v_res[0];
+ dot[1] += VFMVFS_FLOAT(v_res);
//tail
if(j < n){
gvl = VSETVL(n-j);
vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
- dot[0] += v_res[0];
+ dot[0] += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
- dot[1] += v_res[0];
+ dot[1] += VFMVFS_FLOAT(v_res);
}
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
ix += inc_xv;
}
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp_r = v_res[0];
+ temp_r = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
- temp_i = v_res[0];
+ temp_i = VFMVFS_FLOAT(v_res);
if(j/2 < m){
gvl = VSETVL(m-j/2);
va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
- temp_r += v_res[0];
+ temp_r += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
- temp_i += v_res[0];
+ temp_i += VFMVFS_FLOAT(v_res);
}
#if !defined(XCONJ)
y[iy] += alpha_r * temp_r - alpha_i * temp_i;
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
ia += inc_av;
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
- temp_r2 = v_res[0];
+ temp_r2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
- temp_i2 = v_res[0];
+ temp_i2 = VFMVFS_FLOAT(v_res);
if(i < m){
gvl = VSETVL(m-i);
va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
- temp_r2 += v_res[0];
+ temp_r2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
- temp_i2 += v_res[0];
+ temp_i2 += VFMVFS_FLOAT(v_res);
}
}
y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT vlse_v_f32m4
#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT vlse_v_f64m4
#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
ia += inc_av;
}
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
- temp_r2 = v_res[0];
+ temp_r2 = VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
- temp_i2 = v_res[0];
+ temp_i2 = VFMVFS_FLOAT(v_res);
if(i < j){
gvl = VSETVL(j-i);
va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
#endif
v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
- temp_r2 += v_res[0];
+ temp_r2 += VFMVFS_FLOAT(v_res);
v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
- temp_i2 += v_res[0];
+ temp_i2 += VFMVFS_FLOAT(v_res);
}
}
y[jy] += temp_r1 * a_ptr[ja];
#define VSETVL_MAX vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
#define VLEV_FLOAT vle_v_f32m4
#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
#define VFMVVF_FLOAT vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
#define VSETVL_MAX vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
#define VLEV_FLOAT vle_v_f64m4
#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
#define VFMVVF_FLOAT vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//tail
if(j < n2){
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
}
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
}
}else{
gvl = VSETVL(n);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
//ssq in vector vr
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//tail
if(j < n){
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq before current vector
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
//find max
v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
//update ssq before max_index
- ssq = ssq * (scale/v_res[0])*(scale/v_res[0]);
+ ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
//update scale
- scale = v_res[0];
+ scale = VFMVFS_FLOAT(v_res);
v0 = VFDIVVF_FLOAT(v0, scale, gvl);
vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
}
//ssq in vector vr: vr[0]
v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
//total ssq now
- ssq += v_res[0];
+ ssq += VFMVFS_FLOAT(v_res);
}
}
return(scale * sqrt(ssq));