#include "common.h"
-#include <math.h>
-#define ABS fabs
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
#if defined(SKYLAKEX)
#include "dasum_microk_skylakex-2.c"
#include "dasum_microk_haswell-2.c"
#endif
-#ifndef HAVE_KERNEL_16
-static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
+#ifndef HAVE_DASUM_KERNEL
+static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
- BLASLONG i=0;
- FLOAT *x = x1;
- FLOAT temp0, temp1, temp2, temp3;
- FLOAT temp4, temp5, temp6, temp7;
- FLOAT sum0 = 0.0;
- FLOAT sum1 = 0.0;
- FLOAT sum2 = 0.0;
- FLOAT sum3 = 0.0;
-
- while ( i< n )
- {
-
- temp0 = ABS(x[0]);
- temp1 = ABS(x[1]);
- temp2 = ABS(x[2]);
- temp3 = ABS(x[3]);
- temp4 = ABS(x[4]);
- temp5 = ABS(x[5]);
- temp6 = ABS(x[6]);
- temp7 = ABS(x[7]);
-
- sum0 += temp0;
- sum1 += temp1;
- sum2 += temp2;
- sum3 += temp3;
-
- sum0 += temp4;
- sum1 += temp5;
- sum2 += temp6;
- sum3 += temp7;
-
- x+=8;
- i+=8;
-
- }
-
- return sum0+sum1+sum2+sum3;
+ BLASLONG i=0;
+ BLASLONG n_8 = n & -8;
+ FLOAT *x = x1;
+ FLOAT temp0, temp1, temp2, temp3;
+ FLOAT temp4, temp5, temp6, temp7;
+ FLOAT sum0 = 0.0;
+ FLOAT sum1 = 0.0;
+ FLOAT sum2 = 0.0;
+ FLOAT sum3 = 0.0;
+ FLOAT sum4 = 0.0;
+
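+ /* main loop over n_8 = n & -8 elements, unrolled by 8; four partial sums overlap the FP add latency */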
+ while (i < n_8) {
+ temp0 = ABS_K(x[0]);
+ temp1 = ABS_K(x[1]);
+ temp2 = ABS_K(x[2]);
+ temp3 = ABS_K(x[3]);
+ temp4 = ABS_K(x[4]);
+ temp5 = ABS_K(x[5]);
+ temp6 = ABS_K(x[6]);
+ temp7 = ABS_K(x[7]);
+
+ sum0 += temp0;
+ sum1 += temp1;
+ sum2 += temp2;
+ sum3 += temp3;
+
+ sum0 += temp4;
+ sum1 += temp5;
+ sum2 += temp6;
+ sum3 += temp7;
+
+ x+=8;
+ i+=8;
+ }
+
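+ /* scalar tail for the last n - n_8 elements */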
+ while (i < n) {
+ sum4 += ABS_K(x1[i]);
+ i++;
+ }
+
+ return sum0+sum1+sum2+sum3+sum4;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
- BLASLONG i=0;
- FLOAT sumf = 0.0;
- BLASLONG n1;
-
- if (n <= 0 || inc_x <= 0) return(sumf);
-
- if ( inc_x == 1 )
- {
-
- n1 = n & -16;
- if ( n1 > 0 )
- {
-
- sumf = dasum_kernel_16(n1, x);
- i=n1;
- }
-
- while(i < n)
- {
- sumf += ABS(x[i]);
- i++;
- }
-
- }
- else
- {
-
- n *= inc_x;
- while(i < n)
- {
- sumf += ABS(x[i]);
- i += inc_x;
- }
-
- }
- return(sumf);
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
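+ /* unit stride goes through the (possibly vectorized) kernel; strided data uses the scalar loop below */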
+ if ( inc_x == 1 ) {
+ sumf = dasum_kernel(n, x);
+ }
+ else {
+ n *= inc_x;
+
+ while(i < n) {
+ sumf += ABS_K(x[i]);
+ i += inc_x;
+ }
+ }
+ return(sumf);
}
#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
-#define HAVE_KERNEL_16 1
+#define HAVE_DASUM_KERNEL
#include <immintrin.h>
+#include <stdint.h>
-static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
- __m256d accum_0, accum_1, accum_2, accum_3;
-
- accum_0 = _mm256_setzero_pd();
- accum_1 = _mm256_setzero_pd();
- accum_2 = _mm256_setzero_pd();
- accum_3 = _mm256_setzero_pd();
-
- __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
- for (; i < n; i += 16) {
- accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
- accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask);
- accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
- accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask);
+ FLOAT sumf = 0.0;
+
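+ /* peel off up to 3 leading doubles so that 8-byte aligned input becomes 32-byte aligned for the AVX2 loads below */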
+ if (n >= 256) {
+ BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3;
+
+ for (i = 0; i < align_256; i++) {
+ sumf += ABS_K(x1[i]);
+ }
+
+ n -= align_256;
+ x1 += align_256;
+ }
+
+ BLASLONG tail_index_SSE = n&(~7);
+ BLASLONG tail_index_AVX2 = n&(~255);
+
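+ /* 256-element blocks: four YMM accumulators; |x| is formed by masking off the sign bit */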
+ if (n >= 256) {
+ __m256d accum_0, accum_1, accum_2, accum_3;
+
+ accum_0 = _mm256_setzero_pd();
+ accum_1 = _mm256_setzero_pd();
+ accum_2 = _mm256_setzero_pd();
+ accum_3 = _mm256_setzero_pd();
+
+ __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
+ for (i = 0; i < tail_index_AVX2; i += 16) {
+ accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
+ accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask);
+ accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
+ accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask);
+ }
+
+ accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+
+ __m128d half_accum0;
+ half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
+
+ half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+
+ sumf += half_accum0[0];
}
+
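+ /* leftover groups of 8 doubles handled with unaligned SSE2 loads */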
+ if (n >= 8) {
+ __m128d accum_20, accum_21, accum_22, accum_23;
+ accum_20 = _mm_setzero_pd();
+ accum_21 = _mm_setzero_pd();
+ accum_22 = _mm_setzero_pd();
+ accum_23 = _mm_setzero_pd();
- accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+ __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
+ for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
+ accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
+ accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
+ accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
+ accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
+ }
- __m128d half_accum0;
- half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1));
+ accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
+ __m128d half_accum20;
+ half_accum20 = _mm_hadd_pd(accum_20, accum_20);
- half_accum0 = _mm_hadd_pd(half_accum0, half_accum0);
+ sumf += half_accum20[0];
+ }
+
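+ /* scalar cleanup for the last n & 7 elements */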
+ for (i = tail_index_SSE; i < n; ++i) {
+ sumf += ABS_K(x1[i]);
+ }
- return half_accum0[0];
+ return sumf;
}
#endif
/* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
-#if defined(__AVX512CD__)
-#define HAVE_KERNEL_16 1
+#define HAVE_DASUM_KERNEL 1
#include <immintrin.h>
-static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
+#include <stdint.h>
+
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
+ FLOAT sumf = 0.0;
+
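+ /* peel off up to 7 leading doubles so that 8-byte aligned input becomes 64-byte aligned for the AVX-512 loads below */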
+ if (n >= 256) {
+ BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
- __m512d accum_0, accum_1;
+ for (i = 0; i < align_512; i++) {
+ sumf += ABS_K(x1[i]);
+ }
+
+ n -= align_512;
+ x1 += align_512;
+ }
+
+ BLASLONG tail_index_SSE = n&(~7);
+ BLASLONG tail_index_AVX512 = n&(~255);
- accum_0 = _mm512_setzero_pd();
- accum_1 = _mm512_setzero_pd();
+ /* 256-element blocks: four ZMM accumulators of |x|, reduced with _mm512_reduce_add_pd */
+ if ( n >= 256 ) {
- for (; i < n; i += 16) {
- accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0]));
- accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8]));
+ __m512d accum_0, accum_1, accum_2, accum_3;
+ accum_0 = _mm512_setzero_pd();
+ accum_1 = _mm512_setzero_pd();
+ accum_2 = _mm512_setzero_pd();
+ accum_3 = _mm512_setzero_pd();
+ for (i = 0; i < tail_index_AVX512; i += 32) {
+ accum_0 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 0]));
+ accum_1 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 8]));
+ accum_2 += _mm512_abs_pd(_mm512_load_pd(&x1[i +16]));
+ accum_3 += _mm512_abs_pd(_mm512_load_pd(&x1[i +24]));
+ }
+
+ accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+ sumf += _mm512_reduce_add_pd(accum_0);
}
- accum_0 += accum_1;
- return _mm512_reduce_add_pd(accum_0);
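+ /* remaining groups of 8 doubles via unaligned SSE2 loads */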
+ if (n >= 8) {
+ __m128d accum_20, accum_21, accum_22, accum_23;
+ accum_20 = _mm_setzero_pd();
+ accum_21 = _mm_setzero_pd();
+ accum_22 = _mm_setzero_pd();
+ accum_23 = _mm_setzero_pd();
+
+ __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff);
+ for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
+ accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
+ accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2);
+ accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
+ accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2);
+ }
+
+ accum_20 = accum_20 + accum_21 + accum_22 + accum_23;
+ __m128d half_accum20;
+ half_accum20 = _mm_hadd_pd(accum_20, accum_20);
+
+ sumf += half_accum20[0];
+ }
+
+ for (i = tail_index_SSE; i < n; ++i) {
+ sumf += ABS_K(x1[i]);
+ }
+
+ return sumf;
}
#endif
-#endif
#include "common.h"
-#include <math.h>
#if defined(DOUBLE)
-
#error supports float only
-
#else
-
-#define ABS fabsf
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
#endif
#include "sasum_microk_haswell-2.c"
#endif
-#ifndef HAVE_KERNEL_32
+#ifndef HAVE_SASUM_KERNEL
-static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
+static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
- BLASLONG i=0;
- FLOAT *x = x1;
- FLOAT temp0, temp1, temp2, temp3;
- FLOAT temp4, temp5, temp6, temp7;
- FLOAT sum0 = 0.0;
- FLOAT sum1 = 0.0;
- FLOAT sum2 = 0.0;
- FLOAT sum3 = 0.0;
-
- while ( i< n )
- {
-
- temp0 = ABS(x[0]);
- temp1 = ABS(x[1]);
- temp2 = ABS(x[2]);
- temp3 = ABS(x[3]);
- temp4 = ABS(x[4]);
- temp5 = ABS(x[5]);
- temp6 = ABS(x[6]);
- temp7 = ABS(x[7]);
-
- sum0 += temp0;
- sum1 += temp1;
- sum2 += temp2;
- sum3 += temp3;
-
- sum0 += temp4;
- sum1 += temp5;
- sum2 += temp6;
- sum3 += temp7;
-
- x+=8;
- i+=8;
-
- }
-
- return sum0+sum1+sum2+sum3;
+ BLASLONG i=0;
+ BLASLONG n_8 = n & -8;
+ FLOAT *x = x1;
+ FLOAT temp0, temp1, temp2, temp3;
+ FLOAT temp4, temp5, temp6, temp7;
+ FLOAT sum0 = 0.0;
+ FLOAT sum1 = 0.0;
+ FLOAT sum2 = 0.0;
+ FLOAT sum3 = 0.0;
+ FLOAT sum4 = 0.0;
+
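+ /* same unrolled-by-8 scheme as the double-precision kernel, with a scalar tail below */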
+ while (i < n_8) {
+
+ temp0 = ABS_K(x[0]);
+ temp1 = ABS_K(x[1]);
+ temp2 = ABS_K(x[2]);
+ temp3 = ABS_K(x[3]);
+ temp4 = ABS_K(x[4]);
+ temp5 = ABS_K(x[5]);
+ temp6 = ABS_K(x[6]);
+ temp7 = ABS_K(x[7]);
+
+ sum0 += temp0;
+ sum1 += temp1;
+ sum2 += temp2;
+ sum3 += temp3;
+
+ sum0 += temp4;
+ sum1 += temp5;
+ sum2 += temp6;
+ sum3 += temp7;
+
+ x+=8;
+ i+=8;
+
+ }
+
+ while (i < n) {
+ sum4 += ABS_K(x1[i]);
+ i++;
+ }
+
+ return sum0+sum1+sum2+sum3+sum4;
}
#endif
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
- BLASLONG i=0;
- FLOAT sumf = 0.0;
- BLASLONG n1;
-
- if (n <= 0 || inc_x <= 0) return(sumf);
-
- if ( inc_x == 1 )
- {
-
- n1 = n & -32;
- if ( n1 > 0 )
- {
-
- sumf = sasum_kernel_32(n1, x);
- i=n1;
- }
-
- while(i < n)
- {
- sumf += ABS(x[i]);
- i++;
- }
-
- }
- else
- {
-
- n *= inc_x;
- while(i < n)
- {
- sumf += ABS(x[i]);
- i += inc_x;
- }
-
- }
- return(sumf);
+ BLASLONG i=0;
+ FLOAT sumf = 0.0;
+
+ if (n <= 0 || inc_x <= 0) return(sumf);
+
+ if ( inc_x == 1 ) {
+ sumf = sasum_kernel(n, x);
+ }
+ else {
+
+ n *= inc_x;
+ while(i < n) {
+ sumf += ABS_K(x[i]);
+ i += inc_x;
+ }
+
+ }
+ return(sumf);
}
#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
-#define HAVE_KERNEL_32 1
+#define HAVE_SASUM_KERNEL 1
#include <immintrin.h>
+#include <stdint.h>
-static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
+
+static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
- __m256 accum_0, accum_1, accum_2, accum_3;
-
- accum_0 = _mm256_setzero_ps();
- accum_1 = _mm256_setzero_ps();
- accum_2 = _mm256_setzero_ps();
- accum_3 = _mm256_setzero_ps();
-
- __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
- for (; i < n; i += 32) {
- accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask);
- accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask);
- accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask);
- accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask);
+ FLOAT sumf = 0.0;
+
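+ /* peel off up to 7 leading floats so that 4-byte aligned input becomes 32-byte aligned for the AVX2 loads below */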
+ if (n >= 256) {
+ BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7;
+
+ for (i = 0; i < align_256; i++) {
+ sumf += ABS_K(x1[i]);
+ }
+
+ n -= align_256;
+ x1 += align_256;
}
- accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+ BLASLONG tail_index_SSE = n&(~7);
+ BLASLONG tail_index_AVX2 = n&(~255);
+
+ if (n >= 256) {
+ __m256 accum_0, accum_1, accum_2, accum_3;
+
+ accum_0 = _mm256_setzero_ps();
+ accum_1 = _mm256_setzero_ps();
+ accum_2 = _mm256_setzero_ps();
+ accum_3 = _mm256_setzero_ps();
- __m128 half_accum0;
- half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));
+ __m256i abs_mask = _mm256_set1_epi32(0x7fffffff);
+ for (i = 0; i < tail_index_AVX2; i += 32) {
+ accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask);
+ accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask);
+ accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask);
+ accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask);
+ }
- half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
- half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+ accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+ __m128 half_accum0;
+ half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1));
- return half_accum0[0];
+ half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+ half_accum0 = _mm_hadd_ps(half_accum0, half_accum0);
+
+ sumf += half_accum0[0];
+
+ }
+
+ if (n >= 8) {
+ __m128 accum_20, accum_21;
+ accum_20 = _mm_setzero_ps();
+ accum_21 = _mm_setzero_ps();
+
+ __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
+ for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) {
+ accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
+ accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
+ }
+
+ accum_20 += accum_21;
+ accum_20 = _mm_hadd_ps(accum_20, accum_20);
+ accum_20 = _mm_hadd_ps(accum_20, accum_20);
+
+ sumf += accum_20[0];
+ }
+
+ for (i = tail_index_SSE; i < n; ++i) {
+ sumf += ABS_K(x1[i]);
+ }
+ return sumf;
}
#endif
/* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
-#if defined(__AVX512CD__)
-#define HAVE_KERNEL_32 1
+#define HAVE_SASUM_KERNEL 1
+
+#ifndef ABS_K
+#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
+#endif
#include <immintrin.h>
+#include <stdint.h>
-static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
+static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
{
BLASLONG i = 0;
+ FLOAT sumf = 0.0;
+
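+ /* peel off up to 15 leading floats so that 4-byte aligned input becomes 64-byte aligned for the AVX-512 loads below */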
+ if (n >= 256) {
+ BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;
+
+ for (i = 0; i < align_512; i++) {
+ sumf += ABS_K(x1[i]);
+ }
+ n -= align_512;
+ x1 += align_512;
+ }
+
+ BLASLONG tail_index_SSE = n&(~7);
+ BLASLONG tail_index_AVX512 = n&(~255);
- __m512 accum_0, accum_1;
+ if (n >= 256) {
+ __m512 accum_0, accum_1, accum_2, accum_3;
+ accum_0 = _mm512_setzero_ps();
+ accum_1 = _mm512_setzero_ps();
+ accum_2 = _mm512_setzero_ps();
+ accum_3 = _mm512_setzero_ps();
- accum_0 = _mm512_setzero_ps();
- accum_1 = _mm512_setzero_ps();
+ for (i = 0; i < tail_index_AVX512; i += 64) {
+ accum_0 += _mm512_abs_ps(_mm512_load_ps(&x1[i + 0]));
+ accum_1 += _mm512_abs_ps(_mm512_load_ps(&x1[i +16]));
+ accum_2 += _mm512_abs_ps(_mm512_load_ps(&x1[i +32]));
+ accum_3 += _mm512_abs_ps(_mm512_load_ps(&x1[i +48]));
+ }
- for (; i < n; i += 32) {
- accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0]));
- accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16]));
+ accum_0 = accum_0 + accum_1 + accum_2 + accum_3;
+ sumf += _mm512_reduce_add_ps(accum_0);
}
- accum_0 += accum_1;
- return _mm512_reduce_add_ps(accum_0);
+ if (n >= 8) {
+ __m128 accum_20, accum_21;
+ accum_20 = _mm_setzero_ps();
+ accum_21 = _mm_setzero_ps();
+
+ __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff);
+ for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) {
+ accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2);
+ accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2);
+ }
+
+ accum_20 += accum_21;
+ accum_20 = _mm_hadd_ps(accum_20, accum_20);
+ accum_20 = _mm_hadd_ps(accum_20, accum_20);
+
+ sumf += accum_20[0];
+ }
+
+ for (i = tail_index_SSE; i < n; i++) {
+ sumf += ABS_K(x1[i]);
+ }
+
+ return sumf;
}
#endif
-#endif