+2007-05-22 H.J. Lu <hongjiu.lu@intel.com>
+
+ * gcc.dg/i386-cpuid.h (bit_SSE4_1): New.
+ (bit_SSE4_2): Likewise.
+ (bit_POPCNT): Likewise.
+
+ * gcc.target/i386/i386.exp (check_effective_target_sse4): New.
+ Check if assembler supports SSE4 instructions.
+
+ * gcc.target/i386/sse4_1-blendpd.c: New file.
+ * gcc.target/i386/sse4_1-blendps.c: Likewise.
+ * gcc.target/i386/sse4_1-blendvpd.c: Likewise.
+ * gcc.target/i386/sse4_1-blendvps.c: Likewise.
+ * gcc.target/i386/sse4_1-check.h: Likewise.
+ * gcc.target/i386/sse4_1-dppd-1.c: Likewise.
+ * gcc.target/i386/sse4_1-dppd-2.c: Likewise.
+ * gcc.target/i386/sse4_1-dpps-1.c: Likewise.
+ * gcc.target/i386/sse4_1-dpps-2.c: Likewise.
+ * gcc.target/i386/sse4_1-extractps.c: Likewise.
+ * gcc.target/i386/sse4_1-insertps-1.c: Likewise.
+ * gcc.target/i386/sse4_1-insertps-2.c: Likewise.
+ * gcc.target/i386/sse4_1-movntdqa.c: Likewise.
+ * gcc.target/i386/sse4_1-mpsadbw.c: Likewise.
+ * gcc.target/i386/sse4_1-packusdw.c: Likewise.
+ * gcc.target/i386/sse4_1-pblendvb.c: Likewise.
+ * gcc.target/i386/sse4_1-pblendw.c: Likewise.
+ * gcc.target/i386/sse4_1-pcmpeqq.c: Likewise.
+ * gcc.target/i386/sse4_1-pextrb.c: Likewise.
+ * gcc.target/i386/sse4_1-pextrd.c: Likewise.
+ * gcc.target/i386/sse4_1-pextrq.c: Likewise.
+ * gcc.target/i386/sse4_1-pextrw.c: Likewise.
+ * gcc.target/i386/sse4_1-phminposuw.c: Likewise.
+ * gcc.target/i386/sse4_1-pinsrb.c: Likewise.
+ * gcc.target/i386/sse4_1-pinsrd.c: Likewise.
+ * gcc.target/i386/sse4_1-pinsrq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmaxsb.c: Likewise.
+ * gcc.target/i386/sse4_1-pmaxsd.c: Likewise.
+ * gcc.target/i386/sse4_1-pmaxud.c: Likewise.
+ * gcc.target/i386/sse4_1-pmaxuw.c: Likewise.
+ * gcc.target/i386/sse4_1-pminsb.c: Likewise.
+ * gcc.target/i386/sse4_1-pminsd.c: Likewise.
+ * gcc.target/i386/sse4_1-pminud.c: Likewise.
+ * gcc.target/i386/sse4_1-pminuw.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovsxbd.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovsxbq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovsxbw.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovsxdq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovsxwd.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovsxwq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovzxbd.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovzxbq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovzxbw.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovzxdq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovzxwd.c: Likewise.
+ * gcc.target/i386/sse4_1-pmovzxwq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmuldq.c: Likewise.
+ * gcc.target/i386/sse4_1-pmulld.c: Likewise.
+ * gcc.target/i386/sse4_1-ptest-1.c: Likewise.
+ * gcc.target/i386/sse4_1-ptest-2.c: Likewise.
+ * gcc.target/i386/sse4_1-ptest-3.c: Likewise.
+ * gcc.target/i386/sse4_1-round.h: Likewise.
+ * gcc.target/i386/sse4_1-roundpd-1.c: Likewise.
+ * gcc.target/i386/sse4_1-roundpd-2.c: Likewise.
+ * gcc.target/i386/sse4_1-roundpd-3.c: Likewise.
+ * gcc.target/i386/sse4_1-roundps-1.c: Likewise.
+ * gcc.target/i386/sse4_1-roundps-2.c: Likewise.
+ * gcc.target/i386/sse4_1-roundps-3.c: Likewise.
+ * gcc.target/i386/sse4_1-roundsd-1.c: Likewise.
+ * gcc.target/i386/sse4_1-roundsd-2.c: Likewise.
+ * gcc.target/i386/sse4_1-roundsd-3.c: Likewise.
+ * gcc.target/i386/sse4_1-roundsd-4.c: Likewise.
+ * gcc.target/i386/sse4_1-roundss-1.c: Likewise.
+ * gcc.target/i386/sse4_1-roundss-2.c: Likewise.
+ * gcc.target/i386/sse4_1-roundss-3.c: Likewise.
+ * gcc.target/i386/sse4_1-roundss-4.c: Likewise.
+
2007-05-22 Francois-Xavier Coudert <fxcoudert@gcc.gnu.org>
PR fortran/31627
/* %ecx */
#define bit_SSE3 (1 << 0)
#define bit_SSSE3 (1 << 9)
+#define bit_SSE4_1 (1 << 19)
+#define bit_SSE4_2 (1 << 20)
+#define bit_POPCNT (1 << 23)
/* %edx */
#define bit_CMOV (1 << 15)
} "-O2 -mssse3" ]
}
+# Return 1 if sse4 instructions can be compiled.
+#
+# The probe is a small C translation unit that wraps the
+# __builtin_ia32_pmulld128 builtin.  It is handed to
+# check_no_compiler_messages together with the "object" kind and the
+# options "-O2 -msse4.1"; whatever that helper returns is returned here.
+# NOTE(review): presumably requesting an object file is what makes this
+# an assembler check as well as a compiler check -- confirm against the
+# definition of check_no_compiler_messages.
+proc check_effective_target_sse4 { } {
+ return [check_no_compiler_messages sse4.1 object {
+ typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+ typedef int __v4si __attribute__ ((__vector_size__ (16)));
+
+ __m128i _mm_mullo_epi32 (__m128i __X, __m128i __Y)
+ {
+ return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X,
+ (__v4si)__Y);
+ }
+ } "-O2 -msse4.1" ]
+}
+
# Return 1 if sse4a instructions can be compiled.
proc check_effective_target_sse4a { } {
return [check_no_compiler_messages sse4a object {
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#ifndef MASK
+#define MASK 0x03
+#endif
+
+/* Fill SRC1 and SRC2 (NUM * 2 doubles each) with deterministic values
+   whose sign alternates from one element to the next.  */
+static void
+init_blendpd (double *src1, double *src2)
+{
+  int idx;
+  int sign = 1;
+
+  for (idx = 0; idx < NUM * 2; idx++, sign = -sign)
+    {
+      src1[idx] = idx * idx * sign;
+      src2[idx] = (idx + 20) * sign;
+    }
+}
+
+/* Build the blend of SRC1 and SRC2 expected under the compile-time MASK
+   and compare it bytewise with *DST.  Returns the memcmp result, i.e.
+   0 when the two agree.  */
+static int
+check_blendpd (__m128d *dst, double *src1, double *src2)
+{
+  double expected[2];
+  int lane = 0;
+
+  /* Start from SRC1, then overwrite each lane whose MASK bit is set.  */
+  memcpy (expected, src1, sizeof (expected));
+
+  while (lane < 2)
+    {
+      if ((MASK & (1 << lane)) != 0)
+        expected[lane] = src2[lane];
+      lane++;
+    }
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Exercise _mm_blend_pd with the immediate MASK, first over arrays of
+   vectors and then on two local vectors, aborting on any mismatch
+   reported by check_blendpd.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x[NUM];
+      double d[NUM * 2];
+    } dst, src1, src2;
+  union
+    {
+      __m128d x;
+      double d[2];
+    } zero;
+  __m128d fwd, rev;
+  int n;
+
+  init_blendpd (src1.d, src2.d);
+
+  /* Check blendpd imm8, m128, xmm */
+  for (n = 0; n < NUM; n++)
+    {
+      dst.x[n] = _mm_blend_pd (src1.x[n], src2.x[n], MASK);
+      if (check_blendpd (&dst.x[n], &src1.d[n * 2], &src2.d[n * 2]) != 0)
+        abort ();
+    }
+
+  /* Check blendpd imm8, xmm, xmm */
+  zero.x = _mm_setzero_pd ();
+
+  fwd = _mm_blend_pd (dst.x[2], zero.x, MASK);
+  rev = _mm_blend_pd (zero.x, dst.x[2], MASK);
+
+  /* Blend a computed vector with zero in both operand orders.  */
+  if (check_blendpd (&fwd, &dst.d[4], &zero.d[0]) != 0
+      || check_blendpd (&rev, &zero.d[0], &dst.d[4]) != 0)
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#ifndef MASK
+#define MASK 0x0f
+#endif
+
+/* Fill SRC1 and SRC2 (NUM * 4 floats each) with deterministic values
+   whose sign alternates from one element to the next.  */
+static void
+init_blendps (float *src1, float *src2)
+{
+  int idx;
+  int sign = 1;
+
+  for (idx = 0; idx < NUM * 4; idx++, sign = -sign)
+    {
+      src1[idx] = idx * idx * sign;
+      src2[idx] = (idx + 20) * sign;
+    }
+}
+
+/* Build the blend of SRC1 and SRC2 expected under the compile-time MASK
+   and compare it bytewise with *DST.  Returns the memcmp result, i.e.
+   0 when the two agree.  */
+static int
+check_blendps (__m128 *dst, float *src1, float *src2)
+{
+  float expected[4];
+  int lane = 0;
+
+  /* Start from SRC1, then overwrite each lane whose MASK bit is set.  */
+  memcpy (expected, src1, sizeof (expected));
+
+  while (lane < 4)
+    {
+      if ((MASK & (1 << lane)) != 0)
+        expected[lane] = src2[lane];
+      lane++;
+    }
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Exercise _mm_blend_ps with the immediate MASK, first over arrays of
+   vectors and then on two local vectors, aborting on any mismatch
+   reported by check_blendps.  */
+static void
+sse4_1_test (void)
+{
+  __m128 x, y;
+  union
+    {
+      __m128 x[NUM];
+      float f[NUM * 4];
+    } dst, src1, src2;
+  union
+    {
+      __m128 x;
+      float f[4];
+    } src3;
+  int i;
+
+  init_blendps (src1.f, src2.f);
+
+  /* Check blendps imm8, m128, xmm */
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blend_ps (src1.x[i], src2.x[i], MASK);
+      if (check_blendps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4]))
+	abort ();
+    }
+
+  /* Check blendps imm8, xmm, xmm */
+  /* src3 must hold a defined value before it is blended and compared;
+     use an all-zero vector, as the blendpd and pblendw tests do.  */
+  src3.x = _mm_setzero_ps ();
+
+  x = _mm_blend_ps (dst.x[2], src3.x, MASK);
+  y = _mm_blend_ps (src3.x, dst.x[2], MASK);
+
+  if (check_blendps (&x, &dst.f[8], &src3.f[0]))
+    abort ();
+
+  if (check_blendps (&y, &src3.f[0], &dst.f[8]))
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+/* Fill SRC1, SRC2 and MASK (NUM * 2 doubles each).  The data values
+   alternate in sign; a mask element is negated when the bit for its
+   lane is set in the number of the vector it belongs to.  */
+static void
+init_blendvpd (double *src1, double *src2, double *mask)
+{
+  int idx;
+  int sign = 1;
+
+  for (idx = 0; idx < NUM * 2; idx++, sign = -sign)
+    {
+      /* Vector number and lane inside that vector.  */
+      const int vec = idx / 2;
+      const int lane = idx % 2;
+
+      src1[idx] = idx * (idx + 1) * sign;
+      src2[idx] = (idx + 20) * sign;
+      mask[idx] = (idx + 120) * idx;
+      if ((vec & (1 << lane)) != 0)
+        mask[idx] = -mask[idx];
+    }
+}
+
+/* Build the blend of SRC1 and SRC2 expected for the given MASK lanes
+   (a lane takes SRC2 when its mask value is below zero) and compare it
+   bytewise with *DST.  Returns the memcmp result, 0 on a match.  */
+static int
+check_blendvpd (__m128d *dst, double *src1, double *src2,
+                double *mask)
+{
+  double expected[2];
+  int lane = 0;
+
+  memcpy (expected, src1, sizeof (expected));
+
+  while (lane < 2)
+    {
+      if (mask[lane] < 0.0)
+        expected[lane] = src2[lane];
+      lane++;
+    }
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Run _mm_blendv_pd over NUM vector triples and abort as soon as
+   check_blendvpd reports a difference.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x[NUM];
+      double d[NUM * 2];
+    } dst, src1, src2, mask;
+  int n;
+
+  init_blendvpd (src1.d, src2.d, mask.d);
+
+  for (n = 0; n < NUM; n++)
+    {
+      const int first = n * 2;
+
+      dst.x[n] = _mm_blendv_pd (src1.x[n], src2.x[n], mask.x[n]);
+      if (check_blendvpd (&dst.x[n], &src1.d[first], &src2.d[first],
+                          &mask.d[first]) != 0)
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+/* Fill SRC1, SRC2 and MASK (NUM * 4 floats each).  The data values
+   alternate in sign; a mask element is negated when the bit for its
+   lane is set in the number of the vector it belongs to.  */
+static void
+init_blendvps (float *src1, float *src2, float *mask)
+{
+  int idx;
+  int sign = 1;
+
+  for (idx = 0; idx < NUM * 4; idx++, sign = -sign)
+    {
+      /* Vector number and lane inside that vector.  */
+      const int vec = idx / 4;
+      const int lane = idx % 4;
+
+      src1[idx] = idx * (idx + 1) * sign;
+      src2[idx] = (idx + 20) * sign;
+      mask[idx] = (idx + 120) * idx;
+      if ((vec & (1 << lane)) != 0)
+        mask[idx] = -mask[idx];
+    }
+}
+
+/* Build the blend of SRC1 and SRC2 expected for the given MASK lanes
+   (a lane takes SRC2 when its mask value is below zero) and compare it
+   bytewise with *DST.  Returns the memcmp result, 0 on a match.  */
+static int
+check_blendvps (__m128 *dst, float *src1, float *src2,
+                float *mask)
+{
+  float expected[4];
+  int lane = 0;
+
+  memcpy (expected, src1, sizeof (expected));
+
+  while (lane < 4)
+    {
+      if (mask[lane] < 0.0)
+        expected[lane] = src2[lane];
+      lane++;
+    }
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Run _mm_blendv_ps over NUM vector triples and abort as soon as
+   check_blendvps reports a difference.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x[NUM];
+      float f[NUM * 4];
+    } dst, src1, src2, mask;
+  int n;
+
+  init_blendvps (src1.f, src2.f, mask.f);
+
+  for (n = 0; n < NUM; n++)
+    {
+      const int first = n * 4;
+
+      dst.x[n] = _mm_blendv_ps (src1.x[n], src2.x[n], mask.x[n]);
+      if (check_blendvps (&dst.x[n], &src1.f[first], &src2.f[first],
+                          &mask.f[first]) != 0)
+        abort ();
+    }
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../auto-host.h"
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+static void sse4_1_test (void);
+
+/* Test driver: query the CPU feature word through i386_cpuid_ecx and
+   call sse4_1_test only when the bit_SSE4_1 flag is set.  The program
+   always finishes through exit (0).  */
+int
+main ()
+{
+  const unsigned long cpu_facilities = i386_cpuid_ecx ();
+
+  /* Run SSE4.1 test only if host has SSE4.1 support.  */
+  if ((cpu_facilities & bit_SSE4_1) != 0)
+    sse4_1_test ();
+
+  exit (0);
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define lmskN 0x00
+#define lmsk0 0x01
+#define lmsk1 0x02
+#define lmsk01 0x03
+
+#define hmskA 0x30
+#define hmsk0 0x10
+#define hmsk1 0x20
+#define hmsk01 0x30
+#define hmskN 0x00
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+/* Exercise _mm_dp_pd with the fixed HIMASK and every low (destination)
+   mask, then verify that each lane selected by the low mask holds the
+   dot product of the elements selected by HIMASK.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x;
+      double d[2];
+    } val1, val2, res[4];
+  int masks[4];
+  int n, lane;
+  double dot = 0.;
+
+  val1.d[0] = 2.;
+  val1.d[1] = 3.;
+
+  val2.d[0] = 10.;
+  val2.d[1] = 100.;
+
+  res[0].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmskN);
+  res[1].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk0);
+  res[2].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk1);
+  res[3].x = _mm_dp_pd (val1.x, val2.x, HIMASK | lmsk01);
+
+  masks[0] = HIMASK | lmskN;
+  masks[1] = HIMASK | lmsk0;
+  masks[2] = HIMASK | lmsk1;
+  masks[3] = HIMASK | lmsk01;
+
+  /* The reference value depends only on HIMASK, so compute it once.  */
+  for (lane = 0; lane < 2; lane++)
+    if ((HIMASK & (0x10 << lane)) != 0)
+      dot = dot + (val1.d[lane] * val2.d[lane]);
+
+  for (n = 0; n < 4; n++)
+    for (lane = 0; lane < 2; lane++)
+      {
+        if ((masks[n] & (1 << lane)) == 0)
+          continue;
+        if (res[n].d[lane] != dot)
+          abort ();
+      }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define lmskN 0x00
+#define lmsk0 0x01
+#define lmsk1 0x02
+#define lmsk01 0x03
+
+#define hmskA 0x30
+#define hmsk0 0x10
+#define hmsk1 0x20
+#define hmsk01 0x30
+#define hmskN 0x00
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+#ifndef LOMASK
+#define LOMASK lmsk01
+#endif
+
+/* Exercise _mm_dp_pd with the immediate HIMASK | LOMASK on four
+   identical input pairs and compare every result bytewise with a
+   reference vector.
+
+   The reference lane holds the dot product of the elements selected by
+   HIMASK when its LOMASK bit is set, and +0.0 otherwise.  Writing the
+   zero explicitly keeps the full-vector memcmp well defined when LOMASK
+   is overridden with a partial mask.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128d x;
+      double d[2];
+    } val1[4], val2[4], res[4], chk[4];
+  int i, j;
+  double tmp;
+
+  for (i = 0; i < 4; i++)
+    {
+      val1[i].d [0] = 2.;
+      val1[i].d [1] = 3.;
+
+      val2[i].d [0] = 10.;
+      val2[i].d [1] = 100.;
+
+      tmp = 0.;
+      for (j = 0; j < 2; j++)
+	if ((HIMASK & (0x10 << j)))
+	  tmp += val1[i].d [j] * val2[i].d [j];
+
+      /* Every lane gets a defined value: the sum or zero.  */
+      for (j = 0; j < 2; j++)
+	chk[i].d[j] = (LOMASK & (1 << j)) ? tmp : 0.;
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      res[i].x = _mm_dp_pd (val1[i].x, val2[i].x, HIMASK | LOMASK);
+      if (memcmp (&res[i], &chk[i], sizeof (chk[i])))
+	abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define lmskN 0x00
+#define lmsk0 0x01
+#define lmsk1 0x02
+#define lmsk2 0x04
+#define lmsk3 0x08
+#define lmsk01 0x03
+#define lmsk02 0x05
+#define lmsk03 0x09
+#define lmsk12 0x06
+#define lmsk13 0x0A
+#define lmsk23 0x0C
+#define lmskA 0x0F
+
+#define hmskN 0x00
+#define hmskA 0xF0
+#define hmsk0 0x10
+#define hmsk1 0x20
+#define hmsk2 0x40
+#define hmsk3 0x80
+#define hmsk01 0x30
+#define hmsk02 0x50
+#define hmsk03 0x90
+#define hmsk12 0x60
+#define hmsk13 0xA0
+#define hmsk23 0xC0
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+/* Exercise _mm_dp_ps with the fixed HIMASK and sixteen different low
+   (destination) masks, then verify that each lane selected by the low
+   mask holds the dot product of the elements selected by HIMASK.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } val1, val2, res[16];
+  int masks[16];
+  int n, lane;
+  float dot = 0.;
+
+  val1.f[0] = 2.;
+  val1.f[1] = 3.;
+  val1.f[2] = 4.;
+  val1.f[3] = 5.;
+
+  val2.f[0] = 10.;
+  val2.f[1] = 100.;
+  val2.f[2] = 1000.;
+  val2.f[3] = 10000.;
+
+  res[0].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk0);
+  res[1].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk1);
+  res[2].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk2);
+  res[3].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk3);
+  res[4].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk01);
+  res[5].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk02);
+  res[6].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk03);
+  res[7].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk12);
+  res[8].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk13);
+  res[9].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk23);
+  res[10].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk0));
+  res[11].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk1));
+  res[12].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk2));
+  res[13].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk3));
+  res[14].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskN);
+  res[15].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskA);
+
+  /* Same immediates as above, kept in the same order.  */
+  masks[0] = HIMASK | lmsk0;
+  masks[1] = HIMASK | lmsk1;
+  masks[2] = HIMASK | lmsk2;
+  masks[3] = HIMASK | lmsk3;
+  masks[4] = HIMASK | lmsk01;
+  masks[5] = HIMASK | lmsk02;
+  masks[6] = HIMASK | lmsk03;
+  masks[7] = HIMASK | lmsk12;
+  masks[8] = HIMASK | lmsk13;
+  masks[9] = HIMASK | lmsk23;
+  masks[10] = HIMASK | (0x0F & ~lmsk0);
+  masks[11] = HIMASK | (0x0F & ~lmsk1);
+  masks[12] = HIMASK | (0x0F & ~lmsk2);
+  masks[13] = HIMASK | (0x0F & ~lmsk3);
+  masks[14] = HIMASK | lmskN;
+  masks[15] = HIMASK | lmskA;
+
+  /* The reference value depends only on HIMASK, so compute it once.  */
+  for (lane = 0; lane < 4; lane++)
+    if ((HIMASK & (0x10 << lane)) != 0)
+      dot += val1.f[lane] * val2.f[lane];
+
+  for (n = 0; n < 16; n++)
+    for (lane = 0; lane < 4; lane++)
+      {
+        if ((masks[n] & (1 << lane)) == 0)
+          continue;
+        if (res[n].f[lane] != dot)
+          abort ();
+      }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define lmskN 0x00
+#define lmsk0 0x01
+#define lmsk1 0x02
+#define lmsk2 0x04
+#define lmsk3 0x08
+#define lmsk01 0x03
+#define lmsk02 0x05
+#define lmsk03 0x09
+#define lmsk12 0x06
+#define lmsk13 0x0A
+#define lmsk23 0x0C
+#define lmskA 0x0F
+
+#define hmskN 0x00
+#define hmskA 0xF0
+#define hmsk0 0x10
+#define hmsk1 0x20
+#define hmsk2 0x40
+#define hmsk3 0x80
+#define hmsk01 0x30
+#define hmsk02 0x50
+#define hmsk03 0x90
+#define hmsk12 0x60
+#define hmsk13 0xA0
+#define hmsk23 0xC0
+
+#ifndef HIMASK
+#define HIMASK hmskA
+#endif
+
+#ifndef LOMASK
+#define LOMASK lmskA
+#endif
+
+/* Exercise _mm_dp_ps with the immediate HIMASK | LOMASK on sixteen
+   identical input pairs and compare every result bytewise with a
+   reference vector.
+
+   The reference lane holds the dot product of the elements selected by
+   HIMASK when its LOMASK bit is set, and +0.0 otherwise.  Writing the
+   zero explicitly keeps the full-vector memcmp well defined when LOMASK
+   is overridden with a partial mask.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } val1[16], val2[16], res[16], chk[16];
+  int i,j;
+  float tmp;
+
+  for (i = 0; i < 16; i++)
+    {
+      val1[i].f[0] = 2.;
+      val1[i].f[1] = 3.;
+      val1[i].f[2] = 4.;
+      val1[i].f[3] = 5.;
+
+      val2[i].f[0] = 10.;
+      val2[i].f[1] = 100.;
+      val2[i].f[2] = 1000.;
+      val2[i].f[3] = 10000.;
+
+      tmp = 0.;
+      for (j = 0; j < 4; j++)
+	if ((HIMASK & (0x10 << j)))
+	  tmp += val1[i].f [j] * val2[i].f [j];
+
+      /* Every lane gets a defined value: the sum or zero.  */
+      for (j = 0; j < 4; j++)
+	chk[i].f[j] = (LOMASK & (1 << j)) ? tmp : 0.f;
+    }
+
+  for (i = 0; i < 16; i++)
+    {
+      res[i].x = _mm_dp_ps (val1[i].x, val2[i].x, HIMASK | LOMASK);
+      if (memcmp (&res[i], &chk[i], sizeof (chk[i])))
+	abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+int masks[4];
+
+#define msk0 0x00
+#define msk1 0x01
+#define msk2 0x02
+#define msk3 0x03
+
+/* Pull each of the four floats out of a vector, once through
+   _mm_extract_ps (result reinterpreted through a union) and once
+   through the _MM_EXTRACT_FLOAT macro, and compare with the source
+   element picked by the same index.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } val1, val2;
+  union
+    {
+      int i;
+      float f;
+    } res[4];
+  float resm[4];
+  int n;
+
+  val1.f[0] = 10.;
+  val1.f[1] = 2.;
+  val1.f[2] = 3.;
+  val1.f[3] = 40.;
+
+  val2.f[0] = 77.;
+  val2.f[1] = 21.;
+  val2.f[2] = 34.;
+  val2.f[3] = 49.;
+
+  res[0].i = _mm_extract_ps (val1.x, msk0);
+  res[1].i = _mm_extract_ps (val1.x, msk1);
+  res[2].i = _mm_extract_ps (val1.x, msk2);
+  res[3].i = _mm_extract_ps (val1.x, msk3);
+
+  _MM_EXTRACT_FLOAT (resm[0], val2.x, msk0);
+  _MM_EXTRACT_FLOAT (resm[1], val2.x, msk1);
+  _MM_EXTRACT_FLOAT (resm[2], val2.x, msk2);
+  _MM_EXTRACT_FLOAT (resm[3], val2.x, msk3);
+
+  masks[0] = msk0;
+  masks[1] = msk1;
+  masks[2] = msk2;
+  masks[3] = msk3;
+
+  for (n = 0; n < 4; n++)
+    {
+      const int lane = masks[n];
+
+      if (res[n].f != val1.f[lane])
+        abort ();
+      if (resm[n] != val2.f[lane])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0x01
+#define msk1 0x10
+#define msk2 0x29
+#define msk3 0x30
+
+#define msk4 0xFC
+#define msk5 0x05
+#define msk6 0x0A
+#define msk7 0x0F
+
+/* Exercise _mm_insert_ps with several immediates and compare each
+   result bytewise with a scalar model: copy the destination vector,
+   move the source element chosen by immediate bits 7:6 into the lane
+   chosen by bits 5:4, then clear every lane whose bit is set in the low
+   four bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } res[8], val1, val2, want;
+  int masks[8];
+  int n, lane;
+
+  for (lane = 0; lane < 4; lane++)
+    val2.f[lane] = 55.0;
+
+  val1.f[0] = 1.;
+  val1.f[1] = 2.;
+  val1.f[2] = 3.;
+  val1.f[3] = 4.;
+
+  res[0].x = _mm_insert_ps (val2.x, val1.x, msk0);
+  res[1].x = _mm_insert_ps (val2.x, val1.x, msk1);
+  res[2].x = _mm_insert_ps (val2.x, val1.x, msk2);
+  res[3].x = _mm_insert_ps (val2.x, val1.x, msk3);
+
+  masks[0] = msk0;
+  masks[1] = msk1;
+  masks[2] = msk2;
+  masks[3] = msk3;
+
+  /* The upper four results all use the msk4 immediate.  */
+  for (n = 4; n < 8; n++)
+    {
+      res[n].x = _mm_insert_ps (val2.x, val1.x, msk4);
+      masks[n] = msk4;
+    }
+
+  for (n = 0; n < 8; n++)
+    {
+      const int dst_lane = (masks[n] & 0x30) >> 4;
+      const int src_lane = (masks[n] & 0xC0) >> 6;
+
+      want = val2;
+      want.f[dst_lane] = val1.f[src_lane];
+
+      for (lane = 0; lane < 4; lane++)
+        if ((masks[n] & (0x1 << lane)) != 0)
+          want.f[lane] = 0.f;
+
+      if (memcmp (&res[n], &want, sizeof (want)) != 0)
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Check the _MM_PICK_OUT_PS helper (the picked element must land in
+   lane 0 with all other lanes zero) and the immediates produced by
+   _MM_MK_INSERTPS_NDX.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } vals[4], val;
+  int pick, lane;
+
+  val.f[0] = 1.;
+  val.f[1] = 2.;
+  val.f[2] = 3.;
+  val.f[3] = 4.;
+
+  vals[0].x = _MM_PICK_OUT_PS (val.x, 0);
+  vals[1].x = _MM_PICK_OUT_PS (val.x, 1);
+  vals[2].x = _MM_PICK_OUT_PS (val.x, 2);
+  vals[3].x = _MM_PICK_OUT_PS (val.x, 3);
+
+  for (pick = 0; pick < 4; pick++)
+    for (lane = 0; lane < 4; lane++)
+      {
+        if (lane == 0)
+          {
+            if (vals[pick].f[lane] != val.f[pick])
+              abort ();
+          }
+        else if (vals[pick].f[lane] != 0)
+          abort ();
+      }
+
+  if (_MM_MK_INSERTPS_NDX(0, 0, 0x1) != 0x01
+      || _MM_MK_INSERTPS_NDX(0, 1, 0x2) != 0x12
+      || _MM_MK_INSERTPS_NDX(0, 2, 0x3) != 0x23
+      || _MM_MK_INSERTPS_NDX(0, 3, 0x4) != 0x34
+      || _MM_MK_INSERTPS_NDX(1, 0, 0x5) != 0x45
+      || _MM_MK_INSERTPS_NDX(1, 1, 0x6) != 0x56
+      || _MM_MK_INSERTPS_NDX(2, 2, 0x7) != 0xA7
+      || _MM_MK_INSERTPS_NDX(3, 3, 0x8) != 0xF8)
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+/* Fill SRC (NUM * 4 ints) with lane * vector * vector, flipping the
+   sign from one element to the next.  */
+static void
+init_movntdqa (int *src)
+{
+  int k;
+  int sign = 1;
+
+  /* One flat pass; vector number and lane are derived from K.  */
+  for (k = 0; k < NUM * 4; k++, sign = -sign)
+    {
+      const int vec = k / 4;
+      const int lane = k % 4;
+
+      src[k] = lane * vec * vec * sign;
+    }
+}
+
+/* Copy NUM vectors with _mm_stream_load_si128 and verify that each
+   copy is bytewise equal to its source.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM];
+      int i[NUM * 4];
+    } dst, src;
+  int n;
+
+  init_movntdqa (src.i);
+
+  for (n = 0; n < NUM; n++)
+    dst.x[n] = _mm_stream_load_si128 (&src.x[n]);
+
+  for (n = 0; n < NUM; n++)
+    {
+      if (memcmp (&dst.x[n], &src.x[n], sizeof (src.x[n])) != 0)
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0xC0
+#define msk1 0x01
+#define msk2 0xF2
+#define msk3 0x03
+#define msk4 0x84
+#define msk5 0x05
+#define msk6 0xE6
+#define msk7 0x67
+
+/* Scalar model used as the reference for _mm_mpsadbw_epu8.  MASK bits
+   1:0 pick a 4-byte group of V2 and bit 2 picks the starting offset
+   (0 or 4) into V1.  Each of the eight 16-bit results is the sum of
+   absolute differences between that group and four consecutive bytes
+   of V1, sliding one byte per result.  */
+static __m128i
+compute_mpsadbw (unsigned char *v1, unsigned char *v2, int mask)
+{
+  union
+    {
+      __m128i x;
+      unsigned short s[8];
+    } ret;
+  unsigned char ref[4];
+  const int offs2 = 4 * (mask & 3);
+  const int offs1 = 4 * ((mask & 4) >> 2);
+  int k, word;
+
+  for (k = 0; k < 4; k++)
+    ref[k] = v2[offs2 + k];
+
+  for (word = 0; word < 8; word++)
+    {
+      unsigned char *win = &v1[offs1 + word];
+
+      ret.s[word] = 0;
+      for (k = 0; k < 4; k++)
+        ret.s[word] += abs (win[k] - ref[k]);
+    }
+
+  return ret.x;
+}
+
+/* Exercise _mm_mpsadbw_epu8 and compare each result bytewise with
+ compute_mpsadbw. The first half applies eight different immediates
+ to one pair of vectors; the second half applies the single msk4
+ immediate to an array of second operands. */
+static void
+sse4_1_test (void)
+{
+ union
+ {
+ __m128i x;
+ unsigned int i[4];
+ unsigned char c[16];
+ } val1, val2, val3 [8];
+ __m128i res[8], tmp;
+ unsigned char masks[8];
+ int i;
+
+ val1.i[0] = 0x35251505;
+ val1.i[1] = 0x75655545;
+ val1.i[2] = 0xB5A59585;
+ val1.i[3] = 0xF5E5D5C5;
+
+ val2.i[0] = 0x31211101;
+ val2.i[1] = 0x71615141;
+ val2.i[2] = 0xB1A19181;
+ val2.i[3] = 0xF1E1D1C1;
+
+ /* Entries with i % 3 == 1 get the words of val2 in reverse order;
+ all other entries are plain copies of val2. */
+ for (i=0; i < 8; i++)
+ switch (i % 3)
+ {
+ case 1:
+ val3[i].i[0] = 0xF1E1D1C1;
+ val3[i].i[1] = 0xB1A19181;
+ val3[i].i[2] = 0x71615141;
+ val3[i].i[3] = 0x31211101;
+ break;
+ default:
+ val3[i].x = val2.x;
+ break;
+ }
+
+ /* Check mpsadbw imm8, xmm, xmm. */
+ res[0] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk0);
+ res[1] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk1);
+ res[2] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk2);
+ res[3] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk3);
+ res[4] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk4);
+ res[5] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk5);
+ res[6] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk6);
+ res[7] = _mm_mpsadbw_epu8 (val1.x, val2.x, msk7);
+
+ /* Record the immediates used above so the scalar model sees the
+ same values. */
+ masks[0] = msk0;
+ masks[1] = msk1;
+ masks[2] = msk2;
+ masks[3] = msk3;
+ masks[4] = msk4;
+ masks[5] = msk5;
+ masks[6] = msk6;
+ masks[7] = msk7;
+
+ for (i=0; i < 8; i++)
+ {
+ tmp = compute_mpsadbw (val1.c, val2.c, masks[i]);
+ if (memcmp (&tmp, &res[i], sizeof (tmp)))
+ abort ();
+ }
+
+ /* Check mpsadbw imm8, m128, xmm. */
+ for (i=0; i < 8; i++)
+ {
+ res[i] = _mm_mpsadbw_epu8 (val1.x, val3[i].x, msk4);
+ masks[i] = msk4;
+ }
+
+ for (i=0; i < 8; i++)
+ {
+ tmp = compute_mpsadbw (val1.c, val3[i].c, masks[i]);
+ if (memcmp (&tmp, &res[i], sizeof (tmp)))
+ abort ();
+ }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Clamp IVAL to the range of a 16-bit unsigned value: negative inputs
+   give 0, inputs above 0xffff give 0xffff, anything else is returned
+   unchanged.  */
+static unsigned short
+int_to_ushort (int iVal)
+{
+  if (iVal < 0)
+    return 0;
+
+  if (iVal > 0xffff)
+    return 0xffff;
+
+  return iVal;
+}
+
+/* Pack pairs of int vectors with _mm_packus_epi32 and compare every
+   16-bit result with int_to_ushort applied to the matching source
+   element.  In each result vector the first four words come from src1
+   and the last four from src2.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } src1, src2;
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned short s[NUM * 2];
+    } dst;
+  int k, vec;
+  int sign = 1;
+
+  for (k = 0; k < NUM; k++, sign = -sign)
+    {
+      src1.i[k] = k * k * sign;
+      src2.i[k] = (k + 20) * sign;
+    }
+
+  for (vec = 0; vec < NUM / 4; vec++)
+    dst.x[vec] = _mm_packus_epi32 (src1.x [vec], src2.x [vec]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      /* Word holding the src1 element; the src2 one sits four later.  */
+      const int lo_index = (k % 4) + (k / 4) * 8;
+      const int hi_index = lo_index + 4;
+
+      if (int_to_ushort (src1.i[k]) != dst.s[lo_index])
+        abort ();
+
+      if (int_to_ushort (src2.i[k]) != dst.s[hi_index])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+/* Fill SRC1, SRC2 and MASK (NUM * 16 bytes each) with deterministic
+   byte patterns.  The mask byte is derived from the index and from the
+   two data bytes just stored.  */
+static void
+init_pblendvb (unsigned char *src1, unsigned char *src2,
+               unsigned char *mask)
+{
+  int idx;
+  int sign = 1;
+
+  for (idx = 0; idx < NUM * 16; idx++, sign = -sign)
+    {
+      src1[idx] = idx * idx * sign;
+      src2[idx] = (idx + 20) * sign;
+      mask[idx] = (idx % 3) + ((idx * (14 + sign))
+                               ^ (src1[idx] | src2[idx] | (idx * 3)));
+    }
+}
+
+/* Build the byte blend of SRC1 and SRC2 expected for MASK (a byte
+   takes SRC2 when the top bit of its mask byte is set) and compare it
+   bytewise with *DST.  Returns the memcmp result, 0 on a match.  */
+static int
+check_pblendvb (__m128i *dst, unsigned char *src1,
+                unsigned char *src2, unsigned char *mask)
+{
+  unsigned char expected[16];
+  int pos = 0;
+
+  memcpy (expected, src1, sizeof (expected));
+
+  while (pos < 16)
+    {
+      if ((mask[pos] & 0x80) != 0)
+        expected[pos] = src2[pos];
+      pos++;
+    }
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Run _mm_blendv_epi8 over NUM vector triples and abort as soon as
+   check_pblendvb reports a difference.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM];
+      unsigned char c[NUM * 16];
+    } dst, src1, src2, mask;
+  int n;
+
+  init_pblendvb (src1.c, src2.c, mask.c);
+
+  for (n = 0; n < NUM; n++)
+    {
+      const int first = n * 16;
+
+      dst.x[n] = _mm_blendv_epi8 (src1.x[n], src2.x[n], mask.x[n]);
+      if (check_pblendvb (&dst.x[n], &src1.c[first], &src2.c[first],
+                          &mask.c[first]) != 0)
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#ifndef MASK
+#define MASK 0x0f
+#endif
+
+/* Fill SRC1 and SRC2 (NUM * 8 shorts each) with deterministic values
+   whose sign alternates from one element to the next.  */
+static void
+init_pblendw (short *src1, short *src2)
+{
+  int idx;
+  int sign = 1;
+
+  for (idx = 0; idx < NUM * 8; idx++, sign = -sign)
+    {
+      src1[idx] = idx * idx * sign;
+      src2[idx] = (idx + 20) * sign;
+    }
+}
+
+/* Build the word blend of SRC1 and SRC2 expected under the
+   compile-time MASK and compare it bytewise with *DST.  Returns the
+   memcmp result, 0 on a match.  */
+static int
+check_pblendw (__m128i *dst, short *src1, short *src2)
+{
+  short expected[8];
+  int lane = 0;
+
+  /* Start from SRC1, then overwrite each word whose MASK bit is set.  */
+  memcpy (expected, src1, sizeof (expected));
+
+  while (lane < 8)
+    {
+      if ((MASK & (1 << lane)) != 0)
+        expected[lane] = src2[lane];
+      lane++;
+    }
+
+  return memcmp (dst, expected, sizeof (expected));
+}
+
+/* Exercise _mm_blend_epi16 with the immediate MASK, first over arrays
+   of vectors and then on two local vectors, aborting on any mismatch
+   reported by check_pblendw.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM];
+      short s[NUM * 8];
+    } dst, src1, src2;
+  union
+    {
+      __m128i x;
+      short s[8];
+    } zero;
+  __m128i fwd, rev;
+  int n;
+
+  init_pblendw (src1.s, src2.s);
+
+  /* Check pblendw imm8, m128, xmm */
+  for (n = 0; n < NUM; n++)
+    {
+      dst.x[n] = _mm_blend_epi16 (src1.x[n], src2.x[n], MASK);
+      if (check_pblendw (&dst.x[n], &src1.s[n * 8], &src2.s[n * 8]) != 0)
+        abort ();
+    }
+
+  /* Check pblendw imm8, xmm, xmm */
+  zero.x = _mm_setzero_si128 ();
+
+  fwd = _mm_blend_epi16 (dst.x[2], zero.x, MASK);
+  rev = _mm_blend_epi16 (zero.x, dst.x[2], MASK);
+
+  /* Blend a computed vector with zero in both operand orders.  */
+  if (check_pblendw (&fwd, &dst.s[16], &zero.s[0]) != 0
+      || check_pblendw (&rev, &zero.s[0], &dst.s[16]) != 0)
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+/* Compare pairs of 64-bit vectors with _mm_cmpeq_epi64 and check that
+   every result element is all ones when the inputs are equal and zero
+   otherwise.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int k, vec;
+  int sign = 1;
+
+  for (k = 0; k < NUM; k++, sign = -sign)
+    {
+      src1.ll[k] = k * k * sign;
+      src2.ll[k] = (k + 20) * sign;
+    }
+
+  for (vec = 0; vec < NUM / 2; vec++)
+    dst.x [vec] = _mm_cmpeq_epi64 (src1.x [vec], src2.x [vec]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      /* All 64 bits set reads back as -1 in a long long.  */
+      const long long expected = (src1.ll[k] == src2.ll[k]) ? -1LL : 0LL;
+
+      if (dst.ll[k] != expected)
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0 0
+#define msk1 1
+#define msk2 2
+#define msk3 3
+#define msk4 4
+#define msk5 5
+#define msk6 6
+#define msk7 7
+#define msk8 8
+#define msk9 9
+#define msk10 10
+#define msk11 11
+#define msk12 12
+#define msk13 13
+#define msk14 14
+#define msk15 15
+
+/* Extract each of the sixteen bytes of a vector with
+   _mm_extract_epi8 and compare with the byte read through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      int i[4];
+      char c[16];
+    } val1;
+  int res[16];
+  int masks[16] =
+    {
+      msk0, msk1, msk2, msk3, msk4, msk5, msk6, msk7,
+      msk8, msk9, msk10, msk11, msk12, msk13, msk14, msk15
+    };
+  int n;
+
+  val1.i[0] = 0x04030201;
+  val1.i[1] = 0x08070605;
+  val1.i[2] = 0x0C0B0A09;
+  val1.i[3] = 0x100F0E0D;
+
+  /* The index must be a compile-time constant, hence one call each.  */
+  res[0] = _mm_extract_epi8 (val1.x, msk0);
+  res[1] = _mm_extract_epi8 (val1.x, msk1);
+  res[2] = _mm_extract_epi8 (val1.x, msk2);
+  res[3] = _mm_extract_epi8 (val1.x, msk3);
+  res[4] = _mm_extract_epi8 (val1.x, msk4);
+  res[5] = _mm_extract_epi8 (val1.x, msk5);
+  res[6] = _mm_extract_epi8 (val1.x, msk6);
+  res[7] = _mm_extract_epi8 (val1.x, msk7);
+  res[8] = _mm_extract_epi8 (val1.x, msk8);
+  res[9] = _mm_extract_epi8 (val1.x, msk9);
+  res[10] = _mm_extract_epi8 (val1.x, msk10);
+  res[11] = _mm_extract_epi8 (val1.x, msk11);
+  res[12] = _mm_extract_epi8 (val1.x, msk12);
+  res[13] = _mm_extract_epi8 (val1.x, msk13);
+  res[14] = _mm_extract_epi8 (val1.x, msk14);
+  res[15] = _mm_extract_epi8 (val1.x, msk15);
+
+  for (n = 0; n < 16; n++)
+    {
+      if (res[n] != val1.c [masks[n]])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0 0
+#define msk1 1
+#define msk2 2
+#define msk3 3
+
+/* Extract each of the four 32-bit elements of a vector with
+   _mm_extract_epi32 and compare with the element read through the
+   union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      int i[4];
+    } val1;
+  int res[4];
+  int masks[4] = { msk0, msk1, msk2, msk3 };
+  int n;
+
+  val1.i[0] = 0x04030201;
+  val1.i[1] = 0x08070605;
+  val1.i[2] = 0x0C0B0A09;
+  val1.i[3] = 0x100F0E0D;
+
+  /* The index must be a compile-time constant, hence one call each.  */
+  res[0] = _mm_extract_epi32 (val1.x, msk0);
+  res[1] = _mm_extract_epi32 (val1.x, msk1);
+  res[2] = _mm_extract_epi32 (val1.x, msk2);
+  res[3] = _mm_extract_epi32 (val1.x, msk3);
+
+  for (n = 0; n < 4; n++)
+    {
+      if (res[n] != val1.i [masks[n]])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target { { i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0 0
+#define msk1 1
+
+/* Extract both 64-bit elements of a vector with _mm_extract_epi64 and
+   compare with the element read through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      long long ll[2];
+    } val1;
+  long long res[2];
+  int masks[2] = { msk0, msk1 };
+  int n;
+
+  val1.ll[0] = 0x0807060504030201LL;
+  val1.ll[1] = 0x100F0E0D0C0B0A09LL;
+
+  /* The index must be a compile-time constant, hence one call each.  */
+  res[0] = _mm_extract_epi64 (val1.x, msk0);
+  res[1] = _mm_extract_epi64 (val1.x, msk1);
+
+  for (n = 0; n < 2; n++)
+    {
+      if (res[n] != val1.ll [masks[n]])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+#define msk0 0
+#define msk1 1
+#define msk2 2
+#define msk3 3
+#define msk4 4
+#define msk5 5
+#define msk6 6
+#define msk7 7
+
+/* Extract each of the eight 16-bit elements of a vector with
+   _mm_extract_epi16 and compare with the element read through the
+   union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      int i[4];
+      short s[8];
+    } val1;
+  int res[8];
+  int masks[8] = { msk0, msk1, msk2, msk3, msk4, msk5, msk6, msk7 };
+  int n;
+
+  val1.i[0] = 0x04030201;
+  val1.i[1] = 0x08070605;
+  val1.i[2] = 0x0C0B0A09;
+  val1.i[3] = 0x100F0E0D;
+
+  /* The index must be a compile-time constant, hence one call each.  */
+  res[0] = _mm_extract_epi16 (val1.x, msk0);
+  res[1] = _mm_extract_epi16 (val1.x, msk1);
+  res[2] = _mm_extract_epi16 (val1.x, msk2);
+  res[3] = _mm_extract_epi16 (val1.x, msk3);
+  res[4] = _mm_extract_epi16 (val1.x, msk4);
+  res[5] = _mm_extract_epi16 (val1.x, msk5);
+  res[6] = _mm_extract_epi16 (val1.x, msk6);
+  res[7] = _mm_extract_epi16 (val1.x, msk7);
+
+  for (n = 0; n < 8; n++)
+    {
+      if (res[n] != val1.s [masks[n]])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 16-bit source elements; processed eight at a time.  */
+#define NUM 64
+
+/* Check _mm_minpos_epu16: for every group of eight unsigned words, the
+   intrinsic's (minimum, index) pair must equal the pair found by a scalar
+   scan that keeps the first occurrence of the minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM/8];
+      unsigned short s[NUM];
+    } src;
+  unsigned short minVal[NUM/8];
+  int minInd[NUM/8];
+  unsigned short minValScalar, minIndScalar;
+  int i, j, res;
+
+  for (i = 0; i < NUM; i++)
+    src.s[i] = i * i / (i + i / 3.14 + 1.0);
+
+  for (i = 0, j = 0; i < NUM; i += 8, j++)
+    {
+      res = _mm_cvtsi128_si32 (_mm_minpos_epu16 (src.x [i/8]));
+      /* Low word: the minimum value.  */
+      minVal[j] = res & 0xffff;
+      /* Next three bits: the position (0..7) of that minimum.  A two-bit
+         mask would fold positions 4..7 onto 0..3.  */
+      minInd[j] = (res >> 16) & 0x7;
+    }
+
+  for (i = 0; i < NUM; i += 8)
+    {
+      minValScalar = src.s[i];
+      minIndScalar = 0;
+
+      /* Strict comparison keeps the lowest index among equal minima.  */
+      for (j = i + 1; j < i + 8; j++)
+        if (minValScalar > src.s[j])
+          {
+            minValScalar = src.s[j];
+            minIndScalar = j - i;
+          }
+
+      /* Fail when either the value or the index is wrong; requiring both to
+         be wrong would let a single wrong field pass.  */
+      if (minValScalar != minVal[i/8] || minIndScalar != minInd[i/8])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+/* Byte positions handed to _mm_insert_epi8, kept as literal constants.  */
+#define msk0 0x00
+#define msk1 0x01
+#define msk2 0x02
+#define msk3 0x03
+#define msk4 0x04
+#define msk5 0x05
+#define msk6 0x06
+#define msk7 0x07
+#define msk8 0x08
+#define msk9 0x09
+#define mskA 0x0A
+#define mskB 0x0B
+#define mskC 0x0C
+#define mskD 0x0D
+#define mskE 0x0E
+#define mskF 0x0F
+
+/* Insert a byte at every one of the 16 positions with _mm_insert_epi8, then
+   insert varying bytes at position 0; each result must equal the base vector
+   with that single byte overwritten through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+      unsigned char c[16];
+    } got [16], base, want;
+  int pos[16];
+  unsigned char ins[4] = { 3, 4, 5, 6 };
+  int k;
+
+  base.i[0] = 0x35251505;
+  base.i[1] = 0x75655545;
+  base.i[2] = 0xB5A59585;
+  base.i[3] = 0xF5E5D5C5;
+
+  pos[0] = msk0;
+  pos[1] = msk1;
+  pos[2] = msk2;
+  pos[3] = msk3;
+  pos[4] = msk4;
+  pos[5] = msk5;
+  pos[6] = msk6;
+  pos[7] = msk7;
+  pos[8] = msk8;
+  pos[9] = msk9;
+  pos[10] = mskA;
+  pos[11] = mskB;
+  pos[12] = mskC;
+  pos[13] = mskD;
+  pos[14] = mskE;
+  pos[15] = mskF;
+
+  /* Check pinsrb imm8, r32, xmm.  */
+  got[0].x = _mm_insert_epi8 (base.x, ins[0], msk0);
+  got[1].x = _mm_insert_epi8 (base.x, ins[0], msk1);
+  got[2].x = _mm_insert_epi8 (base.x, ins[0], msk2);
+  got[3].x = _mm_insert_epi8 (base.x, ins[0], msk3);
+  got[4].x = _mm_insert_epi8 (base.x, ins[0], msk4);
+  got[5].x = _mm_insert_epi8 (base.x, ins[0], msk5);
+  got[6].x = _mm_insert_epi8 (base.x, ins[0], msk6);
+  got[7].x = _mm_insert_epi8 (base.x, ins[0], msk7);
+  got[8].x = _mm_insert_epi8 (base.x, ins[0], msk8);
+  got[9].x = _mm_insert_epi8 (base.x, ins[0], msk9);
+  got[10].x = _mm_insert_epi8 (base.x, ins[0], mskA);
+  got[11].x = _mm_insert_epi8 (base.x, ins[0], mskB);
+  got[12].x = _mm_insert_epi8 (base.x, ins[0], mskC);
+  got[13].x = _mm_insert_epi8 (base.x, ins[0], mskD);
+  got[14].x = _mm_insert_epi8 (base.x, ins[0], mskE);
+  got[15].x = _mm_insert_epi8 (base.x, ins[0], mskF);
+
+  k = 0;
+  while (k < 16)
+    {
+      want.x = base.x;
+      want.c[pos[k]] = ins[0];
+      if (memcmp (&want, &got[k], sizeof (want)) != 0)
+        abort ();
+      k++;
+    }
+
+  /* Check pinsrb imm8, m8, xmm.  */
+  k = 0;
+  while (k < 16)
+    {
+      pos[k] = msk0;
+      got[k].x = _mm_insert_epi8 (base.x, ins[k % 4], msk0);
+      k++;
+    }
+
+  k = 0;
+  while (k < 16)
+    {
+      want.x = base.x;
+      want.c[pos[k]] = ins[k % 4];
+      if (memcmp (&want, &got[k], sizeof (want)) != 0)
+        abort ();
+      k++;
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+/* Doubleword positions handed to _mm_insert_epi32, kept as literal
+   constants.  */
+#define msk0 0x00
+#define msk1 0x01
+#define msk2 0x02
+#define msk3 0x03
+
+/* Insert a doubleword at each of the four positions with _mm_insert_epi32,
+   then insert varying doublewords at position 0; each result must equal the
+   base vector with that one element overwritten through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+    } got [4], base, want;
+  static unsigned int ins[4] = { 3, 4, 5, 6 };
+  int pos[4];
+  int k;
+
+  for (k = 0; k < 4; k++)
+    base.i[k] = 55;
+
+  pos[0] = msk0;
+  pos[1] = msk1;
+  pos[2] = msk2;
+  pos[3] = msk3;
+
+  /* Check pinsrd imm8, r32, xmm.  */
+  got[0].x = _mm_insert_epi32 (base.x, ins[0], msk0);
+  got[1].x = _mm_insert_epi32 (base.x, ins[0], msk1);
+  got[2].x = _mm_insert_epi32 (base.x, ins[0], msk2);
+  got[3].x = _mm_insert_epi32 (base.x, ins[0], msk3);
+
+  k = 0;
+  while (k < 4)
+    {
+      want.x = base.x;
+      want.i[pos[k]] = ins[0];
+      if (memcmp (&want, &got[k], sizeof (want)) != 0)
+        abort ();
+      k++;
+    }
+
+  /* Check pinsrd imm8, m32, xmm.  */
+  k = 0;
+  while (k < 4)
+    {
+      pos[k] = msk0;
+      got[k].x = _mm_insert_epi32 (base.x, ins[k], msk0);
+      k++;
+    }
+
+  k = 0;
+  while (k < 4)
+    {
+      want.x = base.x;
+      want.i[pos[k]] = ins[k];
+      if (memcmp (&want, &got[k], sizeof (want)) != 0)
+        abort ();
+      k++;
+    }
+}
--- /dev/null
+/* { dg-do run { target { { i?86-*-* x86_64-*-* } && lp64 } } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+/* Quadword positions handed to _mm_insert_epi64, kept as literal
+   constants.  */
+#define msk0 0x00
+#define msk1 0x01
+
+/* Insert a quadword at both positions with _mm_insert_epi64, then insert
+   two different quadwords at position 0; each result must equal the base
+   vector with that one element overwritten through the union.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned long long ll[2];
+    } got [4], base, want;
+  int pos[4];
+  static unsigned long long ins[2] =
+    { 0xAABBAABBAABBAABBLL, 0xCCDDCCDDCCDDCCDDLL };
+  int k;
+
+  base.ll[0] = 0x0807060504030201LL;
+  base.ll[1] = 0x100F0E0D0C0B0A09LL;
+
+  pos[0] = msk0;
+  pos[1] = msk1;
+
+  /* Check pinsrq imm8, r64, xmm.  */
+  got[0].x = _mm_insert_epi64 (base.x, ins[0], msk0);
+  got[1].x = _mm_insert_epi64 (base.x, ins[0], msk1);
+
+  k = 0;
+  while (k < 2)
+    {
+      want.x = base.x;
+      want.ll[pos[k]] = ins[0];
+      if (memcmp (&want, &got[k], sizeof (want)) != 0)
+        abort ();
+      k++;
+    }
+
+  /* Check pinsrq imm8, m64, xmm.  */
+  k = 0;
+  while (k < 2)
+    {
+      pos[k] = msk0;
+      got[k].x = _mm_insert_epi64 (base.x, ins[k], msk0);
+      k++;
+    }
+
+  k = 0;
+  while (k < 2)
+    {
+      want.x = base.x;
+      want.ll[pos[k]] = ins[k];
+      if (memcmp (&want, &got[k], sizeof (want)) != 0)
+        abort ();
+      k++;
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of byte elements; processed sixteen at a time.  */
+#define NUM 1024
+
+/* Check _mm_max_epi8 against a scalar signed-byte maximum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 16];
+      /* Explicitly signed: the intrinsic compares signed bytes, while the
+         signedness of plain char depends on the target and on
+         -funsigned-char.  */
+      signed char i[NUM];
+    } dst, src1, src2;
+  int i, sign = 1;
+  signed char max;
+
+  /* Alternate the sign so both positive and negative bytes are compared.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 16)
+    dst.x[i / 16] = _mm_max_epi8 (src1.x[i / 16], src2.x[i / 16]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      max = src1.i[i] <= src2.i[i] ? src2.i[i] : src1.i[i];
+      if (max != dst.i[i])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit elements; processed four at a time.  */
+#define NUM 64
+
+/* Check _mm_max_epi32 against a scalar signed maximum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } out, a, b;
+  int k;
+  int expect;
+
+  /* Even elements get non-negative values, odd elements negative ones.  */
+  for (k = 0; k < NUM; k++)
+    {
+      int sign = (k & 1) ? -1 : 1;
+
+      a.i[k] = k * k * sign;
+      b.i[k] = (k + 20) * sign;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x[k] = _mm_max_epi32 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k];
+      if (a.i[k] <= b.i[k])
+        expect = b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit elements; processed four at a time.  */
+#define NUM 64
+
+/* Check _mm_max_epu32 against a scalar unsigned maximum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+    } out, a, b;
+  int k;
+  unsigned int expect;
+
+  for (k = 0; k < NUM; k++)
+    {
+      unsigned int second = k + 20;
+
+      /* Set the top bit on three of every four elements so that signed and
+         unsigned ordering disagree there.  */
+      if ((k % 4) != 0)
+        second |= 0x80000000;
+
+      a.i[k] = k * k;
+      b.i[k] = second;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x[k] = _mm_max_epu32 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k];
+      if (a.i[k] <= b.i[k])
+        expect = b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 16-bit elements; processed eight at a time.  */
+#define NUM 64
+
+/* Check _mm_max_epu16 against a scalar unsigned maximum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      unsigned short i[NUM];
+    } out, a, b;
+  int k;
+  unsigned short expect;
+
+  for (k = 0; k < NUM; k++)
+    {
+      unsigned short second = k + 20;
+
+      /* Set the top bit on seven of every eight elements so that signed and
+         unsigned ordering disagree there.  */
+      if ((k % 8) != 0)
+        second |= 0x8000;
+
+      a.i[k] = k * k;
+      b.i[k] = second;
+    }
+
+  for (k = 0; k < NUM / 8; k++)
+    out.x[k] = _mm_max_epu16 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k];
+      if (a.i[k] <= b.i[k])
+        expect = b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of byte elements; processed sixteen at a time.  */
+#define NUM 1024
+
+/* Check _mm_min_epi8 against a scalar signed-byte minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 16];
+      /* Explicitly signed: the intrinsic compares signed bytes, while the
+         signedness of plain char depends on the target and on
+         -funsigned-char.  */
+      signed char i[NUM];
+    } dst, src1, src2;
+  int i, sign = 1;
+  signed char min;
+
+  /* Alternate the sign so both positive and negative bytes are compared.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 16)
+    dst.x[i / 16] = _mm_min_epi8 (src1.x[i / 16], src2.x[i / 16]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      min = src1.i[i] >= src2.i[i] ? src2.i[i] : src1.i[i];
+      if (min != dst.i[i])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit elements; processed four at a time.  */
+#define NUM 64
+
+/* Check _mm_min_epi32 against a scalar signed minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } out, a, b;
+  int k;
+  int expect;
+
+  /* Even elements get non-negative values, odd elements negative ones.  */
+  for (k = 0; k < NUM; k++)
+    {
+      int sign = (k & 1) ? -1 : 1;
+
+      a.i[k] = k * k * sign;
+      b.i[k] = (k + 20) * sign;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x[k] = _mm_min_epi32 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k];
+      if (a.i[k] >= b.i[k])
+        expect = b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit elements; processed four at a time.  */
+#define NUM 64
+
+/* Check _mm_min_epu32 against a scalar unsigned minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+    } out, a, b;
+  int k;
+  unsigned int expect;
+
+  for (k = 0; k < NUM; k++)
+    {
+      unsigned int second = k + 20;
+
+      /* Set the top bit on three of every four elements so that signed and
+         unsigned ordering disagree there.  */
+      if ((k % 4) != 0)
+        second |= 0x80000000;
+
+      a.i[k] = k * k;
+      b.i[k] = second;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x[k] = _mm_min_epu32 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k];
+      if (a.i[k] >= b.i[k])
+        expect = b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 16-bit elements; processed eight at a time.  */
+#define NUM 64
+
+/* Check _mm_min_epu16 against a scalar unsigned minimum.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      unsigned short i[NUM];
+    } out, a, b;
+  int k;
+  unsigned short expect;
+
+  for (k = 0; k < NUM; k++)
+    {
+      unsigned short second = k + 20;
+
+      /* Set the top bit on seven of every eight elements so that signed and
+         unsigned ordering disagree there.  */
+      if ((k % 8) != 0)
+        second |= 0x8000;
+
+      a.i[k] = k * k;
+      b.i[k] = second;
+    }
+
+  for (k = 0; k < NUM / 8; k++)
+    out.x[k] = _mm_min_epu16 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k];
+      if (a.i[k] >= b.i[k])
+        expect = b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit results; four are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepi8_epi32: each of the low four bytes of a 16-byte source
+   chunk must come back sign-extended to 32 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+      /* Explicitly signed so the scalar reference sign-extends regardless
+         of the signedness of plain char (target / -funsigned-char).  */
+      signed char c[NUM * 4];
+    } dst, src;
+  int i, sign = 1;
+
+  /* Only bytes 0..3 of every 16-byte chunk are written; those are the
+     bytes compared against the result below.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src.c[(i % 4) + (i / 4) * 16] = i * i * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x [i / 4] = _mm_cvtepi8_epi32 (src.x [i / 4]);
+
+  for (i = 0; i < NUM; i++)
+    if (src.c[(i % 4) + (i / 4) * 16] != dst.i[i])
+      abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit results; two are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepi8_epi64: each of the low two bytes of a 16-byte source
+   chunk must come back sign-extended to 64 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+      /* Explicitly signed so the scalar reference sign-extends regardless
+         of the signedness of plain char (target / -funsigned-char).  */
+      signed char c[NUM * 8];
+    } dst, src;
+  int i, sign = 1;
+
+  /* Only bytes 0..1 of every 16-byte chunk are written; those are the
+     bytes compared against the result below.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src.c[(i % 2) + (i / 2) * 16] = i * i * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x [i / 2] = _mm_cvtepi8_epi64 (src.x [i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    if (src.c[(i % 2) + (i / 2) * 16] != dst.ll[i])
+      abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 16-bit results; eight are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepi8_epi16: each of the low eight bytes of a 16-byte source
+   chunk must come back sign-extended to 16 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      short s[NUM];
+      /* Explicitly signed so the scalar reference sign-extends regardless
+         of the signedness of plain char (target / -funsigned-char).  */
+      signed char c[NUM * 2];
+    } dst, src;
+  int i, sign = 1;
+
+  /* Only bytes 0..7 of every 16-byte chunk are written; those are the
+     bytes compared against the result below.  */
+  for (i = 0; i < NUM; i++)
+    {
+      src.c[(i % 8) + (i / 8) * 16] = i * i * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 8)
+    dst.x [i / 8] = _mm_cvtepi8_epi16 (src.x [i / 8]);
+
+  for (i = 0; i < NUM; i++)
+    if (src.c[(i % 8) + (i / 8) * 16] != dst.s[i])
+      abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit results; two are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepi32_epi64: the low two ints of each 16-byte source chunk
+   must come back converted to long long with the same value.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+      int i[NUM * 2];
+    } out, in;
+  int k, pos, sign;
+
+  /* Only ints 0..1 of every four-int chunk are written and later checked.  */
+  for (k = 0; k < NUM; k++)
+    {
+      sign = (k & 1) ? -1 : 1;
+      pos = (k % 2) + (k / 2) * 4;
+      in.i[pos] = k * k * sign;
+    }
+
+  for (k = 0; k < NUM / 2; k++)
+    out.x [k] = _mm_cvtepi32_epi64 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 4;
+      if (in.i[pos] != out.ll[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit results; four are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepi16_epi32: the low four shorts of each 16-byte source
+   chunk must come back converted to int with the same value.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+      short s[NUM * 2];
+    } out, in;
+  int k, pos, sign;
+
+  /* Only shorts 0..3 of every eight-short chunk are written and checked.  */
+  for (k = 0; k < NUM; k++)
+    {
+      sign = (k & 1) ? -1 : 1;
+      pos = (k % 4) + (k / 4) * 8;
+      in.s[pos] = k * k * sign;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x [k] = _mm_cvtepi16_epi32 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 4) + (k / 4) * 8;
+      if (in.s[pos] != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit results; two are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepi16_epi64: the low two shorts of each 16-byte source
+   chunk must come back converted to long long with the same value.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+      short s[NUM * 4];
+    } out, in;
+  int k, pos, sign;
+
+  /* Only shorts 0..1 of every eight-short chunk are written and checked.  */
+  for (k = 0; k < NUM; k++)
+    {
+      sign = (k & 1) ? -1 : 1;
+      pos = (k % 2) + (k / 2) * 8;
+      in.s[pos] = k * k * sign;
+    }
+
+  for (k = 0; k < NUM / 2; k++)
+    out.x [k] = _mm_cvtepi16_epi64 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 8;
+      if (in.s[pos] != out.ll[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit results; four are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepu8_epi32: the low four bytes of each 16-byte source chunk
+   must come back as the same unsigned value widened to 32 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+      unsigned char c[NUM * 4];
+    } out, in;
+  int k, pos;
+
+  /* Only bytes 0..3 of every 16-byte chunk are written and checked.  The
+     top bit is forced on for most elements so that a sign-extending
+     conversion would be caught.  */
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 4) + (k / 4) * 16;
+      in.c[pos] = k * k;
+      if ((k % 4) != 0)
+        in.c[pos] |= 0x80;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x [k] = _mm_cvtepu8_epi32 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 4) + (k / 4) * 16;
+      if (in.c[pos] != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit results; two are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepu8_epi64: the low two bytes of each 16-byte source chunk
+   must come back as the same unsigned value widened to 64 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      unsigned long long ll[NUM];
+      unsigned char c[NUM * 8];
+    } out, in;
+  int k, pos;
+
+  /* Only bytes 0..1 of every 16-byte chunk are written and checked.  The
+     top bit is forced on for odd elements so that a sign-extending
+     conversion would be caught.  */
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 16;
+      in.c[pos] = k * k;
+      if ((k % 2) != 0)
+        in.c[pos] |= 0x80;
+    }
+
+  for (k = 0; k < NUM / 2; k++)
+    out.x [k] = _mm_cvtepu8_epi64 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 16;
+      if (in.c[pos] != out.ll[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 16-bit results; eight are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepu8_epi16: the low eight bytes of each 16-byte source chunk
+   must come back as the same unsigned value widened to 16 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 8];
+      unsigned short s[NUM];
+      unsigned char c[NUM * 2];
+    } out, in;
+  int k, pos;
+
+  /* Only bytes 0..7 of every 16-byte chunk are written and checked.  The
+     top bit is forced on whenever the element number is not a multiple of
+     four, so that a sign-extending conversion would be caught.  */
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 8) + (k / 8) * 16;
+      in.c[pos] = k * k;
+      if ((k % 4) != 0)
+        in.c[pos] |= 0x80;
+    }
+
+  for (k = 0; k < NUM / 8; k++)
+    out.x [k] = _mm_cvtepu8_epi16 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 8) + (k / 8) * 16;
+      if (in.c[pos] != out.s[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit results; two are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepu32_epi64: the low two unsigned ints of each 16-byte
+   source chunk must come back as the same value widened to 64 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      unsigned long long ll[NUM];
+      unsigned int i[NUM * 2];
+    } out, in;
+  int k, pos;
+
+  /* Only ints 0..1 of every four-int chunk are written and checked.  The
+     top bit is forced on for odd elements so that a sign-extending
+     conversion would be caught.  */
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 4;
+      in.i[pos] = k * k;
+      if ((k % 2) != 0)
+        in.i[pos] |= 0x80000000;
+    }
+
+  for (k = 0; k < NUM / 2; k++)
+    out.x [k] = _mm_cvtepu32_epi64 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 4;
+      if (in.i[pos] != out.ll[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit results; four are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepu16_epi32: the low four unsigned shorts of each 16-byte
+   source chunk must come back as the same value widened to 32 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned int i[NUM];
+      unsigned short s[NUM * 2];
+    } out, in;
+  int k, pos;
+
+  /* Only shorts 0..3 of every eight-short chunk are written and checked.
+     The top bit is forced on for most elements so that a sign-extending
+     conversion would be caught.  */
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 4) + (k / 4) * 8;
+      in.s[pos] = k * k;
+      if ((k % 4) != 0)
+        in.s[pos] |= 0x8000;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x [k] = _mm_cvtepu16_epi32 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 4) + (k / 4) * 8;
+      if (in.s[pos] != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit results; two are produced per vector.  */
+#define NUM 128
+
+/* Check _mm_cvtepu16_epi64: the low two unsigned shorts of each 16-byte
+   source chunk must come back as the same value widened to 64 bits.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      unsigned long long ll[NUM];
+      unsigned short s[NUM * 4];
+    } out, in;
+  int k, pos;
+
+  /* Only shorts 0..1 of every eight-short chunk are written and checked.
+     The top bit is forced on for odd elements so that a sign-extending
+     conversion would be caught.  */
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 8;
+      in.s[pos] = k * k;
+      if ((k % 2) != 0)
+        in.s[pos] |= 0x8000;
+    }
+
+  for (k = 0; k < NUM / 2; k++)
+    out.x [k] = _mm_cvtepu16_epi64 (in.x [k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      pos = (k % 2) + (k / 2) * 8;
+      if (in.s[pos] != out.ll[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 64-bit products; two are produced per vector.  */
+#define NUM 64
+
+/* Check _mm_mul_epi32: every 64-bit result must equal the signed product
+   of the even-numbered 32-bit elements of the two sources.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst;
+  union
+    {
+      __m128i x[NUM / 2];
+      int i[NUM * 2];
+    } src1, src2;
+  int i, sign = 1;
+  long long value;
+
+  /* The sources hold NUM * 2 ints and the check below reads index i * 2 for
+     every i < NUM, so the fill has to run over the whole array; stopping at
+     NUM would leave the upper half of the operands uninitialized.  The odd
+     elements do not take part in the product; they get unrelated values so
+     that no indeterminate memory is loaded into the vectors.  */
+  for (i = 0; i < NUM * 2; i += 2)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      src1.i[i + 1] = ~i;
+      src2.i[i + 1] = i + 1;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
+      if (value != dst.ll[i])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Number of 32-bit elements; processed four at a time.  */
+#define NUM 64
+
+/* Check _mm_mullo_epi32 against a scalar int multiplication.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } out, a, b;
+  int k;
+  int expect;
+
+  /* Even elements get non-negative values, odd elements negative ones.  */
+  for (k = 0; k < NUM; k++)
+    {
+      int sign = (k & 1) ? -1 : 1;
+
+      a.i[k] = k * k * sign;
+      b.i[k] = (k + 20) * sign;
+    }
+
+  for (k = 0; k < NUM / 4; k++)
+    out.x[k] = _mm_mullo_epi32 (a.x[k], b.x[k]);
+
+  for (k = 0; k < NUM; k++)
+    {
+      expect = a.i[k] * b.i[k];
+      if (expect != out.i[k])
+        abort ();
+    }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Scalar model used to check _mm_testz_si128: return 1 when M & V has no
+   bit set in any of its 16 bytes, otherwise 0.  */
+static int
+make_ptestz (__m128i m, __m128i v)
+{
+  union
+    {
+      __m128i x;
+      unsigned char c[16];
+    } lhs, rhs;
+  int k;
+
+  lhs.x = m;
+  rhs.x = v;
+
+  /* The first byte with a common bit settles the answer.  */
+  for (k = 0; k < 16; k++)
+    if ((lhs.c[k] & rhs.c[k]) != 0)
+      return 0;
+
+  return 1;
+}
+
+/* Scalar model used to check _mm_testc_si128: return 1 when V & ~M has no
+   bit set in any of its 16 bytes, otherwise 0.  */
+static int
+make_ptestc (__m128i m, __m128i v)
+{
+  union
+    {
+      __m128i x;
+      unsigned char c[16];
+    } lhs, rhs;
+  int k;
+
+  lhs.x = m;
+  rhs.x = v;
+
+  /* The first byte of V with a bit outside M settles the answer.  */
+  for (k = 0; k < 16; k++)
+    if ((rhs.c[k] & ~lhs.c[k]) != 0)
+      return 0;
+
+  return 1;
+}
+
+/* Run _mm_testz_si128 and _mm_testc_si128 on all 16 ordered pairs drawn
+   from four bit patterns and compare each result with the scalar models
+   make_ptestz and make_ptestc.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+    } pat[4];
+  int got[32];
+  int a, b, k, slot;
+
+  /* Patterns 0 and 1 are complementary halves of 0x11111111 words.  */
+  pat[0].i[0] = 0x11111111;
+  pat[0].i[1] = 0x00000000;
+  pat[0].i[2] = 0x00000000;
+  pat[0].i[3] = 0x11111111;
+
+  pat[1].i[0] = 0x00000000;
+  pat[1].i[1] = 0x11111111;
+  pat[1].i[2] = 0x11111111;
+  pat[1].i[3] = 0x00000000;
+
+  /* Pattern 2 is all zeros, pattern 3 all ones.  */
+  for (k = 0; k < 4; k++)
+    {
+      pat[2].i[k] = 0;
+      pat[3].i[k] = 0xffffffff;
+    }
+
+  /* Pair (a, b) occupies two consecutive slots: testz first, then testc.  */
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        slot = 2 * (a * 4 + b);
+        got[slot] = _mm_testz_si128 (pat[b].x, pat[a].x);
+        got[slot + 1] = _mm_testc_si128 (pat[b].x, pat[a].x);
+      }
+
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        slot = 2 * (a * 4 + b);
+        if (got[slot] != make_ptestz (pat[b].x, pat[a].x))
+          abort ();
+        if (got[slot + 1] != make_ptestc (pat[b].x, pat[a].x))
+          abort ();
+      }
+
+  /* Repeat one pair outside the loops with constant indices.  */
+  if (got[2] != _mm_testz_si128 (pat[1].x, pat[0].x))
+    abort ();
+
+  if (got[3] != _mm_testc_si128 (pat[1].x, pat[0].x))
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Scalar model used to check _mm_testnzc_si128: return 1 when both M & V
+   and ~M & V have at least one bit set somewhere in the 16 bytes,
+   otherwise 0.  */
+static int
+make_ptestnzc (__m128i m, __m128i v)
+{
+  union
+    {
+      __m128i x;
+      unsigned char c[16];
+    } lhs, rhs;
+  int k;
+  int common = 0;
+  int outside = 0;
+
+  lhs.x = m;
+  rhs.x = v;
+
+  for (k = 0; k < 16; k++)
+    {
+      if ((lhs.c[k] & rhs.c[k]) != 0)
+        common = 1;
+      if ((~lhs.c[k] & rhs.c[k]) != 0)
+        outside = 1;
+    }
+
+  if (common && outside)
+    return 1;
+  return 0;
+}
+
+/* Run _mm_testnzc_si128 on all 16 ordered pairs drawn from four bit
+   patterns (each pair is evaluated twice) and compare each result with the
+   scalar model make_ptestnzc.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+    } pat[4];
+  int got[32];
+  int a, b, k, slot;
+
+  /* Patterns 0 and 1 are complementary halves of 0x11111111 words.  */
+  pat[0].i[0] = 0x11111111;
+  pat[0].i[1] = 0x00000000;
+  pat[0].i[2] = 0x00000000;
+  pat[0].i[3] = 0x11111111;
+
+  pat[1].i[0] = 0x00000000;
+  pat[1].i[1] = 0x11111111;
+  pat[1].i[2] = 0x11111111;
+  pat[1].i[3] = 0x00000000;
+
+  /* Pattern 2 is all zeros, pattern 3 all ones.  */
+  for (k = 0; k < 4; k++)
+    {
+      pat[2].i[k] = 0;
+      pat[3].i[k] = 0xffffffff;
+    }
+
+  /* Pair (a, b) occupies two consecutive slots holding the same test.  */
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        slot = 2 * (a * 4 + b);
+        got[slot] = _mm_testnzc_si128 (pat[b].x, pat[a].x);
+        got[slot + 1] = _mm_testnzc_si128 (pat[b].x, pat[a].x);
+      }
+
+  for (a = 0; a < 4; a++)
+    for (b = 0; b < 4; b++)
+      {
+        slot = 2 * (a * 4 + b);
+        if (got[slot] != make_ptestnzc (pat[b].x, pat[a].x))
+          abort ();
+        if (got[slot + 1] != make_ptestnzc (pat[b].x, pat[a].x))
+          abort ();
+      }
+
+  /* Repeat one pair outside the loops with constant indices.  */
+  if (got[2] != _mm_testnzc_si128 (pat[1].x, pat[0].x))
+    abort ();
+
+  if (got[3] != _mm_testnzc_si128 (pat[1].x, pat[0].x))
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+
+/* Check _mm_test_all_zeros, _mm_test_all_ones and _mm_test_mix_ones_zeros
+   on four bit patterns against hand-written expected results.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      __m128i x;
+      unsigned int i[4];
+    } pat[4];
+  /* Expected results, indexed like PAT: two mixed patterns, all zeros,
+     all ones.  */
+  static const int want_zeros[4] = { 0, 0, 1, 0 };
+  static const int want_ones[4] = { 0, 0, 0, 1 };
+  static const int want_mixed[4] = { 1, 1, 0, 0 };
+  int got_zeros[4];
+  int got_ones[4];
+  int got_mixed[4];
+  int k;
+  __m128i all_set;
+
+  pat[0].i[0] = 0x11111111;
+  pat[0].i[1] = 0x00000000;
+  pat[0].i[2] = 0x00000000;
+  pat[0].i[3] = 0x11111111;
+
+  pat[1].i[0] = 0x00000000;
+  pat[1].i[1] = 0x11111111;
+  pat[1].i[2] = 0x11111111;
+  pat[1].i[3] = 0x00000000;
+
+  for (k = 0; k < 4; k++)
+    {
+      pat[2].i[k] = 0;
+      pat[3].i[k] = 0xffffffff;
+    }
+
+  /* Comparing a vector with itself for equality yields all-ones.  */
+  all_set = _mm_cmpeq_epi32 (pat[0].x, pat[0].x);
+
+  for (k = 0; k < 4; k++)
+    {
+      got_zeros[k] = _mm_test_all_zeros (pat[k].x, pat[k].x);
+      got_ones[k] = _mm_test_all_ones (pat[k].x);
+      got_mixed[k] = _mm_test_mix_ones_zeros (pat[k].x, all_set);
+    }
+
+  for (k = 0; k < 4; k++)
+    {
+      if (got_zeros[k] != want_zeros[k])
+        abort ();
+      if (got_ones[k] != want_ones[k])
+        abort ();
+      if (got_mixed[k] != want_mixed[k])
+        abort ();
+    }
+}
--- /dev/null
+#include <smmintrin.h>
+#include <math.h>
+
+/* Number of FP_T elements in the test arrays.  */
+#define NUM 64
+
+/* Fill SRC[0 .. NUM-1] with values of alternating sign derived from two
+   rand () seeds.  The scale factor is multiplied up on every sixth element
+   of the first half, reseeded at the midpoint, and replaced by a reciprocal
+   on every sixth element of the second half.  */
+static void
+init_round (FP_T *src)
+{
+  int k, sign;
+  FP_T scale = rand ();
+
+  for (k = 0, sign = 1; k < NUM; k++, sign = -sign)
+    {
+      src[k] = (k + 1)* scale * M_PI * sign;
+
+      if (k == (NUM / 2))
+        scale = rand ();
+      else if ((k % 6) == 0)
+        {
+          if (k < (NUM / 2))
+            scale = scale * src[k];
+          else
+            scale = 1 / (scale * (k + 1) * src[k] * M_PI *sign);
+        }
+    }
+}
+/* Reference rounding for the checks in sse4_1_test: round F to an integral
+   value with the x87 FRNDINT instruction and return it.  When bit 2 of TYPE
+   is set the x87 control word is left as it is; otherwise the low two bits
+   of TYPE are written into control-word bits 10-11 (the x87 rounding
+   control field) and bits 0-5 (the x87 exception masks) are set.  The
+   control word that was active on entry is reloaded before returning.
+   NOTE(review): the value travels between the asm statements on the x87
+   stack without operands or clobbers describing that, so this presumably
+   relies on the statements being emitted in source order -- confirm before
+   reordering anything here.  */
+static FP_T
+do_round (FP_T f, int type)
+{
+ short saved_cw, new_cw, clr_mask;
+ FP_T ret;
+
+ if ((type & 4))
+ {
+ type = 0; /* Nothing is OR-ed into the control word ...  */
+ clr_mask = 0xFFFF; /* ... and no bit of it is cleared.  */
+ }
+ else
+ {
+ type = 0x003F | ((type & 3) << 10); /* Bits 0-5 set, mode in bits 10-11.  */
+ clr_mask = ~0x0C3F; /* Clear exactly those bits first.  */
+ }
+
+ __asm__ ("fld" ASM_SUFFIX " %0" : : "m" (*&f)); /* Push F.  */
+
+ __asm__ ("fstcw %0" : "=m" (*&saved_cw)); /* Remember the control word.  */
+ new_cw = saved_cw & clr_mask;
+ new_cw |= type;
+ __asm__ ("fldcw %0" : : "m" (*&new_cw)); /* Install the test's mode.  */
+
+ __asm__ ("frndint\n"
+ "fstp" ASM_SUFFIX " %0\n" : "=m" (*&ret)); /* Round, then pop into RET.  */
+ __asm__ ("fldcw %0" : : "m" (*&saved_cw)); /* Restore the saved word.  */
+ return ret;
+}
+
+/* Shared body of the rounding tests.  The including file supplies VEC_T,
+   FP_T, ROUND_INTRIN, ROUND_MODE, CHECK_ROUND_MODE, LOOP_INCREMENT and
+   CHECK_LOOP_INCREMENT.  Every vector of the input is rounded with
+   ROUND_INTRIN, every CHECK_LOOP_INCREMENT-th element is compared with
+   do_round, and finally the numeric values of the _MM_FROUND_* macros are
+   verified.  */
+static void
+sse4_1_test (void)
+{
+  union
+    {
+      VEC_T x[NUM / LOOP_INCREMENT];
+      FP_T f[NUM];
+    } out, in;
+  FP_T expect;
+  int k;
+
+  init_round (in.f);
+
+  k = 0;
+  while (k < NUM / LOOP_INCREMENT)
+    {
+      out.x[k] = ROUND_INTRIN (in.x[k], ROUND_MODE);
+      k++;
+    }
+
+  for (k = 0; k < NUM; k += CHECK_LOOP_INCREMENT)
+    {
+      expect = do_round (in.f[k], CHECK_ROUND_MODE);
+      if (expect != out.f[k])
+        abort ();
+    }
+
+  /* All of the rounding-mode macros must have their documented values.  */
+  if (!(_MM_FROUND_TO_NEAREST_INT == 0x00
+        && _MM_FROUND_TO_NEG_INF == 0x01
+        && _MM_FROUND_TO_POS_INF == 0x02
+        && _MM_FROUND_TO_ZERO == 0x03
+        && _MM_FROUND_CUR_DIRECTION == 0x04
+        && _MM_FROUND_RAISE_EXC == 0x00
+        && _MM_FROUND_NO_EXC == 0x08
+        && _MM_FROUND_NINT == 0x00
+        && _MM_FROUND_FLOOR == 0x01
+        && _MM_FROUND_CEIL == 0x02
+        && _MM_FROUND_TRUNC == 0x03
+        && _MM_FROUND_RINT == 0x04
+        && _MM_FROUND_NEARBYINT == 0x0C))
+    abort ();
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_ceil_pd
+   on packed doubles against the x87 reference in mode 0x02 (the value the
+   shared body requires of _MM_FROUND_TO_POS_INF).  */
+#define VEC_T __m128d
+#define FP_T double
+#define ASM_SUFFIX "l" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_pd(x) /* MODE is not used.  */
+#define ROUND_MODE _MM_FROUND_CEIL
+#define CHECK_ROUND_MODE 0x02 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 2 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 1 /* Every element is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_round_pd
+   with _MM_FROUND_NINT on packed doubles against the x87 reference in mode
+   0x00 (the value the shared body requires of _MM_FROUND_TO_NEAREST_INT).  */
+#define VEC_T __m128d
+#define FP_T double
+#define ASM_SUFFIX "l" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN _mm_round_pd /* Called as (vector, ROUND_MODE).  */
+#define ROUND_MODE _MM_FROUND_NINT
+#define CHECK_ROUND_MODE 0x00 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 2 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 1 /* Every element is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_floor_pd
+   on packed doubles against the x87 reference in mode 0x01 (the value the
+   shared body requires of _MM_FROUND_TO_NEG_INF).  */
+#define VEC_T __m128d
+#define FP_T double
+#define ASM_SUFFIX "l" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN(x, mode) _mm_floor_pd(x) /* MODE is not used.  */
+#define ROUND_MODE _MM_FROUND_FLOOR
+#define CHECK_ROUND_MODE 0x01 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 2 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 1 /* Every element is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_ceil_ps
+   on packed floats against the x87 reference in mode 0x02 (the value the
+   shared body requires of _MM_FROUND_TO_POS_INF).  */
+#define VEC_T __m128
+#define FP_T float
+#define ASM_SUFFIX "s" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_ps(x) /* MODE is not used.  */
+#define ROUND_MODE _MM_FROUND_CEIL
+#define CHECK_ROUND_MODE 0x02 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 4 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 1 /* Every element is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_round_ps
+   with _MM_FROUND_NINT on packed floats against the x87 reference in mode
+   0x00 (the value the shared body requires of _MM_FROUND_TO_NEAREST_INT).  */
+#define VEC_T __m128
+#define FP_T float
+#define ASM_SUFFIX "s" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN _mm_round_ps /* Called as (vector, ROUND_MODE).  */
+#define ROUND_MODE _MM_FROUND_NINT
+#define CHECK_ROUND_MODE 0x00 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 4 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 1 /* Every element is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_floor_ps
+   on packed floats against the x87 reference in mode 0x01 (the value the
+   shared body requires of _MM_FROUND_TO_NEG_INF).  */
+#define VEC_T __m128
+#define FP_T float
+#define ASM_SUFFIX "s" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN(x, mode) _mm_floor_ps(x) /* MODE is not used.  */
+#define ROUND_MODE _MM_FROUND_FLOOR
+#define CHECK_ROUND_MODE 0x01 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 4 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 1 /* Every element is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_ceil_sd
+   (same vector passed as both operands) against the x87 reference in mode
+   0x02 (the value the shared body requires of _MM_FROUND_TO_POS_INF).  */
+#define VEC_T __m128d
+#define FP_T double
+#define ASM_SUFFIX "l" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN(x, mode) _mm_ceil_sd(x, x) /* MODE is not used.  */
+#define ROUND_MODE _MM_FROUND_CEIL
+#define CHECK_ROUND_MODE 0x02 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 2 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 2 /* Only element 0 of each vector is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+/* Parameters for the shared test body in sse4_1-round.h: check _mm_round_sd
+   (same vector passed as both operands) with _MM_FROUND_NINT against the
+   x87 reference in mode 0x00 (the value the shared body requires of
+   _MM_FROUND_TO_NEAREST_INT).  */
+#define VEC_T __m128d
+#define FP_T double
+#define ASM_SUFFIX "l" /* Suffix appended to fld/fstp in do_round.  */
+
+#define ROUND_INTRIN(x, mode) _mm_round_sd(x, x, mode)
+#define ROUND_MODE _MM_FROUND_NINT
+#define CHECK_ROUND_MODE 0x00 /* Mode handed to do_round.  */
+
+#define LOOP_INCREMENT 2 /* FP_T elements per VEC_T.  */
+#define CHECK_LOOP_INCREMENT 2 /* Only element 0 of each vector is checked.  */
+
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128d /* Vector type handed to _mm_floor_sd.  */
+#define FP_T double /* Scalar element type of VEC_T.  */
+#define ASM_SUFFIX "l" /* NOTE(review): presumably the x87 mnemonic suffix for FP_T; confirm in sse4_1-round.h.  */
+/* Intrinsic under test.  X is passed as both operands of _mm_floor_sd
+   and the MODE argument of ROUND_INTRIN is not used.  NOTE(review):
+   ROUND_MODE and CHECK_ROUND_MODE are presumably the floor mode as seen
+   by the intrinsic and by the reference computation in sse4_1-round.h;
+   confirm there.  */
+#define ROUND_INTRIN(x, mode) _mm_floor_sd(x, x)
+#define ROUND_MODE _MM_FROUND_FLOOR
+#define CHECK_ROUND_MODE 0x01
+/* NOTE(review): presumably the element strides of the rounding loop and
+   of the result check in sse4_1-round.h; confirm there.  */
+#define LOOP_INCREMENT 2
+#define CHECK_LOOP_INCREMENT 2
+/* NOTE(review): sse4_1-round.h is not shown here; it presumably supplies
+   the test body driven by the macros above.  */
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <math.h>
+#include <string.h>
+
+#define NUM 64
+/* Fill SRC[0] .. SRC[NUM - 1] with test inputs of alternating sign.
+   Element I is (I + 1) * d * M_PI * sign, where sign starts at +1 and
+   is negated after every element, and d is a scale factor that starts
+   as a rand () value and is updated as described in the loop.
+   NOTE(review): the changing scale is presumably meant to exercise
+   both large and small magnitudes; confirm.  */
+static void
+init_round (double *src)
+{
+ int i, sign = 1;
+ double d = rand ();
+ /* rand () is not seeded in this function.  */
+ for (i = 0; i < NUM; i++)
+ {
+ src[i] = (i + 1)* d * M_PI * sign;
+ if (i < (NUM / 2)) /* First half: on every 6th index, scale d by the element just stored.  */
+ {
+ if ((i % 6) == 0)
+ d = d * src[i];
+ }
+ else if (i == (NUM / 2)) /* Midpoint: restart d from a fresh rand () value.  */
+ d = rand ();
+ else if ((i % 6) == 0) /* Second half: on every 6th index, replace d by a reciprocal built from the element just stored.  */
+ d = 1 / (d * (i + 1) * src[i] * M_PI *sign);
+ sign = -sign;
+ }
+}
+/* Reference rounding: round F to an integral value with the x87
+   frndint instruction and return the result.  TYPE selects the
+   behaviour.  If bit 2 (value 4) of TYPE is set, the FPU control word
+   is reloaded unchanged, so the rounding mode already in effect is
+   used.  Otherwise the low two bits of TYPE are written into
+   control-word bits 10-11 (the x87 rounding-control field) and bits
+   0-5 (the x87 exception masks) are set.  The control word saved on
+   entry is reloaded before returning.  */
+static double
+do_round (double f, int type)
+{
+ short saved_cw, new_cw, clr_mask;
+ double ret;
+ /* From here on TYPE holds the bits to OR into the control word and
+ CLR_MASK the bits of the saved control word to keep.  */
+ if ((type & 4))
+ {
+ type = 0; /* OR in nothing ...  */
+ clr_mask = 0xFFFF; /* ... and keep every bit.  */
+ }
+ else
+ {
+ type = 0x003F | ((type & 3) << 10);
+ clr_mask = ~0x0C3F; /* Clear exactly the fields that TYPE now supplies.  */
+ }
+ /* Push F onto the x87 register stack; frndint below operates on it.  */
+ __asm__ ("fldl %0" : : "m" (*&f));
+ /* Save the current control word, then install the modified copy.  */
+ __asm__ ("fstcw %0" : "=m" (*&saved_cw));
+ new_cw = saved_cw & clr_mask;
+ new_cw |= type;
+ __asm__ ("fldcw %0" : : "m" (*&new_cw));
+ /* Round the stack top, pop it into RET, then restore the saved
+ control word.  NOTE(review): the x87 stack contents are carried
+ between separate asm statements without any operand expressing that
+ dependency, so the statement order here must be preserved.  */
+ __asm__ ("frndint\n"
+ "fstpl %0\n" : "=m" (*&ret));
+ __asm__ ("fldcw %0" : : "m" (*&saved_cw));
+ return ret;
+}
+/* Check _mm_round_sd with _MM_FROUND_TRUNC against the x87 reference
+   in do_round, calling abort () on the first mismatch.
+   NOTE(review): this static function is not called in the visible
+   code; it is presumably invoked by the harness in sse4_1-check.h;
+   confirm there.  */
+static void
+sse4_1_test (void)
+{
+ int i;
+ double f;
+ union /* The same storage viewed as vectors (x) and as scalars (d).  */
+ {
+ __m128d x[NUM / 2];
+ double d[NUM];
+ } dst, src;
+ /* Generate the inputs and start from an all-zero destination.  */
+ init_round (src.d);
+ memset (&dst, 0, NUM * sizeof(double));
+ /* Round one vector at a time; the zeroed DST vector is the first operand.  */
+ for (i = 0; i < NUM / 2 ; i++)
+ dst.x[i] = _mm_round_sd (dst.x[i], src.x[i], _MM_FROUND_TRUNC);
+ /* Walk the results one vector (two doubles) per iteration.  */
+ for (i = 0; i < NUM; i += 2)
+ {
+ if (dst.d[i + 1] != 0.0) /* The upper element must still be the zero taken from DST.  */
+ abort ();
+ /* 0x03 has bit 2 clear, so do_round puts 3 into the control word's rounding-control bits.  */
+ f = do_round (src.d[i], 0x03);
+ if (f != dst.d[i]) /* The low element must equal the reference result exactly.  */
+ abort ();
+ }
+}
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128 /* Vector type handed to _mm_ceil_ss.  */
+#define FP_T float /* Scalar element type of VEC_T.  */
+#define ASM_SUFFIX "s" /* NOTE(review): presumably the x87 mnemonic suffix for FP_T; confirm in sse4_1-round.h.  */
+/* Intrinsic under test.  X is passed as both operands of _mm_ceil_ss
+   and the MODE argument of ROUND_INTRIN is not used.  NOTE(review):
+   ROUND_MODE and CHECK_ROUND_MODE are presumably the ceiling mode as
+   seen by the intrinsic and by the reference computation in
+   sse4_1-round.h; confirm there.  */
+#define ROUND_INTRIN(x, mode) _mm_ceil_ss(x, x)
+#define ROUND_MODE _MM_FROUND_CEIL
+#define CHECK_ROUND_MODE 0x02
+/* NOTE(review): presumably the element strides of the rounding loop and
+   of the result check in sse4_1-round.h; confirm there.  */
+#define LOOP_INCREMENT 4
+#define CHECK_LOOP_INCREMENT 4
+/* NOTE(review): sse4_1-round.h is not shown here; it presumably supplies
+   the test body driven by the macros above.  */
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128 /* Vector type handed to _mm_round_ss.  */
+#define FP_T float /* Scalar element type of VEC_T.  */
+#define ASM_SUFFIX "s" /* NOTE(review): presumably the x87 mnemonic suffix for FP_T; confirm in sse4_1-round.h.  */
+/* Intrinsic under test.  X is passed as both vector operands of
+   _mm_round_ss and MODE is forwarded as its rounding argument.
+   NOTE(review): CHECK_ROUND_MODE is presumably the matching mode for
+   the reference computation in sse4_1-round.h; confirm there.  */
+#define ROUND_INTRIN(x, mode) _mm_round_ss(x, x, mode)
+#define ROUND_MODE _MM_FROUND_NINT
+#define CHECK_ROUND_MODE 0x00
+/* NOTE(review): presumably the element strides of the rounding loop and
+   of the result check in sse4_1-round.h; confirm there.  */
+#define LOOP_INCREMENT 4
+#define CHECK_LOOP_INCREMENT 4
+/* NOTE(review): sse4_1-round.h is not shown here; it presumably supplies
+   the test body driven by the macros above.  */
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#define VEC_T __m128 /* Vector type handed to _mm_floor_ss.  */
+#define FP_T float /* Scalar element type of VEC_T.  */
+#define ASM_SUFFIX "s" /* NOTE(review): presumably the x87 mnemonic suffix for FP_T; confirm in sse4_1-round.h.  */
+/* Intrinsic under test.  X is passed as both operands of _mm_floor_ss
+   and the MODE argument of ROUND_INTRIN is not used.  NOTE(review):
+   ROUND_MODE and CHECK_ROUND_MODE are presumably the floor mode as seen
+   by the intrinsic and by the reference computation in sse4_1-round.h;
+   confirm there.  */
+#define ROUND_INTRIN(x, mode) _mm_floor_ss(x, x)
+#define ROUND_MODE _MM_FROUND_FLOOR
+#define CHECK_ROUND_MODE 0x01
+/* NOTE(review): presumably the element strides of the rounding loop and
+   of the result check in sse4_1-round.h; confirm there.  */
+#define LOOP_INCREMENT 4
+#define CHECK_LOOP_INCREMENT 4
+/* NOTE(review): sse4_1-round.h is not shown here; it presumably supplies
+   the test body driven by the macros above.  */
+#include "sse4_1-round.h"
--- /dev/null
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <math.h>
+#include <string.h>
+
+#define NUM 64
+/* Fill SRC[0] .. SRC[NUM - 1] with test inputs of alternating sign.
+   Element I is (I + 1) * f * M_PI * sign, where sign starts at +1 and
+   is negated after every element, and f is a scale factor that starts
+   as a rand () value and is updated as described in the loop.
+   NOTE(review): the changing scale is presumably meant to exercise
+   both large and small magnitudes; confirm.  */
+static void
+init_round (float *src)
+{
+ int i, sign = 1;
+ float f = rand ();
+ /* rand () is not seeded in this function.  */
+ for (i = 0; i < NUM; i++)
+ {
+ src[i] = (i + 1)* f * M_PI * sign; /* M_PI is a double constant, so the product is narrowed to float when stored.  */
+ if (i < (NUM / 2)) /* First half: on every 6th index, scale f by the element just stored.  */
+ {
+ if ((i % 6) == 0)
+ f = f * src[i];
+ }
+ else if (i == (NUM / 2)) /* Midpoint: restart f from a fresh rand () value.  */
+ f = rand ();
+ else if ((i % 6) == 0) /* Second half: on every 6th index, replace f by a reciprocal built from the element just stored.  */
+ f = 1 / (f * (i + 1) * src[i] * M_PI *sign);
+ sign = -sign;
+ }
+}
+/* Reference rounding: round F to an integral value with the x87
+   frndint instruction and return the result.  TYPE selects the
+   behaviour.  If bit 2 (value 4) of TYPE is set, the FPU control word
+   is reloaded unchanged, so the rounding mode already in effect is
+   used.  Otherwise the low two bits of TYPE are written into
+   control-word bits 10-11 (the x87 rounding-control field) and bits
+   0-5 (the x87 exception masks) are set.  The control word saved on
+   entry is reloaded before returning.  */
+static float
+do_round (float f, int type)
+{
+ short saved_cw, new_cw, clr_mask;
+ float ret;
+ /* From here on TYPE holds the bits to OR into the control word and
+ CLR_MASK the bits of the saved control word to keep.  */
+ if ((type & 4))
+ {
+ type = 0; /* OR in nothing ...  */
+ clr_mask = 0xFFFF; /* ... and keep every bit.  */
+ }
+ else
+ {
+ type = 0x003F | ((type & 3) << 10);
+ clr_mask = ~0x0C3F; /* Clear exactly the fields that TYPE now supplies.  */
+ }
+ /* Push F onto the x87 register stack; frndint below operates on it.  */
+ __asm__ ("flds %0" : : "m" (*&f));
+ /* Save the current control word, then install the modified copy.  */
+ __asm__ ("fstcw %0" : "=m" (*&saved_cw));
+ new_cw = saved_cw & clr_mask;
+ new_cw |= type;
+ __asm__ ("fldcw %0" : : "m" (*&new_cw));
+ /* Round the stack top, pop it into RET, then restore the saved
+ control word.  NOTE(review): the x87 stack contents are carried
+ between separate asm statements without any operand expressing that
+ dependency, so the statement order here must be preserved.  */
+ __asm__ ("frndint\n"
+ "fstps %0\n" : "=m" (*&ret));
+ __asm__ ("fldcw %0" : : "m" (*&saved_cw));
+ return ret;
+}
+/* Check _mm_round_ss against the x87 reference in do_round, first with
+   _MM_FROUND_RINT and then with _MM_FROUND_NEARBYINT, calling abort ()
+   on the first mismatch.  NOTE(review): this static function is not
+   called in the visible code; it is presumably invoked by the harness
+   in sse4_1-check.h; confirm there.  */
+static void
+sse4_1_test (void)
+{
+ int i, j;
+ float f;
+ union /* The same storage viewed as vectors (x) and as scalars (f).  */
+ {
+ __m128 x[NUM / 4];
+ float f[NUM];
+ } dst, src;
+ /* Generate the inputs and start from an all-zero destination.  */
+ init_round (src.f);
+ memset (&dst, 0, NUM * sizeof(float));
+ /* First pass: _MM_FROUND_RINT, with the zeroed DST vector as first operand.  */
+ for (i = 0; i < NUM / 4 ; i++)
+ dst.x[i] = _mm_round_ss (dst.x[i], src.x[i], _MM_FROUND_RINT);
+ /* Walk the results one vector (four floats) per iteration.  */
+ for (i = 0; i < NUM; i += 4)
+ {
+ for (j = 0; j < 3; j++) /* The three upper elements must still be the zeros taken from DST.  */
+ if (dst.f[i + j + 1] != 0.0)
+ abort ();
+ /* 0x04 has bit 2 set, so do_round leaves the control word unchanged.  */
+ f = do_round (src.f[i], 0x04);
+ if (f != dst.f[i]) /* The low element must equal the reference result exactly.  */
+ abort ();
+ }
+ /* Second pass: _MM_FROUND_NEARBYINT; DST now holds the first pass's results as first operand.  */
+ for (i = 0; i < NUM / 4 ; i++)
+ dst.x[i] = _mm_round_ss (dst.x[i], src.x[i], _MM_FROUND_NEARBYINT);
+ /* Same checks as after the first pass.  */
+ for (i = 0; i < NUM; i += 4)
+ {
+ for (j = 0; j < 3; j++)
+ if (dst.f[i + j + 1] != 0.0)
+ abort ();
+ /* 0x0c also has bit 2 set, so do_round again leaves the control word unchanged.  */
+ f = do_round (src.f[i], 0x0c);
+ if (f != dst.f[i])
+ abort ();
+ }
+}