;; z Constant call address operand.
;; C Integer SSE constant with all bits set operand.
;; F Floating-point SSE constant with all bits set operand.
+;; H Integer SSE constant that is 128/256-bit all ones
+;;    zero-extended to 256/512 bits, or 128-bit all ones
+;;    zero-extended to 512 bits.
;; M x86-64 memory operand.
(define_constraint "Bf"
(and (match_test "TARGET_SSE")
(match_operand 0 "float_vector_all_ones_operand")))
+(define_constraint "BH"
+ "@internal integer constant with last half/quarter bits set operand."
+ (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand")
+ (match_operand 0 "vector_all_ones_zero_extend_quarter_operand")))
+
;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64
;; to prevent LRA from converting the operand to the form '(mem (reg X))'
;; where X is a base register.
XFmode);
}
-/* Return 1 if X is all bits 0 and 2 if X is all bits 1
+/* Return 1 if X is all bits 0, 2 if X is all bits 1
+   and 3 if X is all bits 1 in the low part, zero-extended,
in supported SSE/AVX vector mode. */
int
}
}
+ if (vector_all_ones_zero_extend_half_operand (x, mode)
+ || vector_all_ones_zero_extend_quarter_operand (x, mode))
+ return 3;
+
return 0;
}
gcc_unreachable ();
}
}
+ else if (vector_all_ones_zero_extend_half_operand (x, mode))
+ {
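+      /* Only the low half of the destination is written: vpcmpeqd on
+	 the ymm (%t0) or xmm (%x0) view of the register sets it to all
+	 ones, and the VEX/EVEX encoding zeroes the remaining upper bits.  */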
+ if (GET_MODE_SIZE (mode) == 64)
+ {
+ gcc_assert (TARGET_AVX512F);
+ return "vpcmpeqd \t %t0, %t0, %t0";
+ }
+ else if (GET_MODE_SIZE (mode) == 32)
+ {
+ gcc_assert (TARGET_AVX);
+ return "vpcmpeqd \t %x0, %x0, %x0";
+ }
+ gcc_unreachable ();
+ }
+ else if (vector_all_ones_zero_extend_quarter_operand (x, mode))
+ {
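+      /* Only the low 128 bits (%x0) of the 512-bit destination are
+	 written; the encoding clears bits 128 and above.  */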
+ gcc_assert (TARGET_AVX512F);
+ return "vpcmpeqd \t %x0, %x0, %x0";
+ }
gcc_unreachable ();
}
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
+/* Return true if the operand is a 128/256-bit all-ones vector
+   zero-extended to 256/512 bits.  */
+(define_predicate "vector_all_ones_zero_extend_half_operand"
+ (match_code "const_vector")
+{
+ mode = GET_MODE (op);
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
+ || (GET_MODE_SIZE (mode) != 32
+ && GET_MODE_SIZE (mode) != 64))
+ return false;
+
+ int nelts = CONST_VECTOR_NUNITS (op);
+ for (int i = 0; i != nelts; i++)
+ {
+ rtx elt = CONST_VECTOR_ELT (op, i);
+ if (i < nelts / 2
+ && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
+ return false;
+ if (i >= nelts / 2
+ && elt != CONST0_RTX (GET_MODE_INNER (mode)))
+ return false;
+ }
+ return true;
+})
+
+/* Return true if the operand is a 128-bit all-ones vector
+   zero-extended to 512 bits.  */
+(define_predicate "vector_all_ones_zero_extend_quarter_operand"
+ (match_code "const_vector")
+{
+ mode = GET_MODE (op);
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT
+ || GET_MODE_SIZE (mode) != 64)
+ return false;
+
+ int nelts = CONST_VECTOR_NUNITS (op);
+ for (int i = 0; i != nelts; i++)
+ {
+ rtx elt = CONST_VECTOR_ELT (op, i);
+ if (i < nelts / 4
+ && elt != CONSTM1_RTX (GET_MODE_INNER (mode)))
+ return false;
+ if (i >= nelts / 4
+ && elt != CONST0_RTX (GET_MODE_INNER (mode)))
+ return false;
+ }
+ return true;
+})
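+
+;; For example, _mm512_zextsi128_si512 of an all-ones __m128i folds to a
+;; 512-bit constant whose low 128 bits are all ones and the rest zero,
+;; which vector_all_ones_zero_extend_quarter_operand accepts.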
+
; Return true when OP is operand acceptable for vector memory operand.
; Only AVX can have misaligned memory operand.
(define_predicate "vector_memory_operand"
(define_insn "mov<mode>_internal"
[(set (match_operand:VMOVE 0 "nonimmediate_operand"
- "=v,v ,v ,m")
+ "=v,v ,v,v ,m")
(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
- " C,<sseconstm1>,vm,v"))]
+ " C,<sseconstm1>,BH,vm,v"))]
"TARGET_SSE
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[1], <MODE>mode))
gcc_unreachable ();
}
}
- [(set_attr "type" "sselog1,sselog1,ssemov,ssemov")
+ [(set_attr "type" "sselog1,sselog1,sselog1,ssemov,ssemov")
(set_attr "prefix" "maybe_vex")
(set (attr "mode")
(cond [(match_test "TARGET_AVX")
(and (match_test "<MODE>mode == V2DFmode")
(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
(const_string "V4SF")
- (and (eq_attr "alternative" "3")
+ (and (eq_attr "alternative" "4")
(match_test "TARGET_SSE_TYPELESS_STORES"))
(const_string "V4SF")
(and (eq_attr "alternative" "0")
c[i] = a[i] * b[i+3];
}
-/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/2" } } */
-/* { dg-final { scan-assembler "movv4sf_internal/2" } } */
+/* { dg-final { scan-assembler-not "vmovups\[^\n\r]*movv8sf_internal/3" } } */
+/* { dg-final { scan-assembler "movv4sf_internal/3" } } */
/* { dg-final { scan-assembler "vinsertf128" } } */
d[i] = c[i] * 20.0;
}
-/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/3" } } */
-/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/3" } } */
+/* { dg-final { scan-assembler-not "vmovups.*movv8sf_internal/4" } } */
+/* { dg-final { scan-assembler "vmovups.*movv4sf_internal/4" } } */
/* { dg-final { scan-assembler "vextractf128" } } */
}
}
-/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/3" } } */
-/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/3" } } */
+/* { dg-final { scan-assembler-not "vmovdqu.*movv32qi_internal/4" } } */
+/* { dg-final { scan-assembler "vmovdqu.*movv16qi_internal/4" } } */
/* { dg-final { scan-assembler "vextract.128" } } */
d[i] = c[i] * 20.0;
}
-/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/3" } } */
-/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/3" } } */
+/* { dg-final { scan-assembler-not "vmovupd.*movv4df_internal/4" } } */
+/* { dg-final { scan-assembler "vmovupd.*movv2df_internal/4" } } */
/* { dg-final { scan-assembler "vextractf128" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */
+
+#include <immintrin.h>
+
+__m256i mask()
+{
+ return _mm256_zextsi128_si256(_mm_set1_epi8(-1));
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 1 } } */
+
+#include <immintrin.h>
+
+__m512i mask1()
+{
+ return _mm512_zextsi128_si512(_mm_set1_epi8(-1));
+}
+
+__m512i mask2()
+{
+ return _mm512_zextsi256_si512(_mm256_set1_epi8(-1));
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%ymm\[0-9\]" 4 } } */
+
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo1 ()
+{
+ return __extension__ (__m512i)(__v8di) { -1, -1, -1, -1,
+ 0, 0, 0, 0 };
+}
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo2 ()
+{
+ return __extension__ (__m512i)(__v16si) { -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0 };
+}
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo3 ()
+{
+ return __extension__ (__m512i)(__v32hi) { -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0 };
+}
+
+__m512i
+__attribute__ ((noinline, noclone))
+foo4 ()
+{
+ return __extension__ (__m512i)(__v64qi) { -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0 };
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-times "vpcmpeqd\[ \\t\]+%xmm\[0-9\]" 4 } } */
+
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo1 ()
+{
+ return __extension__ (__m256i)(__v4di) { -1, -1, 0, 0 };
+}
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo2 ()
+{
+ return __extension__ (__m256i)(__v8si) { -1, -1, -1, -1,
+ 0, 0, 0, 0 };
+}
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo3 ()
+{
+ return __extension__ (__m256i)(__v16hi) { -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0 };
+}
+
+__m256i
+__attribute__ ((noinline, noclone))
+foo4 ()
+{
+ return __extension__ (__m256i)(__v32qi) { -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0 };
+}