static rtx inline_expand_builtin_bytecmp (tree, rtx);
static rtx expand_builtin_strcmp (tree, rtx);
static rtx expand_builtin_strncmp (tree, rtx, machine_mode);
-static rtx builtin_memcpy_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
static rtx expand_builtin_memchr (tree, rtx);
static rtx expand_builtin_memcpy (tree, rtx);
static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
static rtx expand_builtin_stpncpy (tree, rtx);
static rtx expand_builtin_strncat (tree, rtx);
static rtx expand_builtin_strncpy (tree, rtx);
-static rtx builtin_memset_gen_str (void *, HOST_WIDE_INT, scalar_int_mode);
static rtx expand_builtin_memset (tree, rtx, machine_mode);
static rtx expand_builtin_memset_args (tree, tree, tree, rtx, machine_mode, tree);
static rtx expand_builtin_bzero (tree);
a target constant. */
static rtx
-builtin_memcpy_read_str (void *data, HOST_WIDE_INT offset,
+builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
scalar_int_mode mode)
{
/* The REPresentation pointed to by DATA need not be a nul-terminated
constant. */
rtx
-builtin_strncpy_read_str (void *data, HOST_WIDE_INT offset,
+builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
scalar_int_mode mode)
{
const char *str = (const char *) data;
/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
bytes from constant string DATA + OFFSET and return it as target
- constant. */
+ constant. If PREV isn't nullptr, it has the RTL info from the
+ previous iteration. */
rtx
-builtin_memset_read_str (void *data, HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+builtin_memset_read_str (void *data, void *prevp,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
{
+ by_pieces_prev *prev = (by_pieces_prev *) prevp;
+ if (prev != nullptr && prev->data != nullptr)
+ {
+ /* Use the previous data in the same mode. */
+ if (prev->mode == mode)
+ return prev->data;
+ }
+
const char *c = (const char *) data;
char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
char value given in the RTL register data. For example, if mode is
- 4 bytes wide, return the RTL for 0x01010101*data. */
+ 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
+ nullptr, it has the RTL info from the previous iteration. */
static rtx
-builtin_memset_gen_str (void *data, HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+builtin_memset_gen_str (void *data, void *prevp,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
{
rtx target, coeff;
size_t size;
char *p;
+ by_pieces_prev *prev = (by_pieces_prev *) prevp;
+ if (prev != nullptr && prev->data != nullptr)
+ {
+ /* Use the previous data in the same mode. */
+ if (prev->mode == mode)
+ return prev->data;
+
+ return simplify_gen_subreg (mode, prev->data, prev->mode, 0);
+ }
+
size = GET_MODE_SIZE (mode);
if (size == 1)
return (rtx) data;
extern tree mathfn_built_in (tree, enum built_in_function fn);
extern tree mathfn_built_in (tree, combined_fn);
extern tree mathfn_built_in_type (combined_fn);
-extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
-extern rtx builtin_memset_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
+extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
+ scalar_int_mode);
+extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
+ scalar_int_mode);
extern rtx expand_builtin_saveregs (void);
extern tree std_build_builtin_va_list (void);
extern tree std_fn_abi_va_list (tree);
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost
+#undef TARGET_OVERLAP_OP_BY_PIECES_P
+#define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
+
#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM FLAGS_REG
#undef TARGET_FIXED_CONDITION_CODE_REGS
move would be greater than that of a library call.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_OVERLAP_OP_BY_PIECES_P (void)
+This target hook should return true if when the @code{by_pieces}
+infrastructure is used, an offset adjusted unaligned memory operation
+in the smallest integer mode for the last piece operation of a memory
+region can be generated to avoid doing more than one smaller operations.
+@end deftypefn
+
@deftypefn {Target Hook} int TARGET_COMPARE_BY_PIECES_BRANCH_RATIO (machine_mode @var{mode})
When expanding a block comparison in MODE, gcc can try to reduce the
number of branches at the expense of more memory operations. This hook
@hook TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
+@hook TARGET_OVERLAP_OP_BY_PIECES_P
+
@hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO
@defmac MOVE_MAX_PIECES
unsigned int max_size, by_pieces_operation op)
{
unsigned HOST_WIDE_INT n_insns = 0;
+ scalar_int_mode mode;
+
+ if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
+ {
+ /* NB: Round up L and ALIGN to the widest integer mode for
+ MAX_SIZE. */
+ mode = widest_int_mode_for_size (max_size);
+ if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
+ {
+ unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
+ if (up > l)
+ l = up;
+ align = GET_MODE_ALIGNMENT (mode);
+ }
+ }
align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
while (max_size > 1 && l > 0)
{
- scalar_int_mode mode = widest_int_mode_for_size (max_size);
+ mode = widest_int_mode_for_size (max_size);
enum insn_code icode;
unsigned int modesize = GET_MODE_SIZE (mode);
void *m_cfndata;
public:
pieces_addr (rtx, bool, by_pieces_constfn, void *);
- rtx adjust (scalar_int_mode, HOST_WIDE_INT);
+ rtx adjust (scalar_int_mode, HOST_WIDE_INT,
+ by_pieces_prev * = nullptr);
void increment_address (HOST_WIDE_INT);
void maybe_predec (HOST_WIDE_INT);
void maybe_postinc (HOST_WIDE_INT);
but we still modify the MEM's properties. */
rtx
-pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset)
+pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
+ by_pieces_prev *prev)
{
if (m_constfn)
- return m_constfn (m_cfndata, offset, mode);
+ /* Pass the previous data to m_constfn. */
+ return m_constfn (m_cfndata, prev, offset, mode);
if (m_obj == NULL_RTX)
return NULL_RTX;
if (m_auto)
unsigned int m_align;
unsigned int m_max_size;
bool m_reverse;
+ /* True if this is a stack push. */
+ bool m_push;
+ /* True if targetm.overlap_op_by_pieces_p () returns true. */
+ bool m_overlap_op_by_pieces;
/* Virtual functions, overriden by derived classes for the specific
operation. */
public:
op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
- unsigned HOST_WIDE_INT, unsigned int);
+ unsigned HOST_WIDE_INT, unsigned int, bool);
void run ();
};
by_pieces_constfn from_cfn,
void *from_cfn_data,
unsigned HOST_WIDE_INT len,
- unsigned int align)
+ unsigned int align, bool push)
: m_to (to, to_load, NULL, NULL),
m_from (from, from_load, from_cfn, from_cfn_data),
- m_len (len), m_max_size (MOVE_MAX_PIECES + 1)
+ m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+ m_push (push)
{
int toi = m_to.get_addr_inc ();
int fromi = m_from.get_addr_inc ();
align = alignment_for_piecewise_move (MOVE_MAX_PIECES, align);
m_align = align;
+
+ m_overlap_op_by_pieces = targetm.overlap_op_by_pieces_p ();
}
/* This function returns the largest usable integer mode for LEN bytes
scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
mode = get_usable_mode (mode, m_len);
+ by_pieces_prev to_prev = { nullptr, mode };
+ by_pieces_prev from_prev = { nullptr, mode };
+
do
{
unsigned int size = GET_MODE_SIZE (mode);
if (m_reverse)
m_offset -= size;
- to1 = m_to.adjust (mode, m_offset);
- from1 = m_from.adjust (mode, m_offset);
+ to1 = m_to.adjust (mode, m_offset, &to_prev);
+ to_prev.data = to1;
+ to_prev.mode = mode;
+ from1 = m_from.adjust (mode, m_offset, &from_prev);
+ from_prev.data = from1;
+ from_prev.mode = mode;
m_to.maybe_predec (-(HOST_WIDE_INT)size);
m_from.maybe_predec (-(HOST_WIDE_INT)size);
if (m_len == 0)
return;
- /* NB: widest_int_mode_for_size checks SIZE > 1. */
- mode = widest_int_mode_for_size (size);
- mode = get_usable_mode (mode, m_len);
+ if (!m_push && m_overlap_op_by_pieces)
+ {
+ /* NB: Generate overlapping operations if it is not a stack
+ push since stack push must not overlap. Get the smallest
+ integer mode for M_LEN bytes. */
+ mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
+ mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
+ int gap = GET_MODE_SIZE (mode) - m_len;
+ if (gap > 0)
+ {
+ /* If size of MODE > M_LEN, generate the last operation
+ in MODE for the remaining bytes with ovelapping memory
+ from the previois operation. */
+ if (m_reverse)
+ m_offset += gap;
+ else
+ m_offset -= gap;
+ m_len += gap;
+ }
+ }
+ else
+ {
+ /* NB: widest_int_mode_for_size checks SIZE > 1. */
+ mode = widest_int_mode_for_size (size);
+ mode = get_usable_mode (mode, m_len);
+ }
}
while (1);
/* Derived class from op_by_pieces_d, providing support for block move
operations. */
+#ifdef PUSH_ROUNDING
+#define PUSHG_P(to) ((to) == nullptr)
+#else
+#define PUSHG_P(to) false
+#endif
+
class move_by_pieces_d : public op_by_pieces_d
{
insn_gen_fn m_gen_fun;
public:
move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
unsigned int align)
- : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align)
+ : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
+ PUSHG_P (to))
{
}
rtx finish_retmode (memop_ret);
public:
store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
unsigned HOST_WIDE_INT len, unsigned int align)
- : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, align)
+ : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
+ align, false)
{
}
rtx finish_retmode (memop_ret);
int
can_store_by_pieces (unsigned HOST_WIDE_INT len,
- rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode),
+ by_pieces_constfn constfun,
void *constfundata, unsigned int align, bool memsetp)
{
unsigned HOST_WIDE_INT l;
if (reverse)
offset -= size;
- cst = (*constfun) (constfundata, offset, mode);
+ cst = (*constfun) (constfundata, nullptr, offset, mode);
if (!targetm.legitimate_constant_p (mode, cst))
return 0;
rtx
store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
- rtx (*constfun) (void *, HOST_WIDE_INT, scalar_int_mode),
+ by_pieces_constfn constfun,
void *constfundata, unsigned int align, bool memsetp,
memop_ret retmode)
{
Return const0_rtx unconditionally. */
static rtx
-clear_by_pieces_1 (void *, HOST_WIDE_INT, scalar_int_mode)
+clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
{
return const0_rtx;
}
compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
void *op1_cfn_data, HOST_WIDE_INT len, int align,
rtx_code_label *fail_label)
- : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, align)
+ : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
+ align, false)
{
m_fail_label = fail_label;
}
/* Helper function for store_expr storing of STRING_CST. */
static rtx
-string_cst_read_str (void *data, HOST_WIDE_INT offset, scalar_int_mode mode)
+string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
+ scalar_int_mode mode)
{
tree str = (tree) data;
BLOCK_OP_NO_LIBCALL_RET
};
-typedef rtx (*by_pieces_constfn) (void *, HOST_WIDE_INT, scalar_int_mode);
+typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
+ scalar_int_mode);
+
+/* The second pointer passed to by_pieces_constfn. */
+struct by_pieces_prev
+{
+ rtx data;
+ scalar_int_mode mode;
+};
extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
default_use_by_pieces_infrastructure_p)
DEFHOOK
+(overlap_op_by_pieces_p,
+ "This target hook should return true if when the @code{by_pieces}\n\
+infrastructure is used, an offset adjusted unaligned memory operation\n\
+in the smallest integer mode for the last piece operation of a memory\n\
+region can be generated to avoid doing more than one smaller operations.",
+ bool, (void),
+ hook_bool_void_false)
+
+DEFHOOK
(compare_by_pieces_branch_ratio,
"When expanding a block comparison in MODE, gcc can try to reduce the\n\
number of branches at the expense of more memory operations. This hook\n\
--- /dev/null
+class fixed_wide_int_storage {
+public:
+ long val[10];
+ int len;
+ fixed_wide_int_storage ()
+ {
+ len = sizeof (val) / sizeof (val[0]);
+ for (int i = 0; i < len; i++)
+ val[i] = i;
+ }
+};
+
+extern void foo (fixed_wide_int_storage);
+extern int record_increment(void);
--- /dev/null
+// { dg-do compile }
+// { dg-options "-O2" }
+// { dg-additional-options "-mno-avx -msse2 -mtune=skylake" { target { i?86-*-* x86_64-*-* } } }
+
+#include "pr90773-1.h"
+
+int
+record_increment(void)
+{
+ fixed_wide_int_storage x;
+ foo (x);
+ return 0;
+}
--- /dev/null
+// { dg-do compile }
+// { dg-options "-O2" }
+// { dg-additional-options "-mno-avx512f -march=skylake" { target { i?86-*-* x86_64-*-* } } }
+
+#include "pr90773-1a.C"
--- /dev/null
+// { dg-do compile }
+// { dg-options "-O2" }
+// { dg-additional-options "-march=skylake-avx512" { target { i?86-*-* x86_64-*-* } } }
+
+#include "pr90773-1a.C"
--- /dev/null
+// { dg-do run }
+// { dg-options "-O2" }
+// { dg-additional-options "-march=native" { target { i?86-*-* x86_64-*-* } } }
+// { dg-additional-sources "pr90773-1a.C" }
+
+#include "pr90773-1.h"
+
+void
+foo (fixed_wide_int_storage x)
+{
+ for (int i = 0; i < x.len; i++)
+ if (x.val[i] != i)
+ __builtin_abort ();
+}
+
+int main ()
+{
+ return record_increment ();
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 15);
+}
+
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+7\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+11\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+ __builtin_memset (dst, c, 5);
+}
+
+/* { dg-final { scan-assembler-times "movl\[\\t \]+.+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+.+, 4\\(%\[\^,\]+\\)" 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (int c)
+{
+ __builtin_memset (dst, c, 6);
+}
+
+/* { dg-final { scan-assembler-times "movl\[\\t \]+.+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movw\[\\t \]+.+, 4\\(%\[\^,\]+\\)" 1 } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=skylake" } */
+
+void
+foo (char *dst, char *src)
+{
+ __builtin_memcpy (dst, src, 255);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+\[0-9\]*\\(%\[\^,\]+\\)," 16 } } */
+/* { dg-final { scan-assembler-not "mov\[bwlq\]" } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=skylake" } */
+
+void
+foo (char *dst)
+{
+ __builtin_memset (dst, 0, 255);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \[0-9\]*\\(%\[\^,\]+\\)" 16 } } */
+/* { dg-final { scan-assembler-not "mov\[bwlq\]" } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 1, 20);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\\$16843009, 16\\(%\[\^,\]+\\)" 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-additional-options "-mno-avx -msse2" { target { ! ia32 } } } */
+/* { dg-additional-options "-mno-sse" { target ia32 } } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 19);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+15\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+12\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+15\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+/* { dg-additional-options "-mno-avx -msse2" { target { ! ia32 } } } */
+/* { dg-additional-options "-mno-sse" { target ia32 } } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+ __builtin_memcpy (dst, src, 31);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+15\\(%\[\^,\]+\\)," 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+4\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+8\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+12\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+16\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+20\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+24\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]+27\\(%\[\^,\]+\\)," 1 { target ia32 } } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 0, 31);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, 15\\(%\[\^,\]+\\)" 1 } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 0, 21);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]+\\\$0+, 13\\(%\[\^,\]+\\)" 1 } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+
+void
+foo (char *dst, char *src)
+{
+ __builtin_memcpy (dst, src, 255);
+}
+
+/* { dg-final { scan-assembler-times "movdqu\[\\t \]+\[0-9\]*\\(%\[\^,\]+\\)," 16 } } */
+/* { dg-final { scan-assembler-not "mov\[bwlq\]" } } */
--- /dev/null
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=skylake" } */
+
+void
+foo (char *dst)
+{
+ __builtin_memset (dst, 0, 255);
+}
+
+/* { dg-final { scan-assembler-times "movups\[\\t \]+%xmm\[0-9\]+, \[0-9\]*\\(%\[\^,\]+\\)" 16 } } */
+/* { dg-final { scan-assembler-not "mov\[bwlq\]" } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 0, 5);
+}
+
+/* { dg-final { scan-assembler-times "movl\[\\t \]+.+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movb\[\\t \]+.+, 4\\(%\[\^,\]+\\)" 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic" } */
+
+extern char *dst;
+
+void
+foo (void)
+{
+ __builtin_memset (dst, 0, 6);
+}
+
+/* { dg-final { scan-assembler-times "movl\[\\t \]+.+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "movw\[\\t \]+.+, 4\\(%\[\^,\]+\\)" 1 } } */