(define_expand "movstrsi"
[(use (match_operand:BLK 0 "memory_operand" ""))
(use (match_operand:BLK 1 "memory_operand" ""))
- (use (match_operand:SI 2 "const_int_operand" ""))
+ (use (match_operand:SI 2 "nonmemory_operand" ""))
(use (match_operand:SI 3 "const_int_operand" ""))]
""
"
{
rtx srcreg, destreg, countreg;
+ int align = 0;
+ int count = -1;
- if (GET_CODE (operands[2]) != CONST_INT)
- FAIL;
+ if (GET_CODE (operands[3]) == CONST_INT)
+ align = INTVAL (operands[3]);
+
+ /* This simple hack avoids all inlining code and simplifies code below. */
+ if (!TARGET_ALIGN_STRINGOPS)
+ align = 32;
+
+ if (GET_CODE (operands[2]) == CONST_INT)
+ count = INTVAL (operands[2]);
destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
emit_insn (gen_cld());
+
/* When optimizing for size emit simple rep ; movsb instruction for
counts not divisible by 4. */
- if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03))
+
+ if ((!optimize || optimize_size)
+ && (count < 0 || (count & 0x03)))
{
countreg = copy_to_mode_reg (SImode, operands[2]);
emit_insn (gen_rep_movqi (destreg, srcreg, countreg,
destreg, srcreg, countreg));
}
- else
+
+ /* For constant aligned (or small unaligned) copies use rep movsl
+ followed by code copying the rest. For PentiumPro ensure 8 byte
+ alignment to allow rep movsl acceleration. */
+
+ else if (count >= 0
+ && (align >= 8
+ || (!TARGET_PENTIUMPRO && align >= 4)
+ || optimize_size || count < 64))
{
- if (INTVAL (operands[2]) & ~0x03)
+ if (count & ~0x03)
{
countreg = copy_to_mode_reg (SImode,
- GEN_INT ((INTVAL (operands[2]) >> 2)
+ GEN_INT ((count >> 2)
& 0x3fffffff));
emit_insn (gen_rep_movsi (destreg, srcreg, countreg,
destreg, srcreg, countreg));
}
- if (INTVAL (operands[2]) & 0x02)
+ if (count & 0x02)
emit_insn (gen_strmovhi (destreg, srcreg));
- if (INTVAL (operands[2]) & 0x01)
+ if (count & 0x01)
emit_insn (gen_strmovqi (destreg, srcreg));
}
+ /* The generic code based on the glibc implementation:
+ - align destination to 4 bytes (8 byte alignment is used for PentiumPro
+ allowing accelerated copying there)
+ - copy the data using rep movsl
+ - copy the rest. */
+ else
+ {
+ rtx countreg2;
+ rtx label = NULL;
+
+ /* In case we don't know anything about the alignment, default to
+ library version, since it is usually equally fast and results in
+ shorter code. */
+ if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
+ FAIL;
+
+ if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+
+ countreg2 = gen_reg_rtx (SImode);
+ countreg = copy_to_mode_reg (SImode, operands[2]);
+
+ /* We don't use loops to align destination and to copy parts smaller
+ than 4 bytes, because gcc is able to optimize such code better (in
+ the case the destination or the count really is aligned, gcc is often
+ able to predict the branches) and also it is friendlier to the
+ hardware branch prediction.
+
+ Using loops is beneficial for generic case, because we can
+ handle small counts using the loops. Many CPUs (such as Athlon)
+ have large REP prefix setup costs.
+
+ This is quite costly. Maybe we can revisit this decision later or
+ add some customizability to this code. */
+
+ if (count < 0
+ && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (countreg, GEN_INT (3),
+ LEU, 0, SImode, 1, 0, label);
+ }
+ if (align <= 1)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovqi (destreg, srcreg));
+ emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 2)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovhi (destreg, srcreg));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovsi (destreg, srcreg));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+
+ if (!TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+ emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+ emit_insn (gen_rep_movsi (destreg, srcreg, countreg2,
+ destreg, srcreg, countreg2));
+
+ if (label)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 2 && count > 0 && (count & 2))
+ emit_insn (gen_strmovhi (destreg, srcreg));
+ if (align <= 2 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovhi (destreg, srcreg));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ /* Copy the trailing odd byte.  This must be a byte move (strmovqi),
+ matching the constant-count path above and the QImode fallback just
+ below -- a strmovsi here would copy 4 bytes and overrun. */
+ if (align > 1 && count > 0 && (count & 1))
+ emit_insn (gen_strmovqi (destreg, srcreg));
+ if (align <= 1 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strmovqi (destreg, srcreg));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ }
DONE;
}")
;; Most CPUs don't like single string operations
;; Handle this case here to simplify previous expander.
+;; Copy one SImode word from the address in operand 1 to the address in
+;; operand 0 and advance both pointers by 4.  When the CPU handles single
+;; string operations well (TARGET_SINGLE_STRINGOP) or we optimize for size,
+;; emit a real movsl via strmovsi_1; otherwise expand to the load/store
+;; through a scratch register plus two pointer increments given in the
+;; pattern below.
+(define_expand "strmovsi"
+ [(set (match_dup 2)
+ (mem:SI (match_operand:SI 1 "register_operand" "")))
+ (set (mem:SI (match_operand:SI 0 "register_operand" ""))
+ (match_dup 2))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
+ (clobber (reg:CC 17))])
+ (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))
+ (clobber (reg:CC 17))])]
+ ""
+ "
+{
+ /* Single-instruction path: emit movsl directly and stop expanding. */
+ if (TARGET_SINGLE_STRINGOP || optimize_size)
+ {
+ emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0],
+ operands[1]));
+ DONE;
+ }
+ else
+ /* Scratch register used by the load/store sequence in the pattern. */
+ operands[2] = gen_reg_rtx (SImode);
+}")
+
(define_expand "strmovhi"
[(set (match_dup 2)
(mem:HI (match_operand:SI 1 "register_operand" "")))
operands[2] = gen_reg_rtx (QImode);
}")
+;; movsl: copy the SImode word at [operand 1] to [operand 0] and
+;; post-increment both pointers by 4 (constraints pin them to %edi/%esi).
+;; The (use (reg:SI 19)) records a dependence on the string direction
+;; state -- presumably the direction flag; the expanders above emit cld
+;; before using it.  TODO(review): confirm register 19 is DIRFLAG.
+(define_insn "strmovsi_1"
+ [(set (mem:SI (match_operand:SI 2 "register_operand" "0"))
+ (mem:SI (match_operand:SI 3 "register_operand" "1")))
+ (set (match_operand:SI 0 "register_operand" "=D")
+ (plus:SI (match_dup 0)
+ (const_int 4)))
+ (set (match_operand:SI 1 "register_operand" "=S")
+ (plus:SI (match_dup 1)
+ (const_int 4)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "movsl"
+ [(set_attr "type" "str")
+ (set_attr "memory" "both")])
+
(define_insn "strmovhi_1"
[(set (mem:HI (match_operand:SI 2 "register_operand" "0"))
(mem:HI (match_operand:SI 3 "register_operand" "1")))
(define_expand "clrstrsi"
[(use (match_operand:BLK 0 "memory_operand" ""))
- (use (match_operand:SI 1 "const_int_operand" ""))
+ (use (match_operand:SI 1 "nonmemory_operand" ""))
(use (match_operand:SI 2 "const_int_operand" ""))]
""
"
{
+ /* See comments in movstr expanders. The code is mostly identical. */
+
rtx destreg, zeroreg, countreg;
+ int align = 0;
+ int count = -1;
- if (GET_CODE (operands[1]) != CONST_INT)
- FAIL;
+ if (GET_CODE (operands[2]) == CONST_INT)
+ align = INTVAL (operands[2]);
+
+ /* This simple hack avoids all inlining code and simplifies code below. */
+ if (!TARGET_ALIGN_STRINGOPS)
+ align = 32;
+
+ if (GET_CODE (operands[1]) == CONST_INT)
+ count = INTVAL (operands[1]);
destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
/* When optimizing for size emit simple rep ; movsb instruction for
counts not divisible by 4. */
-
+
+ if ((!optimize || optimize_size)
+ && (count < 0 || (count & 0x03)))
{
countreg = copy_to_mode_reg (SImode, operands[1]);
zeroreg = copy_to_mode_reg (QImode, const0_rtx);
emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg,
destreg, countreg));
}
- else
+ else if (count >= 0
+ && (align >= 8
+ || (!TARGET_PENTIUMPRO && align >= 4)
+ || optimize_size || count < 64))
{
zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+ /* NOTE(review): compared with the equivalent branch of movstrsi this
+ branch looks truncated -- the rep stosl and the halfword tail
+ stores appear to be missing; verify against the complete file. */
if (INTVAL (operands[1]) & ~0x03)
emit_insn (gen_strsetqi (destreg,
gen_rtx_SUBREG (QImode, zeroreg, 0)));
}
+ else
+ {
+ rtx countreg2;
+ rtx label = NULL;
+
+ /* In case we don't know anything about the alignment, default to
+ library version, since it is usually equally fast and results in
+ shorter code. */
+ if (!TARGET_INLINE_ALL_STRINGOPS && align < 4)
+ FAIL;
+
+ if (TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+
+ countreg2 = gen_reg_rtx (SImode);
+ countreg = copy_to_mode_reg (SImode, operands[1]);
+ zeroreg = copy_to_mode_reg (SImode, const0_rtx);
+
+ if (count < 0
+ && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 8 : 4))
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (countreg, GEN_INT (3),
+ LEU, 0, SImode, 1, 0, label);
+ }
+ if (align <= 1)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 2)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260))
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ /* Store a full word to reach 8-byte alignment: zeroreg is SImode,
+ so the matching pattern is strsetsi -- every HImode store in this
+ expander goes through a HImode SUBREG of zeroreg instead. */
+ emit_insn (gen_strsetsi (destreg, zeroreg));
+ emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+
+ if (!TARGET_SINGLE_STRINGOP)
+ emit_insn (gen_cld());
+ emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+ emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg,
+ destreg, countreg2));
+
+ if (label)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 2 && count > 0 && (count & 2))
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ if (align <= 2 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsethi (destreg,
+ gen_rtx_SUBREG (HImode, zeroreg, 0)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (align > 1 && count > 0 && (count & 1))
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ if (align <= 1 || count < 0)
+ {
+ rtx label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (SImode);
+ emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1)));
+ emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0,
+ SImode, 1, 0, label);
+ emit_insn (gen_strsetqi (destreg,
+ gen_rtx_SUBREG (QImode, zeroreg, 0)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ }
DONE;
}")
;; Most CPUs don't like single string operations
;; Handle this case here to simplify previous expander.
+;; Store the SImode value in operand 1 at the address in operand 0 and
+;; advance the pointer by 4.  When single string operations are preferred
+;; (TARGET_SINGLE_STRINGOP) or we optimize for size, emit a real stosl via
+;; strsetsi_1; otherwise expand to the store + add pair in the pattern
+;; below.
+(define_expand "strsetsi"
+ [(set (mem:SI (match_operand:SI 0 "register_operand" ""))
+ (match_operand:SI 1 "register_operand" ""))
+ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4)))
+ (clobber (reg:CC 17))])]
+ ""
+ "
+{
+ /* Single-instruction path: emit stosl directly and stop expanding. */
+ if (TARGET_SINGLE_STRINGOP || optimize_size)
+ {
+ emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1]));
+ DONE;
+ }
+}")
+
(define_expand "strsethi"
[(set (mem:HI (match_operand:SI 0 "register_operand" ""))
(match_operand:HI 1 "register_operand" ""))
}
}")
+;; stosl: store operand 2 (constrained to %eax by "a") at [operand 0]
+;; (%edi via "D") and post-increment the pointer by 4.  The
+;; (use (reg:SI 19)) records a dependence on the string direction state
+;; -- presumably the direction flag; callers emit cld first.
+;; TODO(review): confirm register 19 is DIRFLAG.
+(define_insn "strsetsi_1"
+ [(set (mem:SI (match_operand:SI 1 "register_operand" "0"))
+ (match_operand:SI 2 "register_operand" "a"))
+ (set (match_operand:SI 0 "register_operand" "=D")
+ (plus:SI (match_dup 0)
+ (const_int 4)))
+ (use (reg:SI 19))]
+ "TARGET_SINGLE_STRINGOP || optimize_size"
+ "stosl"
+ [(set_attr "type" "str")
+ (set_attr "memory" "store")])
+
(define_insn "strsethi_1"
[(set (mem:HI (match_operand:SI 1 "register_operand" "0"))
(match_operand:HI 2 "register_operand" "a"))
{
rtx out, addr, eoschar, align, scratch1, scratch2, scratch3;
+ /* The generic case of strlen expander is long. Avoid its
+ expanding unless TARGET_INLINE_ALL_STRINGOPS. */
+
+ /* NOTE(review): `eoschar' and `align' are read by this test but are only
+ assigned from the operands below (eoschar at the `eoschar = operands[2]'
+ line) -- as written the FAIL check uses uninitialized values.  The
+ check should presumably be moved after the operand assignments; confirm
+ against the complete expander. */
+ if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+ && !optimize_size
+ && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4))
+ FAIL;
+
out = operands[0];
addr = force_reg (Pmode, XEXP (operands[1], 0));
eoschar = operands[2];
if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)
emit_move_insn (scratch1, addr);
+
emit_move_insn (out, addr);
ix86_expand_strlensi_unroll_1 (out, align, scratch1);