From 79f05c19ca1ee164031f0d3926bc5074c499da10 Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Thu, 3 Feb 2000 15:10:02 +0100 Subject: [PATCH] i386.md (movstrsi, clrstrsi): Support variable sized copies, align destination when needed. * i386.md (movstrsi, clrstrsi): Support variable sized copies, align destination when needed. (strmovsi, strsetsi): New expander. (strmovsi_1, strsetsi_1): New pattern. * i386.h (MASK_NO_ALIGN_STROP, MASK_INLINE_ALL_STROP, TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros. (TARGET_SWITCHES) Add align-stringops and inline-all-stringops. * invoke.texi (align-stringops, inline-all-stringops): Document. From-SVN: r31773 --- gcc/ChangeLog | 11 ++ gcc/config/i386/i386.h | 13 ++ gcc/config/i386/i386.md | 360 ++++++++++++++++++++++++++++++++++++++++++++++-- gcc/invoke.texi | 15 +- 4 files changed, 384 insertions(+), 15 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5d0f642..8bb5dd6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +Thu Feb 3 15:08:13 MET 2000 Jan Hubicka + + * i386.md (movstrsi, clrstrsi): Support variable sized copies, align + destination when needed. + (strmovsi, strsetsi): New expander. + (strmovsi_1, strsetsi_1): New pattern. + * i386.h (MASK_NO_ALIGN_STROP, MASK_INLINE_ALL_STROP, + TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros. + (TARGET_SWITCHES) Add align-stringops and inline-all-stringops. + * invoke.texi (align-stringops, inline-all-stringops): Document. + Wed Feb 2 23:04:47 2000 Krister Walfridsson * i386/netbsd.h (INT_ASM_OP): Define. diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 1302b65..8c33c66 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -101,6 +101,8 @@ extern int target_flags; #define MASK_NO_FANCY_MATH_387 0x00000040 /* Disable sin, cos, sqrt */ #define MASK_OMIT_LEAF_FRAME_POINTER 0x080 /* omit leaf frame pointers */ #define MASK_STACK_PROBE 0x00000100 /* Enable stack probing */ +#define MASK_NO_ALIGN_STROPS 0x00001000 /* Enable aligning of string ops. 
*/ +#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */ /* Temporary codegen switches */ #define MASK_INTEL_SYNTAX 0x00000200 @@ -190,6 +192,9 @@ extern const int x86_promote_QImode, x86_single_stringop; #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE) +#define TARGET_ALIGN_STRINGOPS (!(target_flags & MASK_NO_ALIGN_STROPS)) +#define TARGET_INLINE_ALL_STRINGOPS (target_flags & MASK_INLINE_ALL_STROPS) + #define ASSEMBLER_DIALECT ((target_flags & MASK_INTEL_SYNTAX) != 0) #define TARGET_SWITCHES \ @@ -238,6 +243,14 @@ extern const int x86_promote_QImode, x86_single_stringop; { "intel-syntax", MASK_INTEL_SYNTAX, \ "Emit Intel syntax assembler opcodes" }, \ { "no-intel-syntax", -MASK_INTEL_SYNTAX, "" }, \ + { "align-stringops", -MASK_NO_ALIGN_STROPS, \ + "Align destination of the string operations" }, \ + { "no-align-stringops", MASK_NO_ALIGN_STROPS, \ + "Do not align destination of the string operations" }, \ + { "inline-all-strinops", MASK_INLINE_ALL_STROPS, \ + "Inline all known string operations" }, \ + { "no-inline-all-stringops", -MASK_INLINE_ALL_STROPS, \ + "Do not inline all known string operations" }, \ SUBTARGET_SWITCHES \ { "", TARGET_DEFAULT, 0 }} diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ce2ac95..c5454d7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -7838,49 +7838,208 @@ (define_expand "movstrsi" [(use (match_operand:BLK 0 "memory_operand" "")) (use (match_operand:BLK 1 "memory_operand" "")) - (use (match_operand:SI 2 "const_int_operand" "")) + (use (match_operand:SI 2 "nonmemory_operand" "")) (use (match_operand:SI 3 "const_int_operand" ""))] "" " { rtx srcreg, destreg, countreg; + int align = 0; + int count = -1; - if (GET_CODE (operands[2]) != CONST_INT) - FAIL; + if (GET_CODE (operands[3]) == CONST_INT) + align = INTVAL (operands[3]); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 32; + + if (GET_CODE (operands[2]) == CONST_INT) + count = INTVAL (operands[2]); destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0)); emit_insn (gen_cld()); + /* When optimizing for size emit simple rep ; movsb instruction for counts not divisible by 4. */ - if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03)) + + if ((!optimize || optimize_size) + && (count < 0 || (count & 0x03))) { countreg = copy_to_mode_reg (SImode, operands[2]); emit_insn (gen_rep_movqi (destreg, srcreg, countreg, destreg, srcreg, countreg)); } - else + + /* For constant aligned (or small unaligned) copies use rep movsl + followed by code copying the rest. For PentiumPro ensure 8 byte + alignment to allow rep movsl acceleration. 
*/ + + else if (count >= 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && align >= 4) + || optimize_size || count < 64)) { - if (INTVAL (operands[2]) & ~0x03) + if (count & ~0x03) { countreg = copy_to_mode_reg (SImode, - GEN_INT ((INTVAL (operands[2]) >> 2) + GEN_INT ((count >> 2) & 0x3fffffff)); emit_insn (gen_rep_movsi (destreg, srcreg, countreg, destreg, srcreg, countreg)); } - if (INTVAL (operands[2]) & 0x02) + if (count & 0x02) emit_insn (gen_strmovhi (destreg, srcreg)); - if (INTVAL (operands[2]) & 0x01) + if (count & 0x01) emit_insn (gen_strmovqi (destreg, srcreg)); } + /* The generic code based on the glibc implementation: + - align destination to 4 bytes (8 byte alignment is used for PentiumPro + allowing accelerated copying there) + - copy the data using rep movsl + - copy the rest. */ + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. */ + if (!TARGET_INLINE_ALL_STRINGOPS && align < 4) + FAIL; + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + + countreg2 = gen_reg_rtx (SImode); + countreg = copy_to_mode_reg (SImode, operands[2]); + + /* We don't use loops to align destination and to copy parts smaller + than 4 bytes, because gcc is able to optimize such code better (in + the case the destination or the count really is aligned, gcc is often + able to predict the branches) and also it is friendlier to the + hardware branch prediction. + + Using loops is benefical for generic case, because we can + handle small counts using the loops. Many CPUs (such as Athlon) + have large REP prefix setup costs. + + This is quite costy. Maybe we can revisit this decision later or + add some customizability to this code. */ + + if (count < 0 + && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 
8 : 4)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (3), + LEU, 0, SImode, 1, 0, label); + } + if (align <= 1) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovqi (destreg, srcreg)); + emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovhi (destreg, srcreg)); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260)) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovsi (destreg, srcreg)); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_movsi (destreg, srcreg, countreg2, + destreg, srcreg, countreg2)); + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count > 0 && (count & 2)) + emit_insn (gen_strmovhi (destreg, srcreg)); + if (align <= 2 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovhi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count > 0 && (count & 1)) + emit_insn (gen_strmovsi (destreg, srcreg)); + if (align <= 1 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovqi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } DONE; }") ;; Most CPUs don't like single string operations ;; Handle this case here to simplify previous expander. 
+(define_expand "strmovsi" + [(set (match_dup 2) + (mem:SI (match_operand:SI 1 "register_operand" ""))) + (set (mem:SI (match_operand:SI 0 "register_operand" "")) + (match_dup 2)) + (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4))) + (clobber (reg:CC 17))]) + (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4))) + (clobber (reg:CC 17))])] + "" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0], + operands[1])); + DONE; + } + else + operands[2] = gen_reg_rtx (SImode); +}") + (define_expand "strmovhi" [(set (match_dup 2) (mem:HI (match_operand:SI 1 "register_operand" ""))) @@ -7925,6 +8084,21 @@ operands[2] = gen_reg_rtx (QImode); }") +(define_insn "strmovsi_1" + [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) + (mem:SI (match_operand:SI 3 "register_operand" "1"))) + (set (match_operand:SI 0 "register_operand" "=D") + (plus:SI (match_dup 0) + (const_int 4))) + (set (match_operand:SI 1 "register_operand" "=S") + (plus:SI (match_dup 1) + (const_int 4))) + (use (reg:SI 19))] + "TARGET_SINGLE_STRINGOP || optimize_size" + "movsl" + [(set_attr "type" "str") + (set_attr "memory" "both")]) + (define_insn "strmovhi_1" [(set (mem:HI (match_operand:SI 2 "register_operand" "0")) (mem:HI (match_operand:SI 3 "register_operand" "1"))) @@ -7996,15 +8170,26 @@ (define_expand "clrstrsi" [(use (match_operand:BLK 0 "memory_operand" "")) - (use (match_operand:SI 1 "const_int_operand" "")) + (use (match_operand:SI 1 "nonmemory_operand" "")) (use (match_operand:SI 2 "const_int_operand" ""))] "" " { + /* See comments in movstr expanders. The code is mostly identical. */ + rtx destreg, zeroreg, countreg; + int align = 0; + int count = -1; - if (GET_CODE (operands[1]) != CONST_INT) - FAIL; + if (GET_CODE (operands[2]) == CONST_INT) + align = INTVAL (operands[2]); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 32; + + if (GET_CODE (operands[1]) == CONST_INT) + count = INTVAL (operands[1]); destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); @@ -8012,14 +8197,19 @@ /* When optimizing for size emit simple rep ; movsb instruction for counts not divisible by 4. */ - if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03)) + + if ((!optimize || optimize_size) + && (count < 0 || (count & 0x03))) { countreg = copy_to_mode_reg (SImode, operands[1]); zeroreg = copy_to_mode_reg (QImode, const0_rtx); emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg, destreg, countreg)); } - else + else if (count >= 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && align >= 4) + || optimize_size || count < 64)) { zeroreg = copy_to_mode_reg (SImode, const0_rtx); if (INTVAL (operands[1]) & ~0x03) @@ -8037,12 +8227,133 @@ emit_insn (gen_strsetqi (destreg, gen_rtx_SUBREG (QImode, zeroreg, 0))); } + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. */ + if (!TARGET_INLINE_ALL_STRINGOPS && align < 4) + FAIL; + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + + countreg2 = gen_reg_rtx (SImode); + countreg = copy_to_mode_reg (SImode, operands[1]); + zeroreg = copy_to_mode_reg (SImode, const0_rtx); + + if (count < 0 + && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 
8 : 4)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (3), + LEU, 0, SImode, 1, 0, label); + } + if (align <= 1) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260)) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsethi (destreg, zeroreg)); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg, + destreg, countreg2)); + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count > 0 && (count & 2)) + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + if (align <= 2 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count > 0 && (count & 1)) + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + if (align <= 1 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } DONE; }") ;; Most CPUs don't like single string operations ;; Handle this case here to simplify previous expander. 
+(define_expand "strsetsi" + [(set (mem:SI (match_operand:SI 0 "register_operand" "")) + (match_operand:SI 1 "register_operand" "")) + (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4))) + (clobber (reg:CC 17))])] + "" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1])); + DONE; + } +}") + (define_expand "strsethi" [(set (mem:HI (match_operand:SI 0 "register_operand" "")) (match_operand:HI 1 "register_operand" "")) @@ -8073,6 +8384,18 @@ } }") +(define_insn "strsetsi_1" + [(set (mem:SI (match_operand:SI 1 "register_operand" "0")) + (match_operand:SI 2 "register_operand" "a")) + (set (match_operand:SI 0 "register_operand" "=D") + (plus:SI (match_dup 0) + (const_int 4))) + (use (reg:SI 19))] + "TARGET_SINGLE_STRINGOP || optimize_size" + "stosl" + [(set_attr "type" "str") + (set_attr "memory" "store")]) + (define_insn "strsethi_1" [(set (mem:HI (match_operand:SI 1 "register_operand" "0")) (match_operand:HI 2 "register_operand" "a")) @@ -8252,6 +8575,14 @@ { rtx out, addr, eoschar, align, scratch1, scratch2, scratch3; + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + + if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 + && !optimize_size + && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)) + FAIL; + out = operands[0]; addr = force_reg (Pmode, XEXP (operands[1], 0)); eoschar = operands[2]; @@ -8271,6 +8602,7 @@ if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4) emit_move_insn (scratch1, addr); + emit_move_insn (out, addr); ix86_expand_strlensi_unroll_1 (out, align, scratch1); diff --git a/gcc/invoke.texi b/gcc/invoke.texi index 549ece1..f09ef55 100644 --- a/gcc/invoke.texi +++ b/gcc/invoke.texi @@ -360,7 +360,7 @@ in the following sections. -mreg-alloc=@var{list} -mregparm=@var{num} -malign-jumps=@var{num} -malign-loops=@var{num} -malign-functions=@var{num} -mpreferred-stack-boundary=@var{num} --mthreads +-mthreads -mno-align-stringops -minline-all-stringops @emph{HPPA Options} -march=@var{architecture type} @@ -5954,6 +5954,19 @@ on thread-safe exception handling must compile and link all code with the @samp{-mthreads} option. When compiling, @samp{-mthreads} defines @samp{-D_MT}; when linking, it links in a special thread helper library @samp{-lmingwthrd} which cleans up per thread exception handling data. + +@item -mno-align-stringops +@kindex -mno-align-stringops +Do not align destination of inlined string operations. This switch reduces +code size and improves performance in case the destination is already aligned, +but gcc don't know about it. + +@item -minline-all-stringops +@kindex -minline-all-stringops +By default GCC inlines string operations only when destination is known to be +aligned at least to 4 byte boundary. This enables more inlining, increase code +size, but may improve performance of code that depends on fast memcpy, strlen +and memset for short lengths. @end table @node HPPA Options -- 2.7.4