From 79f05c19ca1ee164031f0d3926bc5074c499da10 Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Thu, 3 Feb 2000 15:10:02 +0100 Subject: [PATCH] i386.md (movstrsi, clrstrsi): Support variable sized copies, align destination when needed. * i386.md (movstrsi, clrstrsi): Support variable sized copies, align destination when needed. (strmovsi, strsetsi): New expander. (strmovsi_1, strsetsi_1): New pattern. * i386.h (MASK_NO_ALIGN_STROP, MASK_INLINE_ALL_STROP, TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros. (TARGET_SWITCHES) Add align-stringops and inline-all-stringops. * invoke.texi (align-stringops, inline-all-stringops): Document. From-SVN: r31773 --- gcc/ChangeLog | 11 ++ gcc/config/i386/i386.h | 13 ++ gcc/config/i386/i386.md | 360 ++++++++++++++++++++++++++++++++++++++++++++++-- gcc/invoke.texi | 15 +- 4 files changed, 384 insertions(+), 15 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5d0f642..8bb5dd6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +Thu Feb 3 15:08:13 MET 2000 Jan Hubicka + + * i386.md (movstrsi, clrstrsi): Support variable sized copies, align + destination when needed. + (strmovsi, strsetsi): New expander. + (strmovsi_1, strsetsi_1): New pattern. + * i386.h (MASK_NO_ALIGN_STROP, MASK_INLINE_ALL_STROP, + TARGET_ALIGN_STRINGOPS, TARGET_INLINE_ALL_STRINGOPS): New macros. + (TARGET_SWITCHES) Add align-stringops and inline-all-stringops. + * invoke.texi (align-stringops, inline-all-stringops): Document. + Wed Feb 2 23:04:47 2000 Krister Walfridsson * i386/netbsd.h (INT_ASM_OP): Define. diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 1302b65..8c33c66 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -101,6 +101,8 @@ extern int target_flags; #define MASK_NO_FANCY_MATH_387 0x00000040 /* Disable sin, cos, sqrt */ #define MASK_OMIT_LEAF_FRAME_POINTER 0x080 /* omit leaf frame pointers */ #define MASK_STACK_PROBE 0x00000100 /* Enable stack probing */ +#define MASK_NO_ALIGN_STROPS 0x00001000 /* Enable aligning of string ops. 
*/ +#define MASK_INLINE_ALL_STROPS 0x00002000 /* Inline stringops in all cases */ /* Temporary codegen switches */ #define MASK_INTEL_SYNTAX 0x00000200 @@ -190,6 +192,9 @@ extern const int x86_promote_QImode, x86_single_stringop; #define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE) +#define TARGET_ALIGN_STRINGOPS (!(target_flags & MASK_NO_ALIGN_STROPS)) +#define TARGET_INLINE_ALL_STRINGOPS (target_flags & MASK_INLINE_ALL_STROPS) + #define ASSEMBLER_DIALECT ((target_flags & MASK_INTEL_SYNTAX) != 0) #define TARGET_SWITCHES \ @@ -238,6 +243,14 @@ extern const int x86_promote_QImode, x86_single_stringop; { "intel-syntax", MASK_INTEL_SYNTAX, \ "Emit Intel syntax assembler opcodes" }, \ { "no-intel-syntax", -MASK_INTEL_SYNTAX, "" }, \ + { "align-stringops", -MASK_NO_ALIGN_STROPS, \ + "Align destination of the string operations" }, \ + { "no-align-stringops", MASK_NO_ALIGN_STROPS, \ + "Do not align destination of the string operations" }, \ + { "inline-all-strinops", MASK_INLINE_ALL_STROPS, \ + "Inline all known string operations" }, \ + { "no-inline-all-stringops", -MASK_INLINE_ALL_STROPS, \ + "Do not inline all known string operations" }, \ SUBTARGET_SWITCHES \ { "", TARGET_DEFAULT, 0 }} diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ce2ac95..c5454d7 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -7838,49 +7838,208 @@ (define_expand "movstrsi" [(use (match_operand:BLK 0 "memory_operand" "")) (use (match_operand:BLK 1 "memory_operand" "")) - (use (match_operand:SI 2 "const_int_operand" "")) + (use (match_operand:SI 2 "nonmemory_operand" "")) (use (match_operand:SI 3 "const_int_operand" ""))] "" " { rtx srcreg, destreg, countreg; + int align = 0; + int count = -1; - if (GET_CODE (operands[2]) != CONST_INT) - FAIL; + if (GET_CODE (operands[3]) == CONST_INT) + align = INTVAL (operands[3]); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 32; + + if (GET_CODE (operands[2]) == CONST_INT) + count = INTVAL (operands[2]); destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); srcreg = copy_to_mode_reg (Pmode, XEXP (operands[1], 0)); emit_insn (gen_cld()); + /* When optimizing for size emit simple rep ; movsb instruction for counts not divisible by 4. */ - if ((!optimize || optimize_size) && (INTVAL (operands[2]) & 0x03)) + + if ((!optimize || optimize_size) + && (count < 0 || (count & 0x03))) { countreg = copy_to_mode_reg (SImode, operands[2]); emit_insn (gen_rep_movqi (destreg, srcreg, countreg, destreg, srcreg, countreg)); } - else + + /* For constant aligned (or small unaligned) copies use rep movsl + followed by code copying the rest. For PentiumPro ensure 8 byte + alignment to allow rep movsl acceleration. 
*/ + + else if (count >= 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && align >= 4) + || optimize_size || count < 64)) { - if (INTVAL (operands[2]) & ~0x03) + if (count & ~0x03) { countreg = copy_to_mode_reg (SImode, - GEN_INT ((INTVAL (operands[2]) >> 2) + GEN_INT ((count >> 2) & 0x3fffffff)); emit_insn (gen_rep_movsi (destreg, srcreg, countreg, destreg, srcreg, countreg)); } - if (INTVAL (operands[2]) & 0x02) + if (count & 0x02) emit_insn (gen_strmovhi (destreg, srcreg)); - if (INTVAL (operands[2]) & 0x01) + if (count & 0x01) emit_insn (gen_strmovqi (destreg, srcreg)); } + /* The generic code based on the glibc implementation: + - align destination to 4 bytes (8 byte alignment is used for PentiumPro + allowing accelerated copying there) + - copy the data using rep movsl + - copy the rest. */ + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. */ + if (!TARGET_INLINE_ALL_STRINGOPS && align < 4) + FAIL; + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + + countreg2 = gen_reg_rtx (SImode); + countreg = copy_to_mode_reg (SImode, operands[2]); + + /* We don't use loops to align destination and to copy parts smaller + than 4 bytes, because gcc is able to optimize such code better (in + the case the destination or the count really is aligned, gcc is often + able to predict the branches) and also it is friendlier to the + hardware branch prediction. + + Using loops is benefical for generic case, because we can + handle small counts using the loops. Many CPUs (such as Athlon) + have large REP prefix setup costs. + + This is quite costy. Maybe we can revisit this decision later or + add some customizability to this code. */ + + if (count < 0 + && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 
8 : 4)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (3), + LEU, 0, SImode, 1, 0, label); + } + if (align <= 1) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovqi (destreg, srcreg)); + emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovhi (destreg, srcreg)); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260)) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovsi (destreg, srcreg)); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_movsi (destreg, srcreg, countreg2, + destreg, srcreg, countreg2)); + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count > 0 && (count & 2)) + emit_insn (gen_strmovhi (destreg, srcreg)); + if (align <= 2 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovhi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count > 0 && (count & 1)) + emit_insn (gen_strmovsi (destreg, srcreg)); + if (align <= 1 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strmovqi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } DONE; }") ;; Most CPUs don't like single string operations ;; Handle this case here to simplify previous expander. 
+(define_expand "strmovsi" + [(set (match_dup 2) + (mem:SI (match_operand:SI 1 "register_operand" ""))) + (set (mem:SI (match_operand:SI 0 "register_operand" "")) + (match_dup 2)) + (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4))) + (clobber (reg:CC 17))]) + (parallel [(set (match_dup 1) (plus:SI (match_dup 1) (const_int 4))) + (clobber (reg:CC 17))])] + "" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strmovsi_1 (operands[0], operands[1], operands[0], + operands[1])); + DONE; + } + else + operands[2] = gen_reg_rtx (SImode); +}") + (define_expand "strmovhi" [(set (match_dup 2) (mem:HI (match_operand:SI 1 "register_operand" ""))) @@ -7925,6 +8084,21 @@ operands[2] = gen_reg_rtx (QImode); }") +(define_insn "strmovsi_1" + [(set (mem:SI (match_operand:SI 2 "register_operand" "0")) + (mem:SI (match_operand:SI 3 "register_operand" "1"))) + (set (match_operand:SI 0 "register_operand" "=D") + (plus:SI (match_dup 0) + (const_int 4))) + (set (match_operand:SI 1 "register_operand" "=S") + (plus:SI (match_dup 1) + (const_int 4))) + (use (reg:SI 19))] + "TARGET_SINGLE_STRINGOP || optimize_size" + "movsl" + [(set_attr "type" "str") + (set_attr "memory" "both")]) + (define_insn "strmovhi_1" [(set (mem:HI (match_operand:SI 2 "register_operand" "0")) (mem:HI (match_operand:SI 3 "register_operand" "1"))) @@ -7996,15 +8170,26 @@ (define_expand "clrstrsi" [(use (match_operand:BLK 0 "memory_operand" "")) - (use (match_operand:SI 1 "const_int_operand" "")) + (use (match_operand:SI 1 "nonmemory_operand" "")) (use (match_operand:SI 2 "const_int_operand" ""))] "" " { + /* See comments in movstr expanders. The code is mostly identical. */ + rtx destreg, zeroreg, countreg; + int align = 0; + int count = -1; - if (GET_CODE (operands[1]) != CONST_INT) - FAIL; + if (GET_CODE (operands[2]) == CONST_INT) + align = INTVAL (operands[2]); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 32; + + if (GET_CODE (operands[1]) == CONST_INT) + count = INTVAL (operands[1]); destreg = copy_to_mode_reg (Pmode, XEXP (operands[0], 0)); @@ -8012,14 +8197,19 @@ /* When optimizing for size emit simple rep ; movsb instruction for counts not divisible by 4. */ - if ((!optimize || optimize_size) && (INTVAL (operands[1]) & 0x03)) + + if ((!optimize || optimize_size) + && (count < 0 || (count & 0x03))) { countreg = copy_to_mode_reg (SImode, operands[1]); zeroreg = copy_to_mode_reg (QImode, const0_rtx); emit_insn (gen_rep_stosqi (destreg, countreg, zeroreg, destreg, countreg)); } - else + else if (count >= 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && align >= 4) + || optimize_size || count < 64)) { zeroreg = copy_to_mode_reg (SImode, const0_rtx); if (INTVAL (operands[1]) & ~0x03) @@ -8037,12 +8227,133 @@ emit_insn (gen_strsetqi (destreg, gen_rtx_SUBREG (QImode, zeroreg, 0))); } + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. */ + if (!TARGET_INLINE_ALL_STRINGOPS && align < 4) + FAIL; + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + + countreg2 = gen_reg_rtx (SImode); + countreg = copy_to_mode_reg (SImode, operands[1]); + zeroreg = copy_to_mode_reg (SImode, const0_rtx); + + if (count < 0 + && align < (TARGET_PENTIUMPRO && (count < 0 || count >= 260) ? 
8 : 4)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (3), + LEU, 0, SImode, 1, 0, label); + } + if (align <= 1) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + emit_insn (gen_addsi3 (countreg, countreg, constm1_rtx)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-2))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && TARGET_PENTIUMPRO && (count < 1 || count >= 260)) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, destreg, GEN_INT (4))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsethi (destreg, zeroreg)); + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-4))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld()); + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg, + destreg, countreg2)); + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count > 0 && (count & 2)) + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + if (align <= 2 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (2))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsethi (destreg, + gen_rtx_SUBREG (HImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count > 0 && (count & 1)) + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + if (align <= 1 || count < 0) + { + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (SImode); + emit_insn (gen_andsi3 (tmpcount, countreg, GEN_INT (1))); + emit_cmp_and_jump_insns (tmpcount, GEN_INT (0), EQ, 0, + SImode, 1, 0, label); + emit_insn (gen_strsetqi (destreg, + gen_rtx_SUBREG (QImode, zeroreg, 0))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } DONE; }") ;; Most CPUs don't like single string operations ;; Handle this case here to simplify previous expander. 
+(define_expand "strsetsi" + [(set (mem:SI (match_operand:SI 0 "register_operand" "")) + (match_operand:SI 1 "register_operand" "")) + (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 4))) + (clobber (reg:CC 17))])] + "" + " +{ + if (TARGET_SINGLE_STRINGOP || optimize_size) + { + emit_insn (gen_strsetsi_1 (operands[0], operands[0], operands[1])); + DONE; + } +}") + (define_expand "strsethi" [(set (mem:HI (match_operand:SI 0 "register_operand" "")) (match_operand:HI 1 "register_operand" "")) @@ -8073,6 +8384,18 @@ } }") +(define_insn "strsetsi_1" + [(set (mem:SI (match_operand:SI 1 "register_operand" "0")) + (match_operand:SI 2 "register_operand" "a")) + (set (match_operand:SI 0 "register_operand" "=D") + (plus:SI (match_dup 0) + (const_int 4))) + (use (reg:SI 19))] + "TARGET_SINGLE_STRINGOP || optimize_size" + "stosl" + [(set_attr "type" "str") + (set_attr "memory" "store")]) + (define_insn "strsethi_1" [(set (mem:HI (match_operand:SI 1 "register_operand" "0")) (match_operand:HI 2 "register_operand" "a")) @@ -8252,6 +8575,14 @@ { rtx out, addr, eoschar, align, scratch1, scratch2, scratch3; + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + + if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 + && !optimize_size + && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)) + FAIL; + out = operands[0]; addr = force_reg (Pmode, XEXP (operands[1], 0)); eoschar = operands[2]; @@ -8271,6 +8602,7 @@ if (GET_CODE (align) != CONST_INT || INTVAL (align) < 4) emit_move_insn (scratch1, addr); + emit_move_insn (out, addr); ix86_expand_strlensi_unroll_1 (out, align, scratch1); diff --git a/gcc/invoke.texi b/gcc/invoke.texi index 549ece1..f09ef55 100644 --- a/gcc/invoke.texi +++ b/gcc/invoke.texi @@ -360,7 +360,7 @@ in the following sections. -mreg-alloc=@var{list} -mregparm=@var{num} -malign-jumps=@var{num} -malign-loops=@var{num} -malign-functions=@var{num} -mpreferred-stack-boundary=@var{num} --mthreads +-mthreads -mno-align-stringops -minline-all-stringops @emph{HPPA Options} -march=@var{architecture type} @@ -5954,6 +5954,19 @@ on thread-safe exception handling must compile and link all code with the @samp{-mthreads} option. When compiling, @samp{-mthreads} defines @samp{-D_MT}; when linking, it links in a special thread helper library @samp{-lmingwthrd} which cleans up per thread exception handling data. + +@item -mno-align-stringops +@kindex -mno-align-stringops +Do not align destination of inlined string operations. This switch reduces +code size and improves performance in case the destination is already aligned, +but gcc don't know about it. + +@item -minline-all-stringops +@kindex -minline-all-stringops +By default GCC inlines string operations only when destination is known to be +aligned at least to 4 byte boundary. This enables more inlining, increase code +size, but may improve performance of code that depends on fast memcpy, strlen +and memset for short lengths. @end table @node HPPA Options -- 2.7.4