* config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation

author hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>

Sat, 19 Oct 2013 12:11:14 +0000 (12:11 +0000)

committer hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>

Sat, 19 Oct 2013 12:11:14 +0000 (12:11 +0000)
author hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
Sat, 19 Oct 2013 12:11:14 +0000 (12:11 +0000)
committer hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
Sat, 19 Oct 2013 12:11:14 +0000 (12:11 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index bdb9e01..086b5b2 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2013-10-18  Jan Hubicka  <jh@suse.cz>
+
+       * config/i386/i386.h (ACCUMULATE_OUTGOING_ARGS): Disable accumulation
+       for cold functions.
+       * x86-tune.def (X86_TUNE_USE_LEAVE): Update comment.
+       (X86_TUNE_PUSH_MEMORY): Likewise.
+       (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL,
+       X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL): New.
+       (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, X86_TUNE_ALWAYS_FANCY_MATH_387): New.
+       * i386.c (x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387,
+       x86_avx256_split_unaligned_load, x86_avx256_split_unaligned_store):
+       Remove.
+       (ix86_option_override_internal): Update to use tune features instead
+       of variables.
+
  2013-10-18  Cong Hou  <congh@google.com>
  
         PR tree-optimization/58508
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index b8c3c1d..91e6510 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1897,18 +1897,6 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
    ~m_386,
  };
  
-static const unsigned int x86_accumulate_outgoing_args
-  = m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_arch_always_fancy_math_387
-  = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_load
-  = m_COREI7 | m_GENERIC;
-
-static const unsigned int x86_avx256_split_unaligned_store
-  = m_COREI7 | m_BDVER | m_GENERIC;
-
  /* In case the average insn count for single function invocation is
     lower than this constant, emit fast (but longer) prologue and
     epilogue code.  */
@@ -2925,7 +2913,7 @@ ix86_option_override_internal (bool main_args_p,
                                struct gcc_options *opts_set)
  {
    int i;
-  unsigned int ix86_arch_mask, ix86_tune_mask;
+  unsigned int ix86_arch_mask;
    const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
    const char *prefix;
    const char *suffix;
@@ -3693,7 +3681,7 @@ ix86_option_override_internal (bool main_args_p,
  
    /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
       since the insns won't need emulation.  */
-  if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
+  if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
      opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
  
    /* Likewise, if the target doesn't have a 387, or we've specified
@@ -3835,8 +3823,7 @@ ix86_option_override_internal (bool main_args_p,
         gcc_unreachable ();
        }
  
-  ix86_tune_mask = 1u << ix86_tune;
-  if ((x86_accumulate_outgoing_args & ix86_tune_mask)
+  if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
        && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
        && !opts->x_optimize_size)
      opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
@@ -3976,10 +3963,10 @@ ix86_option_override_internal (bool main_args_p,
        if (flag_expensive_optimizations
           && !(opts_set->x_target_flags & MASK_VZEROUPPER))
         opts->x_target_flags |= MASK_VZEROUPPER;
-      if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
           && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
         opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
-      if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
           && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
         opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
        /* Enable 128-bit AVX instruction generation
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

index 10f7ff0..63e4903 100644 (file)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1544,13 +1544,26 @@ enum reg_class
     will be computed and placed into the variable `crtl->outgoing_args_size'.
     No space will be pushed onto the stack for each call; instead, the
     function prologue should increase the stack frame size by this amount.  
+
+   In 32bit mode enabling argument accumulation results in about 5% code size
+   growth because move instructions are less compact than push.  In 64bit
+   mode the difference is less drastic but visible.  
+
+   FIXME: Unlike earlier implementations, the size of unwind info seems to
+   actually grow with accumulation.  Is that because accumulated args
+   unwind info became unnecessarily bloated?
     
     64-bit MS ABI seem to require 16 byte alignment everywhere except for
-   function prologue and apilogue.  This is not possible without
-   ACCUMULATE_OUTGOING_ARGS.  */
+   function prologue and epilogue.  This is not possible without
+   ACCUMULATE_OUTGOING_ARGS.  
+
+   If stack probes are required, the space used for large function
+   arguments on the stack must also be probed, so enable
+   -maccumulate-outgoing-args so this happens in the prologue.  */
  
  #define ACCUMULATE_OUTGOING_ARGS \
-  (TARGET_ACCUMULATE_OUTGOING_ARGS || TARGET_64BIT_MS_ABI)
+  ((TARGET_ACCUMULATE_OUTGOING_ARGS && optimize_function_for_speed_p (cfun)) \
+   || TARGET_STACK_PROBE || TARGET_64BIT_MS_ABI)
  
  /* If defined, a C expression whose value is nonzero when we want to use PUSH
     instructions to pass outgoing arguments.  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def

index 34484a2..42eee33 100644 (file)
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -18,15 +18,13 @@ a copy of the GCC Runtime Library Exception along with this program;
  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  <http://www.gnu.org/licenses/>.  */
  
-/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
-   negatively, so enabling for Generic64 seems like good code size
-   tradeoff.  We can't enable it for 32bit generic because it does not
-   work well with PPro base chips.  */
+/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
  DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", 
           m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
  
  /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
-   Some chips, like 486 and Pentium have problems with these sequences.  */
+   Some chips, like 486 and Pentium, work faster with separate load
+   and push instructions.  */
  DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", 
            m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE 
            | m_GENERIC)
@@ -210,6 +208,16 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
  DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
            m_COREI7 | m_BDVER | m_SLM | m_GENERIC)
  
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned 256-bit AVX
+   loads should be split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", 
+          ~(m_COREI7 | m_GENERIC))
+
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned 256-bit AVX
+   stores should be split.  */
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", 
+          ~(m_COREI7 | m_BDVER | m_GENERIC))
+
  /* Use packed single precision instructions where posisble.  I.e. movups instead
     of movupd.  */
  DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
@@ -398,3 +406,24 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
     fp converts to destination register.  */
  DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
            m_SLM)
+
+/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
+   arguments in prologue/epilogue instead of separately for each call
+   by push/pop instructions.
+   This increases code size by about 5% in 32bit mode, less so in 64bit mode
+   because parameters are passed in registers.  It is a considerable
+   win for targets without a stack engine, where multiple push operations
+   cannot happen in parallel.
+
+   FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
+   Bobcat and Generic.  This is because disabling it causes a large
+   regression on mgrid due to an IRA limitation leading to unnecessary
+   use of the frame pointer in 32bit mode.  */
+DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", 
+         m_PPRO | m_P4_NOCONA | m_ATOM | m_SLM | m_AMD_MULTIPLE | m_GENERIC)
+
+/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
+   such as fsqrt, fprem, fsin, fcos, fsincos etc.
+   Should be enabled for all targets that always have a coprocessor.  */
+DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", 
+          ~(m_386 | m_486))
author	hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
	Sat, 19 Oct 2013 12:11:14 +0000 (12:11 +0000)
committer	hubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
	Sat, 19 Oct 2013 12:11:14 +0000 (12:11 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/i386/i386.c		patch \| blob \| history
gcc/config/i386/i386.h		patch \| blob \| history
gcc/config/i386/x86-tune.def		patch \| blob \| history