/* Generic costs for SVE vector operations. */
static const sve_vec_cost generic_sve_vector_cost =
{
- 1, /* int_stmt_cost */
- 1, /* fp_stmt_cost */
- 2, /* permute_cost */
- 2, /* reduc_i8_cost */
- 2, /* reduc_i16_cost */
- 2, /* reduc_i32_cost */
- 2, /* reduc_i64_cost */
- 2, /* reduc_f16_cost */
- 2, /* reduc_f32_cost */
- 2, /* reduc_f64_cost */
- 2, /* vec_to_scalar_cost */
- 1, /* scalar_to_vec_cost */
- 1, /* align_load_cost */
- 1, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
+ {
+ 1, /* int_stmt_cost */
+ 1, /* fp_stmt_cost */
+ 2, /* permute_cost */
+ 2, /* reduc_i8_cost */
+ 2, /* reduc_i16_cost */
+ 2, /* reduc_i32_cost */
+ 2, /* reduc_i64_cost */
+ 2, /* reduc_f16_cost */
+ 2, /* reduc_f32_cost */
+ 2, /* reduc_f64_cost */
+ 2, /* vec_to_scalar_cost */
+ 1, /* scalar_to_vec_cost */
+ 1, /* align_load_cost */
+ 1, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 2, /* clast_cost */
+ 2, /* fadda_f16_cost */
+ 2, /* fadda_f32_cost */
+ 2 /* fadda_f64_cost */
};
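The sve_vec_cost type itself is not part of this excerpt (it lives alongside simd_vec_cost, presumably in aarch64-protos.h). Judging from the initializers above, its assumed shape is roughly the following sketch: the existing simd_vec_cost fields form a base subobject, followed by the new SVE-only latencies.
/* Sketch only: assumed shape of the cost structure that the nested
   initializers above are filling in.  The simd_vec_cost members come
   first as the base class, then the SVE-specific latencies.  */
struct sve_vec_cost : simd_vec_cost
{
  /* Cycle latency assumed for a vector-to-scalar CLASTA/CLASTB.  */
  unsigned int clast_cost;

  /* Cycle latencies assumed for FADDA on each supported element size.  */
  unsigned int fadda_f16_cost;
  unsigned int fadda_f32_cost;
  unsigned int fadda_f64_cost;
};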
/* A64FX costs for SVE vector operations. */
static const sve_vec_cost a64fx_sve_vector_cost =
{
- 2, /* int_stmt_cost */
- 5, /* fp_stmt_cost */
- 3, /* permute_cost */
- 13, /* reduc_i8_cost */
- 13, /* reduc_i16_cost */
- 13, /* reduc_i32_cost */
- 13, /* reduc_i64_cost */
- 13, /* reduc_f16_cost */
- 13, /* reduc_f32_cost */
- 13, /* reduc_f64_cost */
- 13, /* vec_to_scalar_cost */
- 4, /* scalar_to_vec_cost */
- 6, /* align_load_cost */
- 6, /* unalign_load_cost */
- 1, /* unalign_store_cost */
- 1 /* store_cost */
+ {
+ 2, /* int_stmt_cost */
+ 5, /* fp_stmt_cost */
+ 3, /* permute_cost */
+ 13, /* reduc_i8_cost */
+ 13, /* reduc_i16_cost */
+ 13, /* reduc_i32_cost */
+ 13, /* reduc_i64_cost */
+ 13, /* reduc_f16_cost */
+ 13, /* reduc_f32_cost */
+ 13, /* reduc_f64_cost */
+ 13, /* vec_to_scalar_cost */
+ 4, /* scalar_to_vec_cost */
+ 6, /* align_load_cost */
+ 6, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+ },
+ 13, /* clast_cost */
+ 13, /* fadda_f16_cost */
+ 13, /* fadda_f32_cost */
+ 13 /* fadda_f64_cost */
};
static const struct cpu_vector_cost a64fx_vector_cost =
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
}
+/* If STMT_INFO describes a reduction, return the type of reduction
+ it describes, otherwise return -1. */
+static int
+aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
+{
+ if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
+ if (STMT_VINFO_REDUC_DEF (stmt_info))
+ {
+ stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
+ return int (STMT_VINFO_REDUC_TYPE (reduc_info));
+ }
+ return -1;
+}
+
/* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
vectors would produce a series of LDP or STP operations. KIND is the
kind of statement that STMT_INFO represents. */
&& TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
}
+/* We are considering implementing STMT_INFO using SVE vector type VECTYPE.
+ If STMT_INFO is an in-loop reduction that SVE supports directly, return
+ its latency in cycles, otherwise return zero. SVE_COSTS specifies the
+ latencies of the relevant instructions. */
+static unsigned int
+aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
+ stmt_vec_info stmt_info,
+ tree vectype,
+ const sve_vec_cost *sve_costs)
+{
+ switch (aarch64_reduc_type (vinfo, stmt_info))
+ {
+ case EXTRACT_LAST_REDUCTION:
+ return sve_costs->clast_cost;
+
+ case FOLD_LEFT_REDUCTION:
+ switch (GET_MODE_INNER (TYPE_MODE (vectype)))
+ {
+ case E_HFmode:
+ case E_BFmode:
+ return sve_costs->fadda_f16_cost;
+
+ case E_SFmode:
+ return sve_costs->fadda_f32_cost;
+
+ case E_DFmode:
+ return sve_costs->fadda_f64_cost;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ return 0;
+}
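For context, the two cases above correspond to the source-level patterns sketched below; these examples are illustrative and not part of the patch. A conditional "last matching value" loop becomes an EXTRACT_LAST_REDUCTION (CLASTB), and a strictly-ordered FP accumulation compiled without -ffast-math becomes a FOLD_LEFT_REDUCTION (FADDA).
/* Illustrative only.  With SVE, the last value stored under the
   condition is extracted with CLASTB, so the reduction is costed
   via clast_cost.  */
int
last_match (const int *a, const int *b, int n)
{
  int last = -1;
  for (int i = 0; i < n; ++i)
    if (a[i] == b[i])
      last = b[i];
  return last;
}

/* Illustrative only.  Without -ffast-math the additions must stay in
   source order, so the loop is vectorized as an in-order reduction
   using FADDA and costed via fadda_f64_cost.  */
double
strict_sum (const double *x, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; ++i)
    s += x[i];
  return s;
}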
+
/* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
for the vectorized form of STMT_INFO, which has cost kind KIND and which
when vectorized would operate on vector type VECTYPE. Try to subdivide
the target-independent categorization provided by KIND to get a more
accurate cost. WHERE specifies where the cost associated with KIND
occurs. */
static unsigned int
-aarch64_detect_vector_stmt_subtype (vect_cost_for_stmt kind,
+aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, tree vectype,
enum vect_cost_model_location where,
unsigned int stmt_cost)
{
const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
+ const sve_vec_cost *sve_costs = nullptr;
+ if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
+ sve_costs = aarch64_tune_params.vec_costs->sve;
+
+ /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
+ if (kind == vec_to_scalar
+ && where == vect_body
+ && sve_costs)
+ {
+ unsigned int latency
+ = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
+ sve_costs);
+ if (latency)
+ return latency;
+ }
/* Detect cases in which vec_to_scalar represents a single reduction
instruction like FADDP or MAXV. */
/* Try to get a more accurate cost by looking at STMT_INFO instead
of just looking at KIND. */
if (stmt_info && vectype && aarch64_use_new_vector_costs_p ())
- stmt_cost = aarch64_detect_vector_stmt_subtype (kind, stmt_info,
- vectype, where,
- stmt_cost);
+ stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind,
+ stmt_info, vectype,
+ where, stmt_cost);
/* Do any SVE-specific adjustments to the cost. */
if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))