[AARCH64] Implement Vector Permute Support.
author      jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>
            Wed, 5 Dec 2012 11:36:00 +0000 (11:36 +0000)
committer   jgreenhalgh <jgreenhalgh@138bc75d-0d04-0410-961f-82ee72b054a4>
            Wed, 5 Dec 2012 11:36:00 +0000 (11:36 +0000)
gcc/

* config/aarch64/aarch64-protos.h
(aarch64_split_combinev16qi): New.
(aarch64_expand_vec_perm): Likewise.
(aarch64_expand_vec_perm_const): Likewise.
* config/aarch64/aarch64-simd.md (vec_perm_const<mode>): New.
(vec_perm<mode>): Likewise.
(aarch64_tbl1<mode>): Likewise.
(aarch64_tbl2v16qi): Likewise.
(aarch64_combinev16qi): New.
* config/aarch64/aarch64.c
(aarch64_vectorize_vec_perm_const_ok): New.
(aarch64_split_combinev16qi): Likewise.
(MAX_VECT_LEN): Define.
(expand_vec_perm_d): New.
(aarch64_expand_vec_perm_1): Likewise.
(aarch64_expand_vec_perm): Likewise.
(aarch64_evpc_tbl): Likewise.
(aarch64_expand_vec_perm_const_1): Likewise.
(aarch64_expand_vec_perm_const): Likewise.
(aarch64_vectorize_vec_perm_const_ok): Likewise.
(TARGET_VECTORIZE_VEC_PERM_CONST_OK): Likewise.
* config/aarch64/iterators.md
(unspec): Add UNSPEC_TBL, UNSPEC_CONCAT.
(V_cmp_result): Add mapping for V2DF.

gcc/testsuite/

* lib/target-supports.exp
(check_effective_target_vect_perm): Allow aarch64*-*-*.
(check_effective_target_vect_perm_byte): Likewise.
(check_effective_target_vect_perm_short): Likewise.
(check_effective_target_vect_char_mult): Likewise.
(check_effective_target_vect_extract_even_odd): Likewise.
(check_effective_target_vect_interleave): Likewise.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@194218 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/config/aarch64/aarch64-protos.h
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/aarch64.c
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/lib/target-supports.exp
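
As an aside for readers of the patch: the new vec_perm<mode> and
vec_perm_const<mode> expanders are what let the middle end map vector
shuffles onto TBL.  A minimal illustration in C (hypothetical example
code, not part of the patch):

    #include <stdint.h>

    typedef uint8_t v16qi __attribute__ ((vector_size (16)));

    /* A compile-time-constant selector like this is routed through
       vec_perm_const<mode>, and so through
       aarch64_expand_vec_perm_const; a selector computed at run time
       goes through vec_perm<mode> instead.  */
    v16qi
    reverse_bytes (v16qi x)
    {
      const v16qi sel = { 15, 14, 13, 12, 11, 10, 9, 8,
                          7, 6, 5, 4, 3, 2, 1, 0 };
      return __builtin_shuffle (x, sel);
    }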

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 16bcc2d..370149c 100644
@@ -1,3 +1,30 @@
+2012-12-05  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * config/aarch64/aarch64-protos.h
+       (aarch64_split_combinev16qi): New.
+       (aarch64_expand_vec_perm): Likewise.
+       (aarch64_expand_vec_perm_const): Likewise.
+       * config/aarch64/aarch64-simd.md (vec_perm_const<mode>): New.
+       (vec_perm<mode>): Likewise.
+       (aarch64_tbl1<mode>): Likewise.
+       (aarch64_tbl2v16qi): Likewise.
+       (aarch64_combinev16qi): New.
+       * config/aarch64/aarch64.c
+       (aarch64_vectorize_vec_perm_const_ok): New.
+       (aarch64_split_combinev16qi): Likewise.
+       (MAX_VECT_LEN): Define.
+       (expand_vec_perm_d): New.
+       (aarch64_expand_vec_perm_1): Likewise.
+       (aarch64_expand_vec_perm): Likewise.
+       (aarch64_evpc_tbl): Likewise.
+       (aarch64_expand_vec_perm_const_1): Likewise.
+       (aarch64_expand_vec_perm_const): Likewise.
+       (aarch64_vectorize_vec_perm_const_ok): Likewise.
+       (TARGET_VECTORIZE_VEC_PERM_CONST_OK): Likewise.
+       * config/aarch64/iterators.md
+       (unspec): Add UNSPEC_TBL, UNSPEC_CONCAT.
+       (V_cmp_result): Add mapping for V2DF.
+
 2012-12-05  Yufeng Zhang  <yufeng.zhang@arm.com>
 
        * config/aarch64/aarch64.c (aarch64_simd_mangle_map_entry): New
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 2110970..bcd3bb1 100644
@@ -241,4 +241,9 @@ aarch64_builtin_vectorized_function (tree fndecl,
                                     tree type_out,
                                     tree type_in);
 
+extern void aarch64_split_combinev16qi (rtx operands[3]);
+extern void aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
+extern bool
+aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 8c9ceac..baee0cc 100644
 
 ;; Permuted-store expanders for neon intrinsics.
 
+;; Permute instructions
+
+;; vec_perm support
+
+(define_expand "vec_perm_const<mode>"
+  [(match_operand:VALL 0 "register_operand")
+   (match_operand:VALL 1 "register_operand")
+   (match_operand:VALL 2 "register_operand")
+   (match_operand:<V_cmp_result> 3)]
+  "TARGET_SIMD"
+{
+  if (aarch64_expand_vec_perm_const (operands[0], operands[1],
+                                    operands[2], operands[3]))
+    DONE;
+  else
+    FAIL;
+})
+
+(define_expand "vec_perm<mode>"
+  [(match_operand:VB 0 "register_operand")
+   (match_operand:VB 1 "register_operand")
+   (match_operand:VB 2 "register_operand")
+   (match_operand:VB 3 "register_operand")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vec_perm (operands[0], operands[1],
+                          operands[2], operands[3]);
+  DONE;
+})
+
+(define_insn "aarch64_tbl1<mode>"
+  [(set (match_operand:VB 0 "register_operand" "=w")
+       (unspec:VB [(match_operand:V16QI 1 "register_operand" "w")
+                   (match_operand:VB 2 "register_operand" "w")]
+                  UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%0.<Vtype>, {%1.16b}, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_tbl")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+;; Two source registers.
+
+(define_insn "aarch64_tbl2v16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI [(match_operand:OI 1 "register_operand" "w")
+                      (match_operand:V16QI 2 "register_operand" "w")]
+                     UNSPEC_TBL))]
+  "TARGET_SIMD"
+  "tbl\\t%0.16b, {%S1.16b - %T1.16b}, %2.16b"
+  [(set_attr "simd_type" "simd_tbl")
+   (set_attr "simd_mode" "V16QI")]
+)
+
+(define_insn_and_split "aarch64_combinev16qi"
+  [(set (match_operand:OI 0 "register_operand" "=w")
+       (unspec:OI [(match_operand:V16QI 1 "register_operand" "w")
+                   (match_operand:V16QI 2 "register_operand" "w")]
+                  UNSPEC_CONCAT))]
+  "TARGET_SIMD"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  aarch64_split_combinev16qi (operands);
+  DONE;
+})
+
 (define_insn "aarch64_st2<mode>_dreg"
   [(set (match_operand:TI 0 "aarch64_simd_struct_operand" "=Utv")
        (unspec:TI [(match_operand:OI 1 "register_operand" "w")
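
The aarch64_tbl1<mode> and aarch64_tbl2v16qi patterns above wrap the
TBL instruction, whose defining quirk is that a selector byte outside
the table range yields zero rather than wrapping.  A rough C model of
the one-register form (a sketch of the architectural semantics, not
code from this patch):

    #include <stdint.h>

    /* TBL with one 16-byte table register: each selector byte indexes
       the table when in range and produces 0 otherwise.  This zeroing
       behaviour is why aarch64_expand_vec_perm (in aarch64.c, below)
       masks a variable selector before emitting TBL.  */
    static void
    tbl1_model (uint8_t dst[16], const uint8_t table[16],
                const uint8_t sel[16])
    {
      for (int i = 0; i < 16; i++)
        dst[i] = sel[i] < 16 ? table[sel[i]] : 0;
    }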
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3fa4375..ae1a037 100644
@@ -110,6 +110,9 @@ static unsigned bit_count (unsigned HOST_WIDE_INT);
 static bool aarch64_const_vec_all_same_int_p (rtx,
                                              HOST_WIDE_INT, HOST_WIDE_INT);
 
+static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                                const unsigned char *sel);
+
 /* The processor for which instructions should be scheduled.  */
 enum aarch64_processor aarch64_tune = generic;
 
@@ -6782,6 +6785,292 @@ aarch64_c_mode_for_suffix (char suffix)
   return VOIDmode;
 }
 
+/* Split operands into moves from op[1] + op[2] into op[0].  */
+
+void
+aarch64_split_combinev16qi (rtx operands[3])
+{
+  unsigned int dest = REGNO (operands[0]);
+  unsigned int src1 = REGNO (operands[1]);
+  unsigned int src2 = REGNO (operands[2]);
+  enum machine_mode halfmode = GET_MODE (operands[1]);
+  unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
+  rtx destlo, desthi;
+
+  gcc_assert (halfmode == V16QImode);
+
+  if (src1 == dest && src2 == dest + halfregs)
+    {
+      /* No-op move.  Can't split to nothing; emit something.  */
+      emit_note (NOTE_INSN_DELETED);
+      return;
+    }
+
+  /* Preserve register attributes for variable tracking.  */
+  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
+  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
+                              GET_MODE_SIZE (halfmode));
+
+  /* Special case of reversed high/low parts: swap the two inputs in
+     place with three XORs, avoiding the need for a scratch register.  */
+  if (reg_overlap_mentioned_p (operands[2], destlo)
+      && reg_overlap_mentioned_p (operands[1], desthi))
+    {
+      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
+      emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
+      emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
+    }
+  else if (!reg_overlap_mentioned_p (operands[2], destlo))
+    {
+      /* Try to avoid unnecessary moves if part of the result
+        is in the right place already.  */
+      if (src1 != dest)
+       emit_move_insn (destlo, operands[1]);
+      if (src2 != dest + halfregs)
+       emit_move_insn (desthi, operands[2]);
+    }
+  else
+    {
+      if (src2 != dest + halfregs)
+       emit_move_insn (desthi, operands[2]);
+      if (src1 != dest)
+       emit_move_insn (destlo, operands[1]);
+    }
+}
+
+/* vec_perm support.  */
+
+#define MAX_VECT_LEN 16
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
+
+/* Generate a variable permutation.  */
+
+static void
+aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == vmode);
+  gcc_checking_assert (GET_MODE (op1) == vmode);
+  gcc_checking_assert (GET_MODE (sel) == vmode);
+  gcc_checking_assert (TARGET_SIMD);
+
+  if (one_vector_p)
+    {
+      if (vmode == V8QImode)
+       {
+         /* Expand the argument to a V16QI mode by duplicating it.  */
+         rtx pair = gen_reg_rtx (V16QImode);
+         emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
+         emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
+       }
+      else
+       {
+         emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
+       }
+    }
+  else
+    {
+      rtx pair;
+
+      if (vmode == V8QImode)
+       {
+         pair = gen_reg_rtx (V16QImode);
+         emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
+         emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
+       }
+      else
+       {
+         pair = gen_reg_rtx (OImode);
+         emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
+         emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
+       }
+    }
+}
+
+void
+aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int i, nelt = GET_MODE_NUNITS (vmode);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+  rtx rmask[MAX_VECT_LEN], mask;
+
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
+
+  /* The TBL instruction does not use a modulo index, so we must take care
+     of that ourselves.  */
+  mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
+  for (i = 0; i < nelt; ++i)
+    rmask[i] = mask;
+  mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
+  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
+
+  aarch64_expand_vec_perm_1 (target, op0, op1, sel);
+}
+
+static bool
+aarch64_evpc_tbl (struct expand_vec_perm_d *d)
+{
+  rtx rperm[MAX_VECT_LEN], sel;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, nelt = d->nelt;
+
+  /* TODO: ARM's TBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generic code will try constant permutation twice: once with the
+     original mode and again with the elements lowered to QImode.  So
+     defer and don't do the selector expansion ourselves.  */
+  if (vmode != V8QImode && vmode != V16QImode)
+    return false;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (d->perm[i]);
+  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+  sel = force_reg (vmode, sel);
+
+  aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  return true;
+}
+
+static bool
+aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* The pattern matching functions above are written to look for a small
+     number to begin the sequence (0, 1, N/2).  If we begin with an index
+     from the second operand, we can swap the operands.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned i, nelt = d->nelt;
+      rtx x;
+
+      for (i = 0; i < nelt; ++i)
+       d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
+
+      x = d->op0;
+      d->op0 = d->op1;
+      d->op1 = x;
+    }
+
+  if (TARGET_SIMD)
+    return aarch64_evpc_tbl (d);
+  return false;
+}
+
+/* Expand a vec_perm_const pattern.  */
+
+bool
+aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable ();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (op0, op1))
+       break;
+
+      /* The elements of PERM do not suggest that only the first operand
+        is used, but both operands are identical.  Allow easier matching
+        of the permutation by folding the permutation into the single
+        input vector.  */
+      /* Fall through.  */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+       d.perm[i] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return aarch64_expand_vec_perm_const_1 (&d);
+}
+
+static bool
+aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                    const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Calculate whether all elements are in one vector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* If all elements are from the second vector, reindex as if from the
+     first vector.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to a single vector.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = aarch64_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -6985,6 +7274,12 @@ aarch64_c_mode_for_suffix (char suffix)
 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
   aarch64_simd_vector_alignment_reachable
 
+/* vec_perm support.  */
+
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  aarch64_vectorize_vec_perm_const_ok
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-aarch64.h"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 31bc977..7cd4cef 100644
     UNSPEC_FMAX                ; Used in aarch64-simd.md.
     UNSPEC_FMIN                ; Used in aarch64-simd.md.
     UNSPEC_BSL         ; Used in aarch64-simd.md.
+    UNSPEC_TBL         ; Used in vector permute patterns.
+    UNSPEC_CONCAT      ; Used in vector permute patterns.
 ])
 
 ;; -------------------------------------------------------------------
 (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
                                (V4HI "V4HI") (V8HI  "V8HI")
                                (V2SI "V2SI") (V4SI  "V4SI")
+                               (DI   "DI")   (V2DI  "V2DI")
                                (V2SF "V2SI") (V4SF  "V4SI")
-                               (DI   "DI")   (V2DI  "V2DI")])
+                               (V2DF "V2DI")])
 
 ;; Vm for lane instructions is restricted to FP_LO_REGS.
 (define_mode_attr vwx [(V4HI "x") (V8HI "x") (HI "x")
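
The new V2DF entry in V_cmp_result matters because vec_perm_const's
selector (operand 3) must be an integer vector with the same element
count and width as the data mode; V_cmp_result supplies V2DI for
V2DF.  In user-level terms (hypothetical code, not from the patch):

    typedef double v2df __attribute__ ((vector_size (16)));
    typedef long long v2di __attribute__ ((vector_size (16)));

    /* Shuffling a V2DF value takes a V2DI selector, the mode the new
       V_cmp_result mapping provides.  */
    v2df
    swap_halves (v2df x)
    {
      return __builtin_shuffle (x, (v2di) { 1, 0 });
    }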
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 65c2ae3..b010c11 100644
@@ -1,3 +1,13 @@
+2012-12-05  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * lib/target-supports.exp
+       (check_effective_target_vect_perm): Allow aarch64*-*-*.
+       (check_effective_target_vect_perm_byte): Likewise.
+       (check_effective_target_vect_perm_short): Likewise.
+       (check_effective_target_vect_char_mult): Likewise.
+       (check_effective_target_vect_extract_even_odd): Likewise.
+       (check_effective_target_vect_interleave): Likewise.
+
 2012-12-05  Yufeng Zhang  <yufeng.zhang@arm.com>
 
        * g++.dg/abi/mangle-neon-aarch64.C: New test.
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 59d854f..5e830b1 100644
@@ -3014,6 +3014,7 @@ proc check_effective_target_vect_perm { } {
     } else {
         set et_vect_perm_saved 0
         if { [is-effective-target arm_neon_ok]
+            || [istarget aarch64*-*-*]
             || [istarget powerpc*-*-*]
              || [istarget spu-*-*]
             || [istarget i?86-*-*]
@@ -3040,6 +3041,7 @@ proc check_effective_target_vect_perm_byte { } {
     } else {
         set et_vect_perm_byte_saved 0
         if { [is-effective-target arm_neon_ok]
+            || [istarget aarch64*-*-*]
             || [istarget powerpc*-*-*]
              || [istarget spu-*-*] } {
             set et_vect_perm_byte_saved 1
@@ -3062,6 +3064,7 @@ proc check_effective_target_vect_perm_short { } {
     } else {
         set et_vect_perm_short_saved 0
         if { [is-effective-target arm_neon_ok]
+            || [istarget aarch64*-*-*]
             || [istarget powerpc*-*-*]
              || [istarget spu-*-*] } {
             set et_vect_perm_short_saved 1
@@ -3697,7 +3700,8 @@ proc check_effective_target_vect_char_mult { } {
        verbose "check_effective_target_vect_char_mult: using cached result" 2
     } else {
        set et_vect_char_mult_saved 0
-       if { [istarget ia64-*-*]
+       if { [istarget aarch64*-*-*]
+            || [istarget ia64-*-*]
             || [istarget i?86-*-*]
             || [istarget x86_64-*-*]
             || [check_effective_target_arm32] } {
@@ -3768,8 +3772,9 @@ proc check_effective_target_vect_extract_even_odd { } {
         verbose "check_effective_target_vect_extract_even_odd: using cached result" 2
     } else {
         set et_vect_extract_even_odd_saved 0 
-        if { [istarget powerpc*-*-*] 
-            || [is-effective-target arm_neon_ok]
+       if { [istarget aarch64*-*-*]
+            || [istarget powerpc*-*-*]
+            || [is-effective-target arm_neon_ok]
              || [istarget i?86-*-*]
              || [istarget x86_64-*-*]
              || [istarget ia64-*-*]
@@ -3793,8 +3798,9 @@ proc check_effective_target_vect_interleave { } {
         verbose "check_effective_target_vect_interleave: using cached result" 2
     } else {
         set et_vect_interleave_saved 0
-        if { [istarget powerpc*-*-*]
-            || [is-effective-target arm_neon_ok]
+       if { [istarget aarch64*-*-*]
+            || [istarget powerpc*-*-*]
+            || [is-effective-target arm_neon_ok]
              || [istarget i?86-*-*]
              || [istarget x86_64-*-*]
              || [istarget ia64-*-*]
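
With aarch64*-*-* accepted by these effective-target checks, the
permute-dependent tests in the vectorizer testsuite now run on
AArch64.  A sketch of the shape such a guarded test takes (the loop
and directives are illustrative, not taken from the patch):

    /* { dg-do compile } */
    /* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
    /* { dg-require-effective-target vect_perm } */

    #define N 64
    unsigned char in[N], out[N];

    /* Swapping adjacent elements requires a permutation, so this loop
       vectorizes only where vect_perm holds.  */
    void
    foo (void)
    {
      for (int i = 0; i < N; i += 2)
        {
          out[i] = in[i + 1];
          out[i + 1] = in[i];
        }
    }

    /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */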