AArch64: Add SVE2 implementation for pow2 bitmask division

author Tamar Christina <tamar.christina@arm.com>

Mon, 14 Nov 2022 15:53:42 +0000 (15:53 +0000)

committer Tamar Christina <tamar.christina@arm.com>

Mon, 14 Nov 2022 17:41:33 +0000 (17:41 +0000)
author Tamar Christina <tamar.christina@arm.com>
Mon, 14 Nov 2022 15:53:42 +0000 (15:53 +0000)
committer Tamar Christina <tamar.christina@arm.com>
Mon, 14 Nov 2022 17:41:33 +0000 (17:41 +0000)
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md

index 5df38e3..dadd046 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
  ;; ---- [INT] Reciprocal approximation
  ;; ---- [INT<-FP] Base-2 logarithm
  ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
  ;;
  ;; == Permutation
  ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,46 @@
    "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
  )
  
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; Division optimizations using narrowing operations.
+;; We can divide, e.g., shorts by 255 faster by calculating the result as
+;; (x + ((x + 257) >> 8)) >> 8, assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for a fuller explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size  /* All-ones mask of one <VNARROW> element.  */
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  rtx elt = unwrap_const_vec_duplicate (operands[2]);
+  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+    FAIL;  /* Only a constant divisor equal to that mask is expanded.  */
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);  /* Narrow-mode vector of 1s.  */
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));  /* Viewed as <MODE>.  */
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],  /* tmp1 = ADDHNB (op1, addend).  */
+                             addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],  /* tmp2 = ADDHNB (op1, tmp1).  */
+                             lowpart_subreg (<MODE>mode, tmp1,
+                                             <VNARROW>mode)));
+  emit_move_insn (operands[0],  /* Result is tmp2 viewed as <MODE>.  */
+                 lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
  ;; =========================================================================
  ;; == Permutation
  ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c

new file mode 100644 (file)

index 0000000..e6f5098
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**     mul     z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**     addhnb  z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**     addhnb  z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)  /* Bound is n with its low four bits cleared.  */
+    pixel[i] = (pixel[i] * level) / 0xff;  /* Divisor is the all-ones byte mask.  */
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)  /* Bound is n with its low four bits cleared.  */
+    pixel[i] = (pixel[i] * level) / 0xfe;  /* Not all-ones; no expected-asm block here, presumably a negative case - confirm.  */
+}
+
+/*
+** draw_bitmap3:
+** ...
+**     mul     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**     addhnb  z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**     addhnb  z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)  /* Bound is n with its low four bits cleared.  */
+    pixel[i] = (pixel[i] * level) / 0xffffU;  /* Divisor is the all-ones 16-bit mask.  */
+}
+
+/*
+** draw_bitmap4:
+** ...
+**     mul     z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**     addhnb  z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**     addhnb  z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)  /* Bound is n with its low four bits cleared.  */
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;  /* uint64_t product; divisor is the all-ones 32-bit mask.  */
+}
author	Tamar Christina <tamar.christina@arm.com>
	Mon, 14 Nov 2022 15:53:42 +0000 (15:53 +0000)
committer	Tamar Christina <tamar.christina@arm.com>
	Mon, 14 Nov 2022 17:41:33 +0000 (17:41 +0000)
gcc/config/aarch64/aarch64-sve2.md		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c	[new file with mode: 0644]	patch \| blob