;; ---- [INT] Reciprocal approximation
;; ---- [INT<-FP] Base-2 logarithm
;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
;;
;; == Permutation
;; ---- [INT,FP] General permutes
"<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
)
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; Division optimizations using narrowings.
+;; We can do the division of, e.g., shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for a more detailed explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+ [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+ (match_operand:SVE_FULL_HSDI 1 "register_operand")
+ (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+ "TARGET_SVE2"
+{
+ /* This trick only applies when the divisor is the all-ones value of
+    the narrow element, i.e. 2^(half element width) - 1; bail out for
+    any other constant.  */
+ unsigned HOST_WIDE_INT size
+ = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+ rtx elt = unwrap_const_vec_duplicate (operands[2]);
+ if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
+ FAIL;
+
+ rtx addend = gen_reg_rtx (<MODE>mode);
+ rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+ rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+ /* A narrow-mode vector of 1s reinterpreted in the wide mode is the
+    constant 2^(half width) + 1 in each wide element (e.g. 0x0101 = 257
+    when the wide elements are HImode).  */
+ rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+ emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+ /* tmp1 = high halves of (x + 257-style addend), i.e. (x + 257) >> N
+    narrowed.  */
+ emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+ addend));
+ /* tmp2 = high halves of (x + tmp1), completing
+    (x + ((x + 257) >> N)) >> N with the final narrowing.  */
+ emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+ lowpart_subreg (<MODE>mode, tmp1,
+ <VNARROW>mode)));
+ emit_move_insn (operands[0],
+ lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+ DONE;
+})
+
;; =========================================================================
;; == Permutation
;; =========================================================================
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** mul z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** addhnb z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+ /* Division by 0xff = 2^8 - 1: the vectorizer should use the two-ADDHNB
+    sequence checked in the pattern above instead of a real division.  */
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+ /* Negative test: 0xfe is not of the form 2^n - 1, so the bitmask-udiv
+    expander must not trigger (no assembly pattern is checked here).  */
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** mul z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** addhnb z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
<br>
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+ /* Division by 0xffff = 2^16 - 1: 32-bit lanes narrowed twice via
+    ADDHNB, as checked in the pattern above.  */
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** mul z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** addhnb z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+ /* Division by 0xffffffff = 2^32 - 1: the product is widened to 64 bits
+    so the 64-bit-lane ADDHNB sequence applies.  */
+ for (int i = 0; i < (n & -16); i+=1)
+ pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}