This adds optabs implementing usdot_prod.
The following testcase:
#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned
/* Sum the products of signed A[i] and unsigned B[i] into RES.  With 8-bit
   char and 16-bit short every product fits in MULT without truncation.  */
SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
SIGNEDNESS_4 char *restrict b)
{
for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
int av = a[i];
int bv = b[i];
SIGNEDNESS_2 short mult = av * bv;
res += mult;
}
return res;
}
Generates
f:
vmov.i32 q8, #0 @ v4si
add r3, r2, #480
.L2:
vld1.8 {q10}, [r2]!
vld1.8 {q9}, [r1]!
vusdot.s8 q8, q9, q10
cmp r3, r2
bne .L2
vadd.i32 d16, d16, d17
vpadd.i32 d16, d16, d16
vmov.32 r3, d16[0]
add r0, r0, r3
bx lr
instead of
f:
vmov.i32 q8, #0 @ v4si
add r3, r2, #480
.L2:
vld1.8 {q9}, [r2]!
vld1.8 {q11}, [r1]!
cmp r3, r2
vmull.s8 q10, d18, d22
vmull.s8 q9, d19, d23
vaddw.s16 q8, q8, d20
vaddw.s16 q8, q8, d21
vaddw.s16 q8, q8, d18
vaddw.s16 q8, q8, d19
bne .L2
vadd.i32 d16, d16, d17
vpadd.i32 d16, d16, d16
vmov.32 r3, d16[0]
add r0, r0, r3
bx lr
on NEON. I couldn't figure out whether the MVE instruction vmlaldav.s16 could
be used to emulate this. Because it would require additional widening to work,
I left MVE out of this patch set, but perhaps someone should take a look.
gcc/ChangeLog:
* config/arm/neon.md (usdot_prod<vsi2qi>): New.
gcc/testsuite/ChangeLog:
* gcc.target/arm/simd/vusdot-autovec.c: New test.
DONE;
})
+;; Auto-vectorizer pattern for usdot: op0 = UNSPEC_DOT_US (op1, op2) + op3.
+(define_expand "usdot_prod<vsi2qi>"
+ [(set (match_operand:VCVTI 0 "register_operand")
+ (plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
+ "register_operand")
+ (match_operand:<VSI2QI> 2
+ "register_operand")]
+ UNSPEC_DOT_US)
+ (match_operand:VCVTI 3 "register_operand")))]
+ "TARGET_I8MM"
+)
+
(define_expand "neon_copysignf<mode>"
[(match_operand:VCVTF 0 "register_operand")
(match_operand:VCVTF 1 "register_operand")
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+i8mm" } */
+
+#define N 480
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+/* Reduce the products of signed A[i] and unsigned B[i] into RES.  With
+   8-bit char and 16-bit short every product fits in MULT, so the narrowing
+   loses nothing.  The dg-final scan below expects a vusdot.s8 for this
+   loop.  */
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+ SIGNEDNESS_4 char *restrict b)
+{
+ for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+ {
+ int av = a[i];
+ int bv = b[i];
+ SIGNEDNESS_2 short mult = av * bv;
+ res += mult;
+ }
+ return res;
+}
+/* Same reduction as f, but with the parameter names swapped: here B is
+   the signed array and A the unsigned one, so the unsigned value is the
+   first operand of the multiplication.  The dg-final scan below expects a
+   vusdot.s8 for this loop as well.  */
+SIGNEDNESS_1 int __attribute__ ((noipa))
+g (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict b,
+ SIGNEDNESS_4 char *restrict a)
+{
+ for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+ {
+ int av = a[i];
+ int bv = b[i];
+ SIGNEDNESS_2 short mult = av * bv;
+ res += mult;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {vusdot.s8} 2 { target { arm-*-*-gnueabihf } } } } */