BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0, NONE)
/* Implemented by <sur><dotprod>_prod<dot_mode>. */
- BUILTIN_VB (TERNOP, sdot, 0, NONE)
- BUILTIN_VB (TERNOPU, udot, 0, NONE)
+ BUILTIN_VB (TERNOP, sdot_prod, 10, NONE)
+ BUILTIN_VB (TERNOPU, udot_prod, 10, NONE)
BUILTIN_VB (TERNOP_SUSS, usdot_prod, 10, NONE)
/* Implemented by aarch64_<sur><dotprod>_lane{q}<dot_mode>. */
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0, NONE)
DONE;
})
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "aarch64_<sur>dot<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS (match_operand:VS 1 "register_operand" "0")
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:<VSI2QI> 3 "register_operand" "w")]
- DOTPROD)))]
- "TARGET_DOTPROD"
- "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
- [(set_attr "type" "neon_dot<q>")]
-)
-
-;; These expands map to the Dot Product optab the vectorizer checks for.
+;; These expands map to the Dot Product optab the vectorizer checks for
+;; and to the intrinsics patttern.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
;; Given the following pattern
;; ...
;;
;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sur>dot_prod<vsi2qi>"
- [(set (match_operand:VS 0 "register_operand")
- (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
- (match_operand:<VSI2QI> 2 "register_operand")]
- DOTPROD)
- (match_operand:VS 3 "register_operand")))]
+(define_insn "<sur>dot_prod<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w")
+ (match_operand:<VSI2QI> 2 "register_operand" "w")]
+ DOTPROD)
+ (match_operand:VS 3 "register_operand" "0")))]
"TARGET_DOTPROD"
-{
- emit_insn (
- gen_aarch64_<sur>dot<vsi2qi> (operands[3], operands[3], operands[1],
- operands[2]));
- emit_insn (gen_rtx_SET (operands[0], operands[3]));
- DONE;
-})
+ "<sur>dot\\t%0.<Vtype>, %1.<Vdottype>, %2.<Vdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
;; (vector) Dot Product operation and the vectorized optab.
;; indexed operations.
(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
[(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS (match_operand:VS 1 "register_operand" "0")
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:V8QI 3 "register_operand" "<h_con>")
- (match_operand:SI 4 "immediate_operand" "i")]
- DOTPROD)))]
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)
+ (match_operand:VS 1 "register_operand" "0")))]
"TARGET_DOTPROD"
{
operands[4] = aarch64_endian_lane_rtx (V8QImode, INTVAL (operands[4]));
(define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
[(set (match_operand:VS 0 "register_operand" "=w")
- (plus:VS (match_operand:VS 1 "register_operand" "0")
- (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
- (match_operand:V16QI 3 "register_operand" "<h_con>")
- (match_operand:SI 4 "immediate_operand" "i")]
- DOTPROD)))]
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V16QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD)
+ (match_operand:VS 1 "register_operand" "0")))]
"TARGET_DOTPROD"
{
operands[4] = aarch64_endian_lane_rtx (V16QImode, INTVAL (operands[4]));
rtx ones = force_reg (V16QImode, CONST1_RTX (V16QImode));
rtx abd = gen_reg_rtx (V16QImode);
emit_insn (gen_aarch64_<sur>abdv16qi (abd, operands[1], operands[2]));
- emit_insn (gen_aarch64_udotv16qi (operands[0], operands[3],
- abd, ones));
+ emit_insn (gen_udot_prodv16qi (operands[0], abd, ones, operands[3]));
DONE;
}
rtx reduc = gen_reg_rtx (V8HImode);
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b)
{
- return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b);
+ return __builtin_aarch64_udot_prodv8qi_uuuu (__a, __b, __r);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
{
- return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b);
+ return __builtin_aarch64_udot_prodv16qi_uuuu (__a, __b, __r);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b)
{
- return __builtin_aarch64_sdotv8qi (__r, __a, __b);
+ return __builtin_aarch64_sdot_prodv8qi (__a, __b, __r);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
{
- return __builtin_aarch64_sdotv16qi (__r, __a, __b);
+ return __builtin_aarch64_sdot_prodv16qi (__a, __b, __r);
}
__extension__ extern __inline uint32x2_t