Add support for the intrinsics of TDPBSUD/TDPBUSD/TDPBUUD (AMX-INT8 tile dot-product instructions).
Differential Revision: https://reviews.llvm.org/D97259
// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+// New AMX-INT8 dot-product builtins; signature string matches tdpbssd above:
+// V256i = <256 x i32> result, Us = unsigned short (three shape arguments),
+// followed by the three <256 x i32> tile operands (dst, src1, src2).
+TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
+TARGET_BUILTIN(__builtin_ia32_tdpbuud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
// AMX
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
+/// Thin wrapper over __builtin_ia32_tdpbsud_internal (the TDPBSUD tile
+/// dot-product-accumulate). Shape args (m, n, k): callers in this patch pass
+/// src1.row, src2.col, src1.col. NOTE(review): per the mnemonic this is
+/// presumably signed(src1) x unsigned(src2) bytes -- confirm against the
+/// Intel AMX-INT8 ISA reference.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
+}
+
+/// Thin wrapper over __builtin_ia32_tdpbusd_internal (the TDPBUSD tile
+/// dot-product-accumulate). Shape args (m, n, k): callers in this patch pass
+/// src1.row, src2.col, src1.col. NOTE(review): per the mnemonic this is
+/// presumably unsigned(src1) x signed(src2) bytes -- confirm against the
+/// Intel AMX-INT8 ISA reference.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
+}
+
+/// Thin wrapper over __builtin_ia32_tdpbuud_internal (the TDPBUUD tile
+/// dot-product-accumulate). Shape args (m, n, k): callers in this patch pass
+/// src1.row, src2.col, src1.col. NOTE(review): per the mnemonic this is
+/// presumably unsigned(src1) x unsigned(src2) bytes -- confirm against the
+/// Intel AMX-INT8 ISA reference.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
+}
+
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
src1.tile, src2.tile);
}
+/// Tile-struct API: accumulate the TDPBSUD dot-product of src1 x src2 into
+/// dst->tile. Shape is (src1.row, src2.col) with inner dimension src1.col;
+/// callers are assumed to guarantee src1.col matches src2.row (not checked
+/// here).
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {
+ dst->tile = _tile_dpbsud_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
+/// Tile-struct API: accumulate the TDPBUSD dot-product of src1 x src2 into
+/// dst->tile. Shape is (src1.row, src2.col) with inner dimension src1.col;
+/// callers are assumed to guarantee src1.col matches src2.row (not checked
+/// here).
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbusd(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {
+ dst->tile = _tile_dpbusd_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
+/// Tile-struct API: accumulate the TDPBUUD dot-product of src1 x src2 into
+/// dst->tile. Shape is (src1.row, src2.col) with inner dimension src1.col;
+/// callers are assumed to guarantee src1.col matches src2.row (not checked
+/// here).
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbuud(__tile1024i *dst, __tile1024i src1,
+ __tile1024i src2) {
+ dst->tile = _tile_dpbuud_internal(src1.row, src2.col, src1.col, dst->tile,
+ src1.tile, src2.tile);
+}
+
__DEFAULT_FN_ATTRS_TILE
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
__tile_dpbssd(&c, a, b);
}
+// Verify __tile_dpbsud lowers to a call of @llvm.x86.tdpbsud.internal whose
+// x86_amx result is immediately bitcast back to <256 x i32>.
+void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_dpbsud
+ //CHECK: call x86_amx @llvm.x86.tdpbsud.internal
+ //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+ __tile_dpbsud(&c, a, b);
+}
+
+// Verify __tile_dpbusd lowers to a call of @llvm.x86.tdpbusd.internal whose
+// x86_amx result is immediately bitcast back to <256 x i32>.
+void test_tile_dpbusd(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_dpbusd
+ //CHECK: call x86_amx @llvm.x86.tdpbusd.internal
+ //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+ __tile_dpbusd(&c, a, b);
+}
+
+// Verify __tile_dpbuud lowers to a call of @llvm.x86.tdpbuud.internal whose
+// x86_amx result is immediately bitcast back to <256 x i32>.
+void test_tile_dpbuud(__tile1024i a, __tile1024i b, __tile1024i c) {
+ //CHECK-LABEL: @test_tile_dpbuud
+ //CHECK: call x86_amx @llvm.x86.tdpbuud.internal
+ //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+ __tile_dpbuud(&c, a, b);
+}
+
void test_tile_stored(__tile1024i c) {
//CHECK-LABEL: @test_tile_stored
//CHECK: {{%.*}} = bitcast <256 x i32> {{%.*}} to x86_amx
[llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
llvm_x86amx_ty, llvm_x86amx_ty,
llvm_x86amx_ty], []>;
+ // AMX-INT8 dot-product intrinsics, mirroring int_x86_tdpbssd_internal above:
+ // x86_amx result; operands are (i16 m, i16 n, i16 k) shape values followed
+ // by the dst/src1/src2 x86_amx tiles. Each is bound to the matching Clang
+ // builtin via GCCBuiltin.
+ def int_x86_tdpbsud_internal :
+ GCCBuiltin<"__builtin_ia32_tdpbsud_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_tdpbusd_internal :
+ GCCBuiltin<"__builtin_ia32_tdpbusd_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
+ def int_x86_tdpbuud_internal :
+ GCCBuiltin<"__builtin_ia32_tdpbuud_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
+ llvm_x86amx_ty, llvm_x86amx_ty,
+ llvm_x86amx_ty], []>;
def int_x86_tilestored64_internal :
GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
MI.setDesc(TII->get(X86::TILELOADD));
return true;
}
- case X86::PTDPBSSDV: {
+ case X86::PTDPBSSDV:
+ case X86::PTDPBSUDV:
+ case X86::PTDPBUSDV:
+ case X86::PTDPBUUDV: {
MI.untieRegOperand(4);
for (unsigned i = 3; i > 0; --i)
MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TDPBSSD));
+ unsigned Opc;
+ switch (Opcode) {
+ case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUDV: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSDV: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break;
+ default: llvm_unreachable("Impossible Opcode!");
+ }
+ MI.setDesc(TII->get(Opc));
MI.tieOperands(0, 1);
return true;
}
ReplaceNode(Node, CNode);
return;
}
- case Intrinsic::x86_tdpbssd_internal: {
+
+ case Intrinsic::x86_tdpbssd_internal:
+ case Intrinsic::x86_tdpbsud_internal:
+ case Intrinsic::x86_tdpbusd_internal:
+ case Intrinsic::x86_tdpbuud_internal: {
if (!Subtarget->hasAMXTILE())
break;
SDValue Chain = Node->getOperand(0);
- unsigned Opc = X86::PTDPBSSDV;
+ unsigned Opc;
+ switch (IntNo) {
+ case Intrinsic::x86_tdpbssd_internal: Opc = X86::PTDPBSSDV; break;
+ case Intrinsic::x86_tdpbsud_internal: Opc = X86::PTDPBSUDV; break;
+ case Intrinsic::x86_tdpbusd_internal: Opc = X86::PTDPBUSDV; break;
+ case Intrinsic::x86_tdpbuud_internal: Opc = X86::PTDPBUUDV; break;
+ default: llvm_unreachable("Impossible intrinsic");
+ }
SDValue Ops[] = {Node->getOperand(2),
Node->getOperand(3),
Node->getOperand(4),
}
// Pseudo instruction for RA.
- let Constraints = "$src4 = $dst" in
- def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
- GR16:$src2, GR16:$src3, TILE:$src4,
- TILE:$src5, TILE:$src6), []>;
+ // Register-allocation pseudos for the four TDPB* variants. The old single
+ // PTDPBSSDV def is generalized to a `let` region so all four share the
+ // "$src4 = $dst" tie, which keeps the accumulator input and output in the
+ // same tile register (matched by the tieOperands(0, 1) in the expander).
+ let Constraints = "$src4 = $dst" in {
+ def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6), []>;
+ def PTDPBSUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6), []>;
+ def PTDPBUSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6), []>;
+ def PTDPBUUDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
+ GR16:$src2, GR16:$src3, TILE:$src4,
+ TILE:$src5, TILE:$src6), []>;
+ }
let usesCustomInserter = 1 in {
// Pseudo instructions, using immediates instead of tile registers.
}
// a * b + c
// The shape depends on which operand.
- case Intrinsic::x86_tdpbssd_internal: {
+ case Intrinsic::x86_tdpbssd_internal:
+ case Intrinsic::x86_tdpbsud_internal:
+ case Intrinsic::x86_tdpbusd_internal:
+ case Intrinsic::x86_tdpbuud_internal: {
switch (OpNo) {
case 3:
Row = II->getArgOperand(0);
llvm_unreachable("Unexpected machine instruction on tile");
case X86::PTILELOADDV:
case X86::PTDPBSSDV:
+ case X86::PTDPBSUDV:
+ case X86::PTDPBUSDV:
+ case X86::PTDPBUUDV:
case X86::PTILEZEROV:
MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
case X86::PTILELOADDV:
case X86::PTILESTOREDV:
case X86::PTDPBSSDV:
+ case X86::PTDPBSUDV:
+ case X86::PTDPBUSDV:
+ case X86::PTDPBUUDV:
case X86::PTILEZEROV:
return true;
}
// We only collect the tile shape that is defined.
case X86::PTILELOADDV:
case X86::PTDPBSSDV:
+ case X86::PTDPBSUDV:
+ case X86::PTDPBUSDV:
+ case X86::PTDPBUUDV:
case X86::PTILEZEROV:
MachineOperand &MO1 = MI->getOperand(1);
MachineOperand &MO2 = MI->getOperand(2);
; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm1
; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT: tdpbsud %tmm2, %tmm1, %tmm0
+; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
%c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
%a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
%b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
- %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d)
+ %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+ %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b)
+ %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
+ %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d3)
ret void
}
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+; Declarations for the three new AMX-INT8 dot-product intrinsics used above.
+declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)