Add support for the __tile_stream_loadd intrinsic, the C/C++ wrapper around the new __builtin_ia32_tileloaddt164_internal builtin, which lowers to the TILELOADDT1 instruction: a tile load that hints the loaded data will likely not be reused in the near future.
Reviewed By: LuoYuanke
Differential Revision: https://reviews.llvm.org/D103784
// AMX internal builtin
TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile")
TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloaddt164_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// the destination tile "dst". This intrinsic provides a hint to the
+/// implementation that the data will likely not be reused in the near future,
+/// so data caching can be optimized accordingly.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_stream_loadd(__tile1024i *dst, const void *base,
+ __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
+}
+
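To illustrate the intended use of the new API, here is a minimal, self-contained sketch (not part of this patch). The buffer names, tile dimensions, STRIDE value, and the function name are illustrative assumptions; __tile_loadd, __tile_dpbssd, and __tile_stored are the existing wrappers from amxintrin.h. Building it requires -mamx-tile and -mamx-int8.

#include <immintrin.h>

#define STRIDE 64

// Illustrative 16-row x 64-byte tile buffers.
static char bufA[1024], bufB[1024], bufC[1024];

void dot_product_stream(void) {
  __tile1024i a = {16, 64};  // 16 rows x 64 bytes of signed 8-bit data
  __tile1024i b = {16, 64};
  __tile1024i c = {16, 64};  // 16 rows x 16 32-bit accumulators

  __tile_stream_loadd(&a, bufA, STRIDE);  // hint: data not reused soon
  __tile_stream_loadd(&b, bufB, STRIDE);
  __tile_loadd(&c, bufC, STRIDE);         // regular tile load
  __tile_dpbssd(&c, a, b);                // c += dot-product of a and b
  __tile_stored(bufC, STRIDE, c);
}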
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
__tile_loadd(&a, buf, STRIDE);
}
+void test_tile_stream_loadd(short row, short col) {
+ //CHECK-LABEL: @test_tile_stream_loadd
+ //CHECK: call x86_amx @llvm.x86.tileloaddt164.internal
+ //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
+ __tile1024i a = {row, col};
+ __tile_stream_loadd(&a, buf, STRIDE);
+}
+
void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) {
//CHECK-LABEL: @test_tile_dpbssd
//CHECK: call x86_amx @llvm.x86.tdpbssd.internal
Intrinsic<[llvm_x86amx_ty],
[llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
[]>;
+ def int_x86_tileloaddt164_internal :
+ GCCBuiltin<"__builtin_ia32_tileloaddt164_internal">,
+ Intrinsic<[llvm_x86amx_ty],
+ [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
+ []>;
def int_x86_tdpbssd_internal :
GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
Intrinsic<[llvm_x86amx_ty],
MI.setDesc(TII->get(X86::LDTILECFG));
return true;
}
- case X86::PTILELOADDV: {
+ case X86::PTILELOADDV:
+ case X86::PTILELOADDT1V: {
for (unsigned i = 2; i > 0; --i)
MI.RemoveOperand(i);
- MI.setDesc(TII->get(X86::TILELOADD));
+ unsigned Opc =
+ Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1;
+ MI.setDesc(TII->get(Opc));
return true;
}
case X86::PTDPBSSDV:
}
bool X86FastTileConfig::isTileLoad(MachineInstr &MI) {
- return MI.getOpcode() == X86::PTILELOADDV;
+ return MI.getOpcode() == X86::PTILELOADDV ||
+ MI.getOpcode() == X86::PTILELOADDT1V;
}
bool X86FastTileConfig::isTileStore(MachineInstr &MI) {
return MI.getOpcode() == X86::PTILESTOREDV;
ReplaceNode(Node, Res);
return;
}
- case Intrinsic::x86_tileloadd64_internal: {
+ case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tileloaddt164_internal: {
if (!Subtarget->hasAMXTILE())
break;
- unsigned Opc = X86::PTILELOADDV;
+ unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
+ ? X86::PTILELOADDV
+ : X86::PTILELOADDT1V;
// _tile_loadd_internal(row, col, buf, STRIDE)
SDValue Base = Node->getOperand(4);
SDValue Scale = getI8Imm(1, dl);
def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
GR16:$src2,
opaquemem:$src3), []>;
+ def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+ GR16:$src2,
+ opaquemem:$src3), []>;
def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
GR16:$src2, opaquemem:$src3,
TILE:$src4), []>;
default:
llvm_unreachable("Expect amx intrinsics");
case Intrinsic::x86_tileloadd64_internal:
+ case Intrinsic::x86_tileloaddt164_internal:
case Intrinsic::x86_tilestored64_internal: {
Row = II->getArgOperand(0);
Col = II->getArgOperand(1);
}
static bool isTileLoad(IntrinsicInst *II) {
- return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal;
+ return II->getIntrinsicID() == Intrinsic::x86_tileloadd64_internal ||
+ II->getIntrinsicID() == Intrinsic::x86_tileloaddt164_internal;
}
static bool isTileStore(IntrinsicInst *II) {
}
// We only collect the tile shape that is defined.
case X86::PTILELOADDV:
+ case X86::PTILELOADDT1V:
case X86::PTDPBSSDV:
case X86::PTDPBSUDV:
case X86::PTDPBUSDV:
; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm1
; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
%d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b)
%d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b)
%d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b)
+ %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride)
call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4)
ret void
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)