assert(Results.size() == N->getNumValues() &&
"Custom lowering returned the wrong number of results!");
for (unsigned i = 0, e = Results.size(); i != e; ++i) {
- // If this is a chain output just replace it.
- if (Results[i].getValueType() == MVT::Other)
- ReplaceValueWith(SDValue(N, i), Results[i]);
- else
+ // If this is a chain output or already widened just replace it.
+ bool WasWidened = SDValue(N, i).getValueType() != Results[i].getValueType();
+ if (WasWidened)
SetWidenedVector(SDValue(N, i), Results[i]);
+ else
+ ReplaceValueWith(SDValue(N, i), Results[i]);
}
return true;
}
if (!NewNumElts)
return UndefValue::get(II.getType());
- // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
- // fully supported.
- if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
- return nullptr;
-
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (DMaskIdx >= 0)
II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
bit TiedDest = 0,
bit isLds = 0> {
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>,
+ defvar legal_load_vt = !if(!eq(!cast<string>(load_vt), !cast<string>(v3f16)), v4f16, load_vt);
+
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, legal_load_vt, TiedDest, isLds>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds>;
}
}
ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt,
- [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ defvar legal_store_vt = !if(!eq(!cast<string>(store_vt), !cast<string>(v3f16)), v4f16, store_vt);
+
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt,
+ [(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<0, NAME>;
- def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt,
- [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt,
+ [(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<1, NAME>;
- def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
- def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
- def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt>;
- def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
- def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
- def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt>;
}
}
//===----------------------------------------------------------------------===//
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_load<name, memoryVt>);
+
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v3i32, "BUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZ", v3f16>;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZ", v3i16>;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mubuf_intrinsic_store<name, memoryVt>);
+
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_glc $auxiliary),
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (extract_glc $auxiliary),
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v3i32, "BUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZ", v3f16>;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZ", v3i16>;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
//===----------------------------------------------------------------------===//
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_load<name, memoryVt>);
+
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(as_i8timm $format),
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v3i32, "TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZ", v3f16>;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
- string opcode> {
+ string opcode, ValueType memoryVt = vt> {
+ defvar st = !if(!eq(!cast<string>(memoryVt), !cast<string>(vt)), name, mtbuf_intrinsic_store<name, memoryVt>);
+
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
(as_i16timm $offset), (as_i8timm $format),
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v3i32, "TBUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZ", v3f16>;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
return SDValue();
}
+// Used for D16: Casts the result of an instruction into the right vector,
+// packs values if loads return unpacked values.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
const SDLoc &DL,
SelectionDAG &DAG, bool Unpacked) {
if (!LoadVT.isVector())
return Result;
+ // Cast back to the original packed type or to a larger type that is a
+ // multiple of 32 bit for D16. Widening the return type is a required for
+ // legalization.
+ EVT FittingLoadVT = LoadVT;
+ if ((LoadVT.getVectorNumElements() % 2) == 1) {
+ FittingLoadVT =
+ EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
+ LoadVT.getVectorNumElements() + 1);
+ }
+
if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
// Truncate to v2i16/v4i16.
- EVT IntLoadVT = LoadVT.changeTypeToInteger();
+ EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
// Workaround legalizer not scalarizing truncate after vector op
// legalization but not creating intermediate vector trunc.
for (SDValue &Elt : Elts)
Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+ // Pad illegal v1i16/v3fi6 to v4i16
+ if ((LoadVT.getVectorNumElements() % 2) == 1)
+ Elts.push_back(DAG.getUNDEF(MVT::i16));
+
Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
// Bitcast to original type (v2f16/v4f16).
- return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
// Cast back to the original packed type.
- return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
EVT LoadVT = M->getValueType(0);
EVT EquivLoadVT = LoadVT;
- if (Unpacked && LoadVT.isVector()) {
- EquivLoadVT = LoadVT.isVector() ?
- EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- LoadVT.getVectorNumElements()) : LoadVT;
+ if (LoadVT.isVector()) {
+ if (Unpacked) {
+ EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements());
+ } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
+ // Widen v3f16 to legal type
+ EquivLoadVT =
+ EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
+ LoadVT.getVectorNumElements() + 1);
+ }
}
// Change from v4f16/v2f16 to EquivLoadVT.
IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
VTList, Ops, M->getMemoryVT(),
M->getMemOperand());
- if (!Unpacked) // Just adjusted the opcode.
- return Load;
SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
if (Res.getOpcode() == ISD::MERGE_VALUES) {
// FIXME: Hacky
- Results.push_back(Res.getOperand(0));
- Results.push_back(Res.getOperand(1));
+ for (unsigned I = 0; I < Res.getNumOperands(); I++) {
+ Results.push_back(Res.getOperand(I));
+ }
} else {
Results.push_back(Res);
Results.push_back(Res.getValue(1));
if (IsD16)
Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
- if (!ReqRetVT.isVector())
+ EVT LegalReqRetVT = ReqRetVT;
+ if (!ReqRetVT.isVector()) {
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
-
- Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
+ } else {
+ // We need to widen the return vector to a legal type
+ if ((ReqRetVT.getVectorNumElements() % 2) == 1) {
+ LegalReqRetVT =
+ EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
+ ReqRetVT.getVectorNumElements() + 1);
+ }
+ }
+ Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
if (TexFail)
return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
return VData;
SDLoc DL(VData);
- assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+ unsigned NumElements = StoreVT.getVectorNumElements();
if (Subtarget->hasUnpackedD16VMem()) {
// We need to unpack the packed data to store.
EVT IntStoreVT = StoreVT.changeTypeToInteger();
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
- EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- StoreVT.getVectorNumElements());
+ EVT EquivStoreVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
return DAG.UnrollVectorOp(ZExt.getNode());
+ } else if (NumElements == 3) {
+ EVT IntStoreVT =
+ EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT WidenedStoreVT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
+ EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
+ WidenedStoreVT.getStoreSizeInBits());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
+ return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
}
assert(isTypeLegal(StoreVT));
EVT VDataVT = VData.getValueType();
EVT EltType = VDataVT.getScalarType();
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- if (IsD16)
+ if (IsD16) {
VData = handleD16VData(VData, DAG);
+ VDataVT = VData.getValueType();
+ }
if (!isTypeLegal(VDataVT)) {
VData =
EVT EltType = VDataVT.getScalarType();
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- if (IsD16)
+ if (IsD16) {
VData = handleD16VData(VData, DAG);
+ VDataVT = VData.getValueType();
+ }
if (!isTypeLegal(VDataVT)) {
VData =
}]>;
//===----------------------------------------------------------------------===//
+// SDNodes PatFrags for a16 loads and stores with 3 components.
+// v3f16/v3i16 is widened to v4f16/v4i16, so we need to match on the memory
+// load/store size.
+//===----------------------------------------------------------------------===//
+
+class mubuf_intrinsic_load<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen),
+ (name node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen)> {
+ let IsLoad = 1;
+ let MemoryVT = vt;
+}
+
+class mubuf_intrinsic_store<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen),
+ (name node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$auxiliary, node:$idxen)> {
+ let IsStore = 1;
+ let MemoryVT = vt;
+}
+
+class mtbuf_intrinsic_load<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen),
+ (name node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen)> {
+ let IsLoad = 1;
+ let MemoryVT = vt;
+}
+
+class mtbuf_intrinsic_store<SDPatternOperator name, ValueType vt> : PatFrag <
+ (ops node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen),
+ (name node:$vdata, node:$rsrc, node:$vindex, node:$voffset, node:$soffset, node:$offset,
+ node:$format, node:$auxiliary, node:$idxen)> {
+ let IsStore = 1;
+ let MemoryVT = vt;
+}
+
+//===----------------------------------------------------------------------===//
// SDNodes PatFrags for d16 loads
//===----------------------------------------------------------------------===//
ret void
}
-; define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
-; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
-; %v.data = extractvalue { <3 x half>, i32 } %v, 0
-; %v.err = extractvalue { <3 x half>, i32 } %v, 1
-; store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
-; store volatile i32 %v.err, i32 addrspace(1)* undef
-; ret void
-; }
+define amdgpu_ps void @load_1d_v3f16_tfe_dmask7(<8 x i32> inreg %rsrc, i32 %s) {
+; GFX9-LABEL: load_1d_v3f16_tfe_dmask7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_mov_b32 s11, s9
+; GFX9-NEXT: s_mov_b32 s10, s8
+; GFX9-NEXT: s_mov_b32 s9, s7
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s7, s5
+; GFX9-NEXT: s_mov_b32 s6, s4
+; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s4, s2
+; GFX9-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 unorm tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v[0:1], v2, off
+; GFX9-NEXT: global_store_dword v[0:1], v1, off
+; GFX9-NEXT: global_store_dword v[0:1], v3, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: load_1d_v3f16_tfe_dmask7:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_mov_b32 s11, s9
+; GFX10-NEXT: s_mov_b32 s10, s8
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s7, s5
+; GFX10-NEXT: s_mov_b32 s6, s4
+; GFX10-NEXT: s_mov_b32 s5, s3
+; GFX10-NEXT: s_mov_b32 s4, s2
+; GFX10-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: image_load v[1:3], v0, s[4:11] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_short v[0:1], v2, off
+; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v3, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask7:
+; GFX8-UNPACKED: ; %bb.0:
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v1, 0
+; GFX8-UNPACKED-NEXT: s_mov_b32 s11, s9
+; GFX8-UNPACKED-NEXT: s_mov_b32 s10, s8
+; GFX8-UNPACKED-NEXT: s_mov_b32 s9, s7
+; GFX8-UNPACKED-NEXT: s_mov_b32 s8, s6
+; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s5
+; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s4
+; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s3
+; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s2
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v2, v1
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[4:11] dmask:0x7 unorm tfe d16
+; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0)
+; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3
+; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0
+; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4
+; GFX8-UNPACKED-NEXT: s_endpgm
+ %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
+ %v.data = extractvalue { <3 x half>, i32 } %v, 0
+ %v.err = extractvalue { <3 x half>, i32 } %v, 1
+ store volatile <3 x half> %v.data, <3 x half> addrspace(1)* undef
+ store volatile i32 %v.err, i32 addrspace(1)* undef
+ ret void
+}
define amdgpu_ps void @load_1d_v4f16_tfe_dmask15(<8 x i32> inreg %rsrc, i32 %s) {
; GFX9-LABEL: load_1d_v4f16_tfe_dmask15:
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
ret half %elt
}
+; GCN-LABEL: {{^}}buffer_load_format_d16_xyz:
+; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
+ %elt = extractelement <3 x half> %data, i32 2
+ ret half %elt
+}
+
; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1)
declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1)
+declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1)
declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1)
ret void
}
+define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %index) {
+main_body:
+ call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1)
declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1)
+declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1)
declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1)
ret float %r
}
+; GCN-LABEL: {{^}}image_load_v3f16:
+; UNPACKED: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}}
+; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm d16{{$}}
+; GFX10: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
+define amdgpu_ps <2 x float> @image_load_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
+main_body:
+ %tex = call <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ %ext = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r = bitcast <4 x half> %ext to <2 x float>
+ ret <2 x float> %r
+}
+
; GCN-LABEL: {{^}}image_load_v4f16:
; UNPACKED: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
ret float %x
}
+define amdgpu_ps <2 x float> @image_load_3d_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
+main_body:
+ %tex = call <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32 7, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+ %ext = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = bitcast <4 x half> %ext to <2 x float>
+ ret <2 x float> %res
+}
+
; GCN-LABEL: {{^}}image_store_f16
; GFX89: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
ret void
}
+define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
+main_body:
+ %r = bitcast <2 x float> %in to <4 x half>
+ %data = shufflevector <4 x half> %r, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %data, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}image_store_v4f16
; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32
declare half @llvm.amdgcn.image.load.2d.f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.load.2d.v2f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.image.load.2d.v3f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.load.mip.2d.v4f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.load.3d.v2f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare void @llvm.amdgcn.image.store.2d.f16.i32(half, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.mip.1d.v4f16.i32(<4 x half>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.3d.v2f16.i32(<2 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v3f16.i32(<3 x half>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
ret <2 x float> %r
}
+define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+; TONGA-LABEL: image_sample_b_2d_v3f16:
+; TONGA: ; %bb.0: ; %main_body
+; TONGA-NEXT: s_mov_b64 s[12:13], exec
+; TONGA-NEXT: s_wqm_b64 exec, exec
+; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
+; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
+; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; TONGA-NEXT: v_mov_b32_e32 v1, v2
+; TONGA-NEXT: ; return to shader part epilog
+;
+; GFX81-LABEL: image_sample_b_2d_v3f16:
+; GFX81: ; %bb.0: ; %main_body
+; GFX81-NEXT: s_mov_b64 s[12:13], exec
+; GFX81-NEXT: s_wqm_b64 exec, exec
+; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
+; GFX81-NEXT: s_waitcnt vmcnt(0)
+; GFX81-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: image_sample_b_2d_v3f16:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: s_mov_b64 s[12:13], exec
+; GFX9-NEXT: s_wqm_b64 exec, exec
+; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: image_sample_b_2d_v3f16:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+ %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+ %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r = bitcast <4 x half> %tex_wide to <2 x float>
+ ret <2 x float> %r
+}
+
+define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
+; TONGA-LABEL: image_sample_b_2d_v3f16_tfe:
+; TONGA: ; %bb.0: ; %main_body
+; TONGA-NEXT: s_mov_b64 s[12:13], exec
+; TONGA-NEXT: s_wqm_b64 exec, exec
+; TONGA-NEXT: v_mov_b32_e32 v3, 0
+; TONGA-NEXT: v_mov_b32_e32 v4, v3
+; TONGA-NEXT: v_mov_b32_e32 v5, v3
+; TONGA-NEXT: v_mov_b32_e32 v6, v3
+; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
+; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
+; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; TONGA-NEXT: v_mov_b32_e32 v1, v5
+; TONGA-NEXT: v_mov_b32_e32 v2, v6
+; TONGA-NEXT: ; return to shader part epilog
+;
+; GFX81-LABEL: image_sample_b_2d_v3f16_tfe:
+; GFX81: ; %bb.0: ; %main_body
+; GFX81-NEXT: s_mov_b64 s[12:13], exec
+; GFX81-NEXT: s_wqm_b64 exec, exec
+; GFX81-NEXT: v_mov_b32_e32 v3, 0
+; GFX81-NEXT: v_mov_b32_e32 v4, v3
+; GFX81-NEXT: v_mov_b32_e32 v5, v3
+; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
+; GFX81-NEXT: s_waitcnt vmcnt(0)
+; GFX81-NEXT: v_mov_b32_e32 v0, v3
+; GFX81-NEXT: v_mov_b32_e32 v1, v4
+; GFX81-NEXT: v_mov_b32_e32 v2, v5
+; GFX81-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: image_sample_b_2d_v3f16_tfe:
+; GFX9: ; %bb.0: ; %main_body
+; GFX9-NEXT: s_mov_b64 s[12:13], exec
+; GFX9-NEXT: s_wqm_b64 exec, exec
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: v_mov_b32_e32 v4, v3
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: v_mov_b32_e32 v2, v5
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_mov_b32 s12, exec_lo
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+main_body:
+ %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
+ %tex.vec = extractvalue {<3 x half>, i32} %tex, 0
+ %tex.vec_wide = shufflevector <3 x half> %tex.vec, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tex.err = extractvalue {<3 x half>, i32} %tex, 1
+ %tex.vecf = bitcast <4 x half> %tex.vec_wide to <2 x float>
+ %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
+ %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
+ %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
+ %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
+ %tex.errf = bitcast i32 %tex.err to float
+ %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
+ ret <4 x float> %r
+}
+
define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
; TONGA-LABEL: image_sample_b_2d_v4f16:
; TONGA: ; %bb.0: ; %main_body
declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.image.sample.2d.v3f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
ret half %elt
}
+; GCN-LABEL: {{^}}buffer_load_format_d16_xyz:
+; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
+define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
+ %elt = extractelement <3 x half> %data, i32 2
+ ret half %elt
+}
+
; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
declare half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32>, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.raw.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32)
+declare <3 x half> @llvm.amdgcn.raw.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32)
declare <4 x half> @llvm.amdgcn.raw.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32)
ret void
}
+; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
+
+; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+
+; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
+; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
+
+; UNPACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+
+; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
+; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
+; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]]
+
+; PACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
+define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %voffset) {
+main_body:
+ %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ call void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
declare void @llvm.amdgcn.raw.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32)
declare void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
+declare void @llvm.amdgcn.raw.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32)
declare void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32)
ret half %elt
}
+; GCN-LABEL: {{^}}tbuffer_load_d16_xyz:
+; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
+; GFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT]
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
+; GFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT]
+; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 22, i32 0)
+ %elt = extractelement <3 x half> %data, i32 2
+ ret half %elt
+}
+
; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
; GFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT]
declare half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32)
+declare <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32)
declare <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32)
-
ret void
}
+; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}},
+
+; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+
+; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
+; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
+; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]
+
+
+; PACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
+; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
+; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]]
+; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED]
+; GFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED]
+define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) {
+main_body:
+ %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ call void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}},
declare void @llvm.amdgcn.raw.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32)
+declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32)
ret half %elt
}
+; GCN-LABEL: {{^}}buffer_load_format_d16_xyz:
+; UNPACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: buffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
+; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %elt = extractelement <3 x half> %data, i32 2
+ ret half %elt
+}
+
; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
; UNPACKED: buffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
declare half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32>, i32, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32>, i32, i32, i32, i32)
+declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32)
declare <4 x half> @llvm.amdgcn.struct.buffer.load.format.v4f16(<4 x i32>, i32, i32, i32, i32)
declare i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32>, i32, i32, i32, i32)
ret void
}
+; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
+
+; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+
+; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
+; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
+
+; UNPACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
+
+; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
+; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
+; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]]
+
+; PACKED: buffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
+define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
+main_body:
+ %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ call void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
declare void @llvm.amdgcn.struct.buffer.store.format.f16(half, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32)
+declare void @llvm.amdgcn.struct.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.buffer.store.format.i16(i16, <4 x i32>, i32, i32, i32, i32)
ret half %elt
}
+; GCN-LABEL: {{^}}tbuffer_load_d16_xyz:
+; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
+; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
+; PREGFX10-UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PREGFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
+; GFX10-PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_32_FLOAT] idxen
+; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 22, i32 0)
+ %elt = extractelement <3 x half> %data, i32 2
+ ret half %elt
+}
+
; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0
; PREGFX10-UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, [[ZEROREG]], s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] idxen
declare half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32)
declare <2 x half> @llvm.amdgcn.struct.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32)
+declare <3 x half> @llvm.amdgcn.struct.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32)
declare <4 x half> @llvm.amdgcn.struct.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32)
-
ret void
}
+; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
+
+; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+
+; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
+; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
+; PREGFX10-UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+
+; PACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
+; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
+; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]]
+; PREGFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+; GFX10-PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
+define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
+main_body:
+ %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ call void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half> %data_subvec, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
declare void @llvm.amdgcn.struct.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32)
declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32)
ret half %elt
}
+; GCN-LABEL: {{^}}tbuffer_load_d16_xyz:
+; UNPACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
+; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+
+; PACKED: tbuffer_load_format_d16_xyz v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
+; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
+define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
+main_body:
+ %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+ %elt = extractelement <3 x half> %data, i32 2
+ ret half %elt
+}
+
; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
; UNPACKED: tbuffer_load_format_d16_xyzw v{{\[}}{{[0-9]+}}:[[HI:[0-9]+]]{{\]}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
ret void
}
+; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
+
+; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
+; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
+; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
+; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]]
+
+; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
+; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
+; UNPACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+
+; PACKED-DAG: s_and_b32 [[SHR0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
+; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
+; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]]
+; PACKED: tbuffer_store_format_d16_xyz v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
+define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) {
+main_body:
+ call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+ ret void
+}
+
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
ret half %elt1
}
-; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16).
; CHECK-LABEL: @extract_elt2_raw_tbuffer_load_v4f16(
-; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
-; CHECK-NEXT: %elt1 = extractelement <4 x half> %data, i32 2
+; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.raw.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
+; CHECK-NEXT: %elt1 = extractelement <3 x half> %data, i32 2
; CHECK-NEXT: ret half %elt1
define amdgpu_ps half @extract_elt2_raw_tbuffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %arg0, i32 inreg %arg1) #0 {
%data = call <4 x half> @llvm.amdgcn.raw.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 78, i32 0)
ret half %elt0
}
-; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32).
; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
-; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
-; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT: %data = call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32(i32 7, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = shufflevector <3 x half> %data, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
; CHECK-NEXT: ret <4 x half> %res
define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)