}
}
-Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
- const CallExpr *E) {
+namespace {
+// Helper classes for mapping MMA builtins to the corresponding LLVM
+// intrinsic variants.
+struct NVPTXMmaLdstInfo {
+ unsigned NumResults; // Number of elements to load/store
+ // Intrinsic IDs for row/col variants. 0 if a layout is unsupported.
+ unsigned IID_col;
+ unsigned IID_row;
+};
+
+#define MMA_INTR(geom_op_type, layout) \
+ Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
+#define MMA_LDST(n, geom_op_type) \
+ { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }
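+// For example, MMA_LDST(8, m16n16k16_load_a_f16) expands to
+//   {8, Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride,
+//    Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride},
+// i.e. 8 result elements plus the col- and row-major intrinsic IDs.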
+
+static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
+ switch (BuiltinID) {
+ // FP MMA loads
+ case NVPTX::BI__hmma_m16n16k16_ld_a:
+ return MMA_LDST(8, m16n16k16_load_a_f16);
+ case NVPTX::BI__hmma_m16n16k16_ld_b:
+ return MMA_LDST(8, m16n16k16_load_b_f16);
+ case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
+ return MMA_LDST(4, m16n16k16_load_c_f16);
+ case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
+ return MMA_LDST(8, m16n16k16_load_c_f32);
+ case NVPTX::BI__hmma_m32n8k16_ld_a:
+ return MMA_LDST(8, m32n8k16_load_a_f16);
+ case NVPTX::BI__hmma_m32n8k16_ld_b:
+ return MMA_LDST(8, m32n8k16_load_b_f16);
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
+ return MMA_LDST(4, m32n8k16_load_c_f16);
+ case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
+ return MMA_LDST(8, m32n8k16_load_c_f32);
+ case NVPTX::BI__hmma_m8n32k16_ld_a:
+ return MMA_LDST(8, m8n32k16_load_a_f16);
+ case NVPTX::BI__hmma_m8n32k16_ld_b:
+ return MMA_LDST(8, m8n32k16_load_b_f16);
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
+ return MMA_LDST(4, m8n32k16_load_c_f16);
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
+ return MMA_LDST(8, m8n32k16_load_c_f32);
+
+ // Integer MMA loads
+ case NVPTX::BI__imma_m16n16k16_ld_a_s8:
+ return MMA_LDST(2, m16n16k16_load_a_s8);
+ case NVPTX::BI__imma_m16n16k16_ld_a_u8:
+ return MMA_LDST(2, m16n16k16_load_a_u8);
+ case NVPTX::BI__imma_m16n16k16_ld_b_s8:
+ return MMA_LDST(2, m16n16k16_load_b_s8);
+ case NVPTX::BI__imma_m16n16k16_ld_b_u8:
+ return MMA_LDST(2, m16n16k16_load_b_u8);
+ case NVPTX::BI__imma_m16n16k16_ld_c:
+ return MMA_LDST(8, m16n16k16_load_c_s32);
+ case NVPTX::BI__imma_m32n8k16_ld_a_s8:
+ return MMA_LDST(4, m32n8k16_load_a_s8);
+ case NVPTX::BI__imma_m32n8k16_ld_a_u8:
+ return MMA_LDST(4, m32n8k16_load_a_u8);
+ case NVPTX::BI__imma_m32n8k16_ld_b_s8:
+ return MMA_LDST(1, m32n8k16_load_b_s8);
+ case NVPTX::BI__imma_m32n8k16_ld_b_u8:
+ return MMA_LDST(1, m32n8k16_load_b_u8);
+ case NVPTX::BI__imma_m32n8k16_ld_c:
+ return MMA_LDST(8, m32n8k16_load_c_s32);
+ case NVPTX::BI__imma_m8n32k16_ld_a_s8:
+ return MMA_LDST(1, m8n32k16_load_a_s8);
+ case NVPTX::BI__imma_m8n32k16_ld_a_u8:
+ return MMA_LDST(1, m8n32k16_load_a_u8);
+ case NVPTX::BI__imma_m8n32k16_ld_b_s8:
+ return MMA_LDST(4, m8n32k16_load_b_s8);
+ case NVPTX::BI__imma_m8n32k16_ld_b_u8:
+ return MMA_LDST(4, m8n32k16_load_b_u8);
+ case NVPTX::BI__imma_m8n32k16_ld_c:
+ return MMA_LDST(8, m8n32k16_load_c_s32);
+
+ // Sub-integer MMA loads.
+ // Only row/col layout is supported by A/B fragments.
+ case NVPTX::BI__imma_m8n8k32_ld_a_s4:
+ return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
+ case NVPTX::BI__imma_m8n8k32_ld_a_u4:
+ return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
+ case NVPTX::BI__imma_m8n8k32_ld_b_s4:
+ return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
+ case NVPTX::BI__imma_m8n8k32_ld_b_u4:
+ return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
+ case NVPTX::BI__imma_m8n8k32_ld_c:
+ return MMA_LDST(2, m8n8k32_load_c_s32);
+ case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
+ return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
+ case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
+ return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
+ case NVPTX::BI__bmma_m8n8k128_ld_c:
+ return MMA_LDST(2, m8n8k128_load_c_s32);
+
+ // NOTE: We need to follow the inconsistent naming scheme used by NVCC. Unlike
+ // PTX and LLVM IR where stores always use fragment D, NVCC builtins always
+ // use fragment C for both loads and stores.
+ // FP MMA stores.
+ case NVPTX::BI__hmma_m16n16k16_st_c_f16:
+ return MMA_LDST(4, m16n16k16_store_d_f16);
+ case NVPTX::BI__hmma_m16n16k16_st_c_f32:
+ return MMA_LDST(8, m16n16k16_store_d_f32);
+ case NVPTX::BI__hmma_m32n8k16_st_c_f16:
+ return MMA_LDST(4, m32n8k16_store_d_f16);
+ case NVPTX::BI__hmma_m32n8k16_st_c_f32:
+ return MMA_LDST(8, m32n8k16_store_d_f32);
+ case NVPTX::BI__hmma_m8n32k16_st_c_f16:
+ return MMA_LDST(4, m8n32k16_store_d_f16);
+ case NVPTX::BI__hmma_m8n32k16_st_c_f32:
+ return MMA_LDST(8, m8n32k16_store_d_f32);
+
+ // Integer and sub-integer MMA stores.
+ // Another naming quirk. Unlike other MMA builtins that use PTX types in the
+ // name, integer loads/stores use LLVM's i32.
+ case NVPTX::BI__imma_m16n16k16_st_c_i32:
+ return MMA_LDST(8, m16n16k16_store_d_s32);
+ case NVPTX::BI__imma_m32n8k16_st_c_i32:
+ return MMA_LDST(8, m32n8k16_store_d_s32);
+ case NVPTX::BI__imma_m8n32k16_st_c_i32:
+ return MMA_LDST(8, m8n32k16_store_d_s32);
+ case NVPTX::BI__imma_m8n8k32_st_c_i32:
+ return MMA_LDST(2, m8n8k32_store_d_s32);
+ case NVPTX::BI__bmma_m8n8k128_st_c_i32:
+ return MMA_LDST(2, m8n8k128_store_d_s32);
+
+ default:
+ llvm_unreachable("Unknown MMA builtin");
+ }
+}
+#undef MMA_LDST
+#undef MMA_INTR
+
+struct NVPTXMmaInfo {
+ unsigned NumEltsA;
+ unsigned NumEltsB;
+ unsigned NumEltsC;
+ unsigned NumEltsD;
+ std::array<unsigned, 8> Variants;
+
+ unsigned getMMAIntrinsic(int Layout, bool Satf) {
+ unsigned Index = Layout * 2 + Satf;
+ if (Index >= Variants.size())
+ return 0;
+ return Variants[Index];
+ }
+};
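+// The Variants array is indexed by Layout * 2 + Satf, where Layout encodes
+// the A/B layouts (0=row.row, 1=row.col, 2=col.row, 3=col.col). For example,
+// Layout=1 with Satf=1 gives index 3, the row.col .satfinite variant.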
+
+// Returns an NVPTXMmaInfo for the given builtin. Its getMMAIntrinsic() yields
+// the intrinsic that matches Layout and Satf for valid combinations, or 0
+// otherwise.
+static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
+ // clang-format off
+#define MMA_VARIANTS(geom, type) {{ \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \
+ }}
+// Sub-integer MMA only supports row.col layout.
+#define MMA_VARIANTS_I4(geom, type) {{ \
+ 0, \
+ 0, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
+ 0, \
+ 0, \
+ 0, \
+ 0 \
+ }}
+// b1 MMA does not support .satfinite.
+#define MMA_VARIANTS_B1(geom, type) {{ \
+ 0, \
+ 0, \
+ Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
+ 0, \
+ 0, \
+ 0, \
+ 0, \
+ 0 \
+ }}
+ // clang-format on
+ switch (BuiltinID) {
+ // FP MMA
+ // Note that the 'type' argument of MMA_VARIANTS uses D_C notation, while
+ // the NumElts* fields of the return value are ordered as A, B, C, D.
+ case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
+ return {8, 8, 4, 4, MMA_VARIANTS(m16n16k16, f16_f16)};
+ case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
+ return {8, 8, 4, 8, MMA_VARIANTS(m16n16k16, f32_f16)};
+ case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
+ return {8, 8, 8, 4, MMA_VARIANTS(m16n16k16, f16_f32)};
+ case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
+ return {8, 8, 8, 8, MMA_VARIANTS(m16n16k16, f32_f32)};
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
+ return {8, 8, 4, 4, MMA_VARIANTS(m32n8k16, f16_f16)};
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
+ return {8, 8, 4, 8, MMA_VARIANTS(m32n8k16, f32_f16)};
+ case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
+ return {8, 8, 8, 4, MMA_VARIANTS(m32n8k16, f16_f32)};
+ case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
+ return {8, 8, 8, 8, MMA_VARIANTS(m32n8k16, f32_f32)};
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
+ return {8, 8, 4, 4, MMA_VARIANTS(m8n32k16, f16_f16)};
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
+ return {8, 8, 4, 8, MMA_VARIANTS(m8n32k16, f32_f16)};
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
+ return {8, 8, 8, 4, MMA_VARIANTS(m8n32k16, f16_f32)};
+ case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
+ return {8, 8, 8, 8, MMA_VARIANTS(m8n32k16, f32_f32)};
+
+ // Integer MMA
+ case NVPTX::BI__imma_m16n16k16_mma_s8:
+ return {2, 2, 8, 8, MMA_VARIANTS(m16n16k16, s8)};
+ case NVPTX::BI__imma_m16n16k16_mma_u8:
+ return {2, 2, 8, 8, MMA_VARIANTS(m16n16k16, u8)};
+ case NVPTX::BI__imma_m32n8k16_mma_s8:
+ return {4, 1, 8, 8, MMA_VARIANTS(m32n8k16, s8)};
+ case NVPTX::BI__imma_m32n8k16_mma_u8:
+ return {4, 1, 8, 8, MMA_VARIANTS(m32n8k16, u8)};
+ case NVPTX::BI__imma_m8n32k16_mma_s8:
+ return {1, 4, 8, 8, MMA_VARIANTS(m8n32k16, s8)};
+ case NVPTX::BI__imma_m8n32k16_mma_u8:
+ return {1, 4, 8, 8, MMA_VARIANTS(m8n32k16, u8)};
+
+ // Sub-integer MMA
+ case NVPTX::BI__imma_m8n8k32_mma_s4:
+ return {1, 1, 2, 2, MMA_VARIANTS_I4(m8n8k32, s4)};
+ case NVPTX::BI__imma_m8n8k32_mma_u4:
+ return {1, 1, 2, 2, MMA_VARIANTS_I4(m8n8k32, u4)};
+ case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
+ return {1, 1, 2, 2, MMA_VARIANTS_B1(m8n8k128, b1)};
+ default:
+ llvm_unreachable("Unexpected builtin ID.");
+ }
+#undef MMA_VARIANTS
+#undef MMA_VARIANTS_I4
+#undef MMA_VARIANTS_B1
+}
+
+} // namespace
+
+Value *
+CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, const CallExpr *E) {
auto MakeLdg = [&](unsigned IntrinsicID) {
Value *Ptr = EmitScalarExpr(E->getArg(0));
clang::CharUnits Align =
Builder.CreateStore(Pred, PredOutPtr);
return Builder.CreateExtractValue(ResultPair, 0);
}
+
+ // FP MMA loads
case NVPTX::BI__hmma_m16n16k16_ld_a:
case NVPTX::BI__hmma_m16n16k16_ld_b:
case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
case NVPTX::BI__hmma_m8n32k16_ld_a:
case NVPTX::BI__hmma_m8n32k16_ld_b:
case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
- case NVPTX::BI__hmma_m8n32k16_ld_c_f32: {
+ case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
+ // Integer MMA loads.
+ case NVPTX::BI__imma_m16n16k16_ld_a_s8:
+ case NVPTX::BI__imma_m16n16k16_ld_a_u8:
+ case NVPTX::BI__imma_m16n16k16_ld_b_s8:
+ case NVPTX::BI__imma_m16n16k16_ld_b_u8:
+ case NVPTX::BI__imma_m16n16k16_ld_c:
+ case NVPTX::BI__imma_m32n8k16_ld_a_s8:
+ case NVPTX::BI__imma_m32n8k16_ld_a_u8:
+ case NVPTX::BI__imma_m32n8k16_ld_b_s8:
+ case NVPTX::BI__imma_m32n8k16_ld_b_u8:
+ case NVPTX::BI__imma_m32n8k16_ld_c:
+ case NVPTX::BI__imma_m8n32k16_ld_a_s8:
+ case NVPTX::BI__imma_m8n32k16_ld_a_u8:
+ case NVPTX::BI__imma_m8n32k16_ld_b_s8:
+ case NVPTX::BI__imma_m8n32k16_ld_b_u8:
+ case NVPTX::BI__imma_m8n32k16_ld_c:
+ // Sub-integer MMA loads.
+ case NVPTX::BI__imma_m8n8k32_ld_a_s4:
+ case NVPTX::BI__imma_m8n8k32_ld_a_u4:
+ case NVPTX::BI__imma_m8n8k32_ld_b_s4:
+ case NVPTX::BI__imma_m8n8k32_ld_b_u4:
+ case NVPTX::BI__imma_m8n8k32_ld_c:
+ case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
+ case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
+ case NVPTX::BI__bmma_m8n8k128_ld_c: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Value *Src = EmitScalarExpr(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
return nullptr;
bool isColMajor = isColMajorArg.getSExtValue();
- unsigned IID;
- unsigned NumResults;
- switch (BuiltinID) {
- case NVPTX::BI__hmma_m16n16k16_ld_a:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride
- : Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m16n16k16_ld_b:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride
- : Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride
- : Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride;
- NumResults = 4;
- break;
- case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride
- : Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m32n8k16_ld_a:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride
- : Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m32n8k16_ld_b:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride
- : Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride
- : Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride;
- NumResults = 4;
- break;
- case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride
- : Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m8n32k16_ld_a:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride
- : Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m8n32k16_ld_b:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride
- : Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride;
- NumResults = 8;
- break;
- case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride
- : Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride;
- NumResults = 4;
- break;
- case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride
- : Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride;
- NumResults = 8;
- break;
- default:
- llvm_unreachable("Unexpected builtin ID.");
- }
+ NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
+ unsigned IID = isColMajor ? II.IID_col : II.IID_row;
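+ // An intrinsic ID of 0 means the requested layout is not supported for
+ // this fragment (e.g. sub-integer A/B fragments support only one layout).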
+ if (IID == 0)
+ return nullptr;
+
Value *Result =
Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
// Save returned values.
- for (unsigned i = 0; i < NumResults; ++i) {
- Builder.CreateAlignedStore(
- Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
- Dst.getElementType()),
- Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
- CharUnits::fromQuantity(4));
+ assert(II.NumResults);
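+ // Intrinsics that return a single element return it as a scalar rather
+ // than an aggregate, so it can be stored directly.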
+ if (II.NumResults == 1) {
+ Builder.CreateAlignedStore(Result, Dst.getPointer(),
+ CharUnits::fromQuantity(4));
+ } else {
+ for (unsigned i = 0; i < II.NumResults; ++i) {
+ Builder.CreateAlignedStore(
+ Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
+ Dst.getElementType()),
+ Builder.CreateGEP(Dst.getPointer(),
+ llvm::ConstantInt::get(IntTy, i)),
+ CharUnits::fromQuantity(4));
+ }
}
return Result;
}
case NVPTX::BI__hmma_m32n8k16_st_c_f16:
case NVPTX::BI__hmma_m32n8k16_st_c_f32:
case NVPTX::BI__hmma_m8n32k16_st_c_f16:
- case NVPTX::BI__hmma_m8n32k16_st_c_f32: {
+ case NVPTX::BI__hmma_m8n32k16_st_c_f32:
+ case NVPTX::BI__imma_m16n16k16_st_c_i32:
+ case NVPTX::BI__imma_m32n8k16_st_c_i32:
+ case NVPTX::BI__imma_m8n32k16_st_c_i32:
+ case NVPTX::BI__imma_m8n8k32_st_c_i32:
+ case NVPTX::BI__bmma_m8n8k128_st_c_i32: {
Value *Dst = EmitScalarExpr(E->getArg(0));
Address Src = EmitPointerWithAlignment(E->getArg(1));
Value *Ldm = EmitScalarExpr(E->getArg(2));
if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext()))
return nullptr;
bool isColMajor = isColMajorArg.getSExtValue();
- unsigned IID;
- unsigned NumResults = 8;
- // PTX Instructions (and LLVM intrinsics) are defined for slice _d_, yet
- // for some reason nvcc builtins use _c_.
- switch (BuiltinID) {
- case NVPTX::BI__hmma_m16n16k16_st_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride
- : Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride;
- NumResults = 4;
- break;
- case NVPTX::BI__hmma_m16n16k16_st_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride
- : Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride;
- break;
- case NVPTX::BI__hmma_m32n8k16_st_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride
- : Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride;
- NumResults = 4;
- break;
- case NVPTX::BI__hmma_m32n8k16_st_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride
- : Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride;
- break;
- case NVPTX::BI__hmma_m8n32k16_st_c_f16:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride
- : Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride;
- NumResults = 4;
- break;
- case NVPTX::BI__hmma_m8n32k16_st_c_f32:
- IID = isColMajor ? Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride
- : Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride;
- break;
- default:
- llvm_unreachable("Unexpected builtin ID.");
- }
- Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
+ NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
+ unsigned IID = isColMajor ? II.IID_col : II.IID_row;
+ if (IID == 0)
+ return nullptr;
+ Function *Intrinsic = CGM.getIntrinsic(IID, Dst->getType());
llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
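+ // Arguments are the destination pointer followed by II.NumResults fragment
+ // elements loaded from Src, each bitcast to the intrinsic's parameter type.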
SmallVector<Value *, 10> Values = {Dst};
- for (unsigned i = 0; i < NumResults; ++i) {
+ for (unsigned i = 0; i < II.NumResults; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(Src.getPointer(), llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
- case NVPTX::BI__hmma_m8n32k16_mma_f16f32: {
+ case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
+ case NVPTX::BI__imma_m16n16k16_mma_s8:
+ case NVPTX::BI__imma_m16n16k16_mma_u8:
+ case NVPTX::BI__imma_m32n8k16_mma_s8:
+ case NVPTX::BI__imma_m32n8k16_mma_u8:
+ case NVPTX::BI__imma_m8n32k16_mma_s8:
+ case NVPTX::BI__imma_m8n32k16_mma_u8:
+ case NVPTX::BI__imma_m8n8k32_mma_s4:
+ case NVPTX::BI__imma_m8n8k32_mma_u4:
+ case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1: {
Address Dst = EmitPointerWithAlignment(E->getArg(0));
Address SrcA = EmitPointerWithAlignment(E->getArg(1));
Address SrcB = EmitPointerWithAlignment(E->getArg(2));
if (Layout < 0 || Layout > 3)
return nullptr;
llvm::APSInt SatfArg;
- if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
+ if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1)
+ SatfArg = 0; // .b1 does not have a satf argument.
+ else if (!E->getArg(5)->isIntegerConstantExpr(SatfArg, getContext()))
return nullptr;
bool Satf = SatfArg.getSExtValue();
-
- // clang-format off
-#define MMA_VARIANTS(geom, type) {{ \
- Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type, \
- Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
- Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type, \
- Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
- Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type, \
- Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
- Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type, \
- Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite \
- }}
- // clang-format on
-
- auto getMMAIntrinsic = [Layout, Satf](std::array<unsigned, 8> Variants) {
- unsigned Index = Layout * 2 + Satf;
- assert(Index < 8);
- return Variants[Index];
- };
- unsigned IID;
- unsigned NumEltsC;
- unsigned NumEltsD;
- switch (BuiltinID) {
- case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f16));
- NumEltsC = 4;
- NumEltsD = 4;
- break;
- case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f16));
- NumEltsC = 4;
- NumEltsD = 8;
- break;
- case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f16_f32));
- NumEltsC = 8;
- NumEltsD = 4;
- break;
- case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(m16n16k16, f32_f32));
- NumEltsC = 8;
- NumEltsD = 8;
- break;
- case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f16));
- NumEltsC = 4;
- NumEltsD = 4;
- break;
- case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f16));
- NumEltsC = 4;
- NumEltsD = 8;
- break;
- case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f16_f32));
- NumEltsC = 8;
- NumEltsD = 4;
- break;
- case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(m32n8k16, f32_f32));
- NumEltsC = 8;
- NumEltsD = 8;
- break;
- case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f16));
- NumEltsC = 4;
- NumEltsD = 4;
- break;
- case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
- IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f16));
- NumEltsC = 4;
- NumEltsD = 8;
- break;
- case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f16_f32));
- NumEltsC = 8;
- NumEltsD = 4;
- break;
- case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
- IID = getMMAIntrinsic(MMA_VARIANTS(m8n32k16, f32_f32));
- NumEltsC = 8;
- NumEltsD = 8;
- break;
- default:
- llvm_unreachable("Unexpected builtin ID.");
- }
-#undef MMA_VARIANTS
+ NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
+ unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
+ if (IID == 0) // Unsupported combination of Layout/Satf.
+ return nullptr;
SmallVector<Value *, 24> Values;
Function *Intrinsic = CGM.getIntrinsic(IID);
- llvm::Type *ABType = Intrinsic->getFunctionType()->getParamType(0);
+ llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
// Load A
- for (unsigned i = 0; i < 8; ++i) {
+ for (unsigned i = 0; i < MI.NumEltsA; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(SrcA.getPointer(),
llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
- Values.push_back(Builder.CreateBitCast(V, ABType));
+ Values.push_back(Builder.CreateBitCast(V, AType));
}
// Load B
- for (unsigned i = 0; i < 8; ++i) {
+ llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
+ for (unsigned i = 0; i < MI.NumEltsB; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(SrcB.getPointer(),
llvm::ConstantInt::get(IntTy, i)),
CharUnits::fromQuantity(4));
- Values.push_back(Builder.CreateBitCast(V, ABType));
+ Values.push_back(Builder.CreateBitCast(V, BType));
}
// Load C
- llvm::Type *CType = Intrinsic->getFunctionType()->getParamType(16);
- for (unsigned i = 0; i < NumEltsC; ++i) {
+ llvm::Type *CType =
+ Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
+ for (unsigned i = 0; i < MI.NumEltsC; ++i) {
Value *V = Builder.CreateAlignedLoad(
Builder.CreateGEP(SrcC.getPointer(),
llvm::ConstantInt::get(IntTy, i)),
}
Value *Result = Builder.CreateCall(Intrinsic, Values);
llvm::Type *DType = Dst.getElementType();
- for (unsigned i = 0; i < NumEltsD; ++i)
+ for (unsigned i = 0; i < MI.NumEltsD; ++i)
Builder.CreateAlignedStore(
Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)),
--- /dev/null
+
+//
+// *** DO NOT EDIT ***
+//
+// This test has been automatically generated by
+// builtins-nvptx-mma.py --ptx=63 --gpu-arch=75
+//
+// Make sure we can handle all builtins available on sm_75 with PTX63
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_75 \
+// RUN: -fcuda-is-device -target-feature +ptx63 \
+// RUN: -DPTX=63 -DSM=75 \
+// RUN: -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefixes=CHECK_PTX61_SM70,CHECK_PTX63_SM75,CHECK_PTX63_SM72,CHECK_PTX60_SM70 %s
+// Verify that all builtins have correct constraints.
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown \
+// RUN: -target-cpu sm_60 -target-feature +ptx42 \
+// RUN: -DPTX=63 -DSM=75 -fcuda-is-device -S -o /dev/null -x cuda \
+// RUN: -verify %s
+
+
+#if !defined(CUDA_VERSION)
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+typedef unsigned long long uint64_t;
+#endif
+
+// CHECK-LABEL: test_wmma_builtins
+__device__ void test_wmma_builtins(int *src, int *dst,
+ float *fsrc, float *fdst, int ldm) {
+
+
+#if (PTX >= 60) && (SM >= 70)
+
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_a(dst, src, ldm, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_a' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_a(dst, src, ldm, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_b(dst, src, ldm, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_b' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_b(dst, src, ldm, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_ld_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_st_c_f16(dst, src, ldm, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_st_c_f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 3, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 3, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f16' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f16f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_PTX60_SM70: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m16n16k16_mma_f32f32' needs target feature sm_70{{.*}},ptx60{{.*}}}}
+ __hmma_m16n16k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+#endif // (PTX >= 60) && (SM >= 70)
+
+#if (PTX >= 61) && (SM >= 70)
+
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_a(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_a(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_b(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_b(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f16(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_a(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_a' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_a(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_b(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_b' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_b(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f16(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_ld_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_ld_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f16(dst, src, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f16(dst, src, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_st_c_f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_st_c_f32(fdst, fsrc, ldm, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m32n8k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m32n8k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f16(dst, src, src, src, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f16.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f16' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f16(fdst, src, src, src, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f16f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f16f32(dst, src, src, fsrc, 0, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 3, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 2, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 1, 1);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 0);
+ // CHECK_PTX61_SM70: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32.satfinite
+ // expected-error-re@+1 {{'__hmma_m8n32k16_mma_f32f32' needs target feature sm_70{{.*}},ptx61{{.*}}}}
+ __hmma_m8n32k16_mma_f32f32(fdst, src, src, fsrc, 0, 1);
+#endif // (PTX >= 61) && (SM >= 70)
+
+#if (PTX >= 63) && (SM >= 72)
+
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_a_s8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_a_s8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_a_u8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_a_u8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_b_s8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_b_s8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_b_u8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_b_u8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_c(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m16n16k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_ld_c(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m16n16k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_st_c_i32(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m16n16k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_st_c_i32(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_a_s8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_a_s8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_a_u8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_a_u8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_b_s8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_b_s8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_b_u8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_b_u8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_c(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m32n8k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_ld_c(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m32n8k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_st_c_i32(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m32n8k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_st_c_i32(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_a_s8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_a_s8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_a_u8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_a_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_a_u8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_b_s8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_b_s8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_b_u8(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_b_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_b_u8(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_c(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n32k16_ld_c' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_ld_c(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n32k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_st_c_i32(dst, src, ldm, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n32k16_st_c_i32' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_st_c_i32(dst, src, ldm, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 3, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 3, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 2, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 2, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 0, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_s8(dst, src, src, src, 0, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 3, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 3, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 2, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.col.row.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 2, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.col.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 0, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m16n16k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m16n16k16_mma_u8(dst, src, src, src, 0, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 3, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 3, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 2, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 2, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 0, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_s8(dst, src, src, src, 0, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 3, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 3, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 2, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.col.row.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 2, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.col.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 0, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m32n8k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m32n8k16_mma_u8(dst, src, src, src, 0, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 3, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 3, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 2, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 2, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 0, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_s8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_s8(dst, src, src, src, 0, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 3, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 3, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 2, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.col.row.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 2, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.col.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 0, 0);
+ // CHECK_PTX63_SM72: call {{.*}} @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8.satfinite
+ // expected-error-re@+1 {{'__imma_m8n32k16_mma_u8' needs target feature sm_72{{.*}},ptx63{{.*}}}}
+ __imma_m8n32k16_mma_u8(dst, src, src, src, 0, 1);
+#endif // (PTX >= 63) && (SM >= 72)
+
+#if (PTX >= 63) && (SM >= 75)
+
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.a.row.stride.b1
+ // expected-error-re@+1 {{'__bmma_m8n8k128_ld_a_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_ld_a_b1(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.b.col.stride.b1
+ // expected-error-re@+1 {{'__bmma_m8n8k128_ld_b_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_ld_b_b1(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.c.col.stride.s32
+ // expected-error-re@+1 {{'__bmma_m8n8k128_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_ld_c(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.load.c.row.stride.s32
+ // expected-error-re@+1 {{'__bmma_m8n8k128_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_ld_c(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.store.d.col.stride.s32
+ // expected-error-re@+1 {{'__bmma_m8n8k128_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_st_c_i32(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.store.d.row.stride.s32
+ // expected-error-re@+1 {{'__bmma_m8n8k128_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_st_c_i32(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.a.row.stride.s4
+ // expected-error-re@+1 {{'__imma_m8n8k32_ld_a_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_ld_a_s4(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.a.row.stride.u4
+ // expected-error-re@+1 {{'__imma_m8n8k32_ld_a_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_ld_a_u4(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.b.col.stride.s4
+ // expected-error-re@+1 {{'__imma_m8n8k32_ld_b_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_ld_b_s4(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.b.col.stride.u4
+ // expected-error-re@+1 {{'__imma_m8n8k32_ld_b_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_ld_b_u4(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.c.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n8k32_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_ld_c(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.load.c.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n8k32_ld_c' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_ld_c(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.store.d.col.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n8k32_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_st_c_i32(dst, src, ldm, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.store.d.row.stride.s32
+ // expected-error-re@+1 {{'__imma_m8n8k32_st_c_i32' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_st_c_i32(dst, src, ldm, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k128.mma.row.col.b1
+ // expected-error-re@+1 {{'__bmma_m8n8k128_mma_xor_popc_b1' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __bmma_m8n8k128_mma_xor_popc_b1(dst, src, src, src, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.s4
+ // expected-error-re@+1 {{'__imma_m8n8k32_mma_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_mma_s4(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.s4.satfinite
+ // expected-error-re@+1 {{'__imma_m8n8k32_mma_s4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_mma_s4(dst, src, src, src, 1, 1);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.u4
+ // expected-error-re@+1 {{'__imma_m8n8k32_mma_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_mma_u4(dst, src, src, src, 1, 0);
+ // CHECK_PTX63_SM75: call {{.*}} @llvm.nvvm.wmma.m8n8k32.mma.row.col.u4.satfinite
+ // expected-error-re@+1 {{'__imma_m8n8k32_mma_u4' needs target feature sm_75{{.*}},ptx63{{.*}}}}
+ __imma_m8n8k32_mma_u4(dst, src, src, src, 1, 1);
+#endif // (PTX >= 63) && (SM >= 75)
+}
--- /dev/null
+# This script generates all variants of wmma builtins, verifies that clang calls
+# correct LLVM intrinsics, and checks that availability of specific builtins is
+# constrained by the correct PTX version and the target GPU variant.
+
+# Dummy test run to avoid lit warnings.
+# RUN: echo "This is not a real test. It's a generator for builtins-nvptx-mma.cu" >/dev/null
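+
+# For reference, the .cu test is regenerated by running this script and
+# capturing its stdout (an assumed invocation, based on the --ptx/--gpu-arch
+# flags defined at the bottom of this file):
+#   python builtins-nvptx-mma.py --ptx=63 --gpu-arch=75 > builtins-nvptx-mma.cu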
+
+from __future__ import print_function
+
+import argparse
+from collections import defaultdict
+from itertools import product
+from string import Template
+
+class MMAFrag:
+  def __init__(self, geom, frag, ptx_elt_type):
+    self.geom = geom
+    self.frag = frag
+    self.ptx_type = ptx_elt_type
+
+  def __repr__(self):
+    return "%s:%s:%s" % (self.geom, self.frag, self.ptx_type)
+
+class MMAOp:
+  def __init__(self, a, b, c, d):
+    self.a = a
+    self.b = b
+    self.c = c
+    self.d = d
+
+  def __repr__(self):
+    return ("{A:%s, B:%s, C:%s, D:%s}" % (self.a, self.b, self.c, self.d))
+
+def make_mma_ops(geoms, types_a, types_b, types_c, types_d):
+  ops = []
+  for geom, type_a, type_c in product(geoms, types_a, types_c):
+    for type_b, type_d in product(types_b if types_b else [type_a],
+                                  types_d if types_d else [type_c]):
+      ops.append(MMAOp(MMAFrag(geom, "a", type_a),
+                       MMAFrag(geom, "b", type_b),
+                       MMAFrag(geom, "c", type_c),
+                       MMAFrag(geom, "d", type_d)))
+  return ops
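+
+# A quick illustration of make_mma_ops: empty types_b/types_d default to
+# types_a/types_c, so
+#   make_mma_ops(["m16n16k16"], ["f16"], [], ["f16", "f32"], ["f16", "f32"])
+# yields four ops, one per C/D type combination, e.g.
+#   {A:m16n16k16:a:f16, B:m16n16k16:b:f16, C:m16n16k16:c:f32, D:m16n16k16:d:f16}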
+
+def make_ldst_ops(geoms, frags, types):
+  return [MMAFrag(geom, frag, ptx_type) for (geom, frag, ptx_type)
+          in product(geoms, frags, types)]
+
+def get_mma_ops():
+  return (make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["f16"], [], ["f16", "f32"], ["f16", "f32"]) +
+          make_mma_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                       ["s8", "u8"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k32"],
+                       ["s4", "u4"], [], ["s32"], []) +
+          make_mma_ops(["m8n8k128"],
+                       ["b1"], [], ["s32"], []))
+
+def get_ldst_ops():
+  return (make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["a", "b"], ["f16", "u8", "s8"]) +
+          make_ldst_ops(["m16n16k16", "m32n8k16", "m8n32k16"],
+                        ["c", "d"], ["f16", "f32", "s32"]) +
+          make_ldst_ops(["m8n8k32"], ["a", "b"], ["s4", "u4"]) +
+          make_ldst_ops(["m8n8k128"], ["a", "b"], ["b1"]) +
+          make_ldst_ops(["m8n8k32", "m8n8k128"], ["c", "d"], ["s32"]))
+
+def is_geom_supported(geom):
+  # Geometries for FP and integer ops.
+  if geom in ["m8n32k16", "m32n8k16"]:
+    return ptx_version >= 61
+  # Geometries for sub-integer ops.
+  if geom in ["m8n8k32", "m8n8k128"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  if geom == "m16n16k16":
+    return ptx_version >= 60
+  assert False  # Unexpected geometry.
+
+def is_type_supported(ptx_type):
+  if ptx_type in ["s8", "u8", "s32"]:
+    return ptx_version >= 63 and gpu_arch >= 72
+  if ptx_type in ["s4", "u4", "b1"]:
+    return ptx_version >= 63 and gpu_arch >= 75
+  return ptx_version >= 60 and gpu_arch >= 70
+
+def is_mma_variant_supported(op, layout_a, layout_b, satf):
+  if not (is_type_supported(op.a.ptx_type)
+          and is_geom_supported(op.a.geom)):
+    return False
+  # Sub-integer ops require row/col layout; b1 additionally has no satf variant.
+  if op.a.ptx_type in ["s4", "u4", "b1"]:
+    if op.a.ptx_type == "b1" and satf:
+      return False
+    return layout_a == "row" and layout_b == "col"
+  return True
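+
+# E.g. for s4/u4 ops only the (row, col) A/B layout combination survives (with
+# or without satf); for b1, only (row, col) without satf. FP and 8-bit integer
+# ops keep all four layout combinations and both satf variants.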
+
+def is_ldst_variant_supported(frag, layout):
+  if not (is_type_supported(frag.ptx_type)
+          and is_geom_supported(frag.geom)):
+    return False
+  if frag.ptx_type in ["s4", "u4", "b1"]:
+    # Sub-integer ops require sm_75/ptx63 and row/col layout for a/b fragments.
+    return ((frag.frag == "a" and layout == "row")
+            or (frag.frag == "b" and layout == "col")
+            or frag.frag in ["c", "d"])
+  return True
+
+def get_builtin_prefix(frag):
+  prefix = None
+  if frag.geom in ["m16n16k16", "m32n8k16", "m8n32k16"]:
+    if frag.ptx_type in ["f16", "f32"]:
+      prefix = "__hmma"
+    else:
+      prefix = "__imma"
+  elif frag.geom == "m8n8k32":
+    prefix = "__imma"  # sub-integers
+  elif frag.geom == "m8n8k128":
+    prefix = "__bmma"
+  assert prefix
+  return prefix
+
+def get_ldst_builtin_name(frag):
+  prefix = get_builtin_prefix(frag)
+
+  if prefix == "__hmma":
+    suffix = "" if frag.frag in ["a", "b"] else frag.ptx_type
+  elif prefix in ["__imma", "__bmma"]:
+    suffix = "" if frag.frag in ["c"] else frag.ptx_type
+    if suffix == "s32":
+      suffix = "i32"
+  if frag.frag == "d":
+    ifrag = "c"
+    op = "st"
+  else:
+    ifrag = frag.frag
+    op = "ld"
+
+  name = "%s_%s_%s_%s%s" % (prefix, frag.geom, op, ifrag,
+                            "_" + suffix if suffix else "")
+  return name
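+
+# E.g. an m16n16k16 "c" fragment of type f32 maps to __hmma_m16n16k16_ld_c_f32,
+# while an s32 "d" fragment becomes a store builtin such as
+# __imma_m16n16k16_st_c_i32.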
+
+def get_mma_builtin_name(op):
+  prefix = get_builtin_prefix(op.a)
+
+  if prefix == "__hmma":
+    suffix = op.d.ptx_type + op.c.ptx_type
+  else:
+    suffix = op.a.ptx_type
+
+  name = "%s_%s_mma%s_%s" % (prefix, op.a.geom,
+                             "_xor_popc" if op.a.ptx_type == "b1" else "",
+                             suffix)
+  return name
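+
+# E.g. an FP op with an f16 result and f32 accumulator maps to
+# __hmma_m16n16k16_mma_f16f32; a b1 op maps to __bmma_m8n8k128_mma_xor_popc_b1.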
+
+
+def get_required_sm(frag):
+  if frag.ptx_type in ["u4", "s4", "b1"]:
+    return 75
+  if frag.ptx_type in ["s8", "u8"]:
+    return 72
+  if frag.ptx_type == "s32":
+    if frag.geom in ["m8n8k32", "m8n8k128"]:  # s4/u4/b1
+      return 75
+    else:  # s8/u8
+      return 72
+  if frag.ptx_type in ["f16", "f32"]:
+    return 70
+  assert False
+
+def get_required_ptx(frag):
+  if frag.ptx_type in ["f16", "f32"]:
+    return 60 if frag.geom == "m16n16k16" else 61
+  return 63
+
+def gen_wmma_ldst_tests(results):
+  load_template = """
+  // CHECK${check_suffix}: call {{.*}} @${intrinsic}
+  // expected-error-re@+1 {{'${builtin}' needs target feature sm_${min_sm}{{.*}},ptx${min_ptx}{{.*}}}}
+  ${builtin}(${dst}, ${src}, ldm, ${blayout});
+""".rstrip()
+  intrinsic_template = "llvm.nvvm.wmma.${geom}.${op}.${frag}.${ilayout}.stride.${itype}"
+
+  for frag, layout in sorted(product(get_ldst_ops(), ["row", "col"]), key=str):
+    if not is_ldst_variant_supported(frag, layout):
+      continue
+
+    is_fp = frag.ptx_type == "f32"
+    min_sm = get_required_sm(frag)
+    min_ptx = get_required_ptx(frag)
+    params = {
+        "check_suffix": "_PTX%d_SM%d" % (min_ptx, min_sm),
+        "builtin": get_ldst_builtin_name(frag),
+        "min_ptx": min_ptx,
+        "min_sm": min_sm,
+        "dst": "fdst" if is_fp else "dst",
+        "src": "fsrc" if is_fp else "src",
+        "blayout": 0 if layout == "row" else 1,
+        "intrinsic": Template(intrinsic_template).substitute({
+            "frag": frag.frag,
+            "geom": frag.geom,
+            "ilayout": layout,
+            "itype": frag.ptx_type,
+            "op": "store" if frag.frag == "d" else "load",
+        })
+    }
+    results[(min_ptx, min_sm)] += Template(load_template).substitute(params)
+
+  return results
+
+def mma_signature(op):
+  if op.a.ptx_type in ["s8", "u8", "s4", "u4", "b1"]:
+    # Integer and sub-integer ops are identified by their input type.
+    return op.a.ptx_type
+  else:
+    # The rest are FP ops identified by accumulator & result type.
+    return "%s.%s" % (op.d.ptx_type, op.c.ptx_type)
+
+# Get the numeric value for the rowcol parameter of the builtin.
+# AFAICT it uses the encoding accepted by NVVM intrinsics:
+# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#nvvm-intrin-warp-level-matrix-mma
+def get_ilayout(a, b):
+  return {
+      "row.row": 0,
+      "row.col": 1,
+      "col.row": 2,
+      "col.col": 3
+  }[a + "." + b]
+
+def gen_wmma_mma_tests(results):
+  mma_template = """
+  // CHECK${check_suffix}: call {{.*}} @${intrinsic}
+  // expected-error-re@+1 {{'${builtin}' needs target feature sm_${min_sm}{{.*}},ptx${min_ptx}{{.*}}}}
+  ${builtin}(${dst}, ${asrc}, ${asrc}, ${csrc}, ${ilayout}${maybe_isatf});
+""".rstrip()
+  intrinsic_template = "llvm.nvvm.wmma.${geom}.mma.${alayout}.${blayout}.${intrinsic_signature}${satf}"
+
+  for op, alayout, blayout, satf in sorted(product(get_mma_ops(),
+                                                   ["row", "col"],
+                                                   ["row", "col"],
+                                                   [".satfinite", ""]),
+                                           key=str):
+    if not is_mma_variant_supported(op, alayout, blayout, satf):
+      continue
+
+    a_is_fp = op.a.ptx_type == "f32"
+    c_is_fp = op.c.ptx_type == "f32"
+    d_is_fp = op.d.ptx_type == "f32"
+    min_sm = get_required_sm(op.a)
+    min_ptx = get_required_ptx(op.a)
+    if op.a.ptx_type == "b1":  # .b1 MMA has no satf argument.
+      isatf_arg = ""
+    else:
+      isatf_arg = ", 1" if satf else ", 0"
+    params = {
+        "check_suffix": "_PTX%d_SM%d" % (min_ptx, min_sm),
+        "builtin": get_mma_builtin_name(op),
+        "min_ptx": min_ptx,
+        "min_sm": min_sm,
+        "dst": "fdst" if d_is_fp else "dst",
+        "asrc": "fsrc" if a_is_fp else "src",
+        "csrc": "fsrc" if c_is_fp else "src",
+        "ilayout": get_ilayout(alayout, blayout),
+        "maybe_isatf": isatf_arg,
+        "intrinsic": Template(intrinsic_template).substitute({
+            "geom": op.a.geom,
+            "alayout": alayout,
+            "blayout": blayout,
+            "intrinsic_signature": mma_signature(op),
+            "satf": satf,
+        })
+    }
+    results[(min_ptx, min_sm)] += Template(mma_template).substitute(params)
+
+  return results
+
+def gen_tests():
+  results = gen_wmma_ldst_tests(defaultdict(str))
+  results = gen_wmma_mma_tests(results)
+
+  run_template = r"""
+//
+// *** DO NOT EDIT ***
+//
+// This test has been automatically generated by
+// builtins-nvptx-mma.py --ptx=${ptx} --gpu-arch=${sm}
+//
+// Make sure we can handle all builtins available on sm_${sm} with PTX${ptx}
+// ${run}: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_${sm} \
+// ${run}:   -fcuda-is-device -target-feature +ptx${ptx} \
+// ${run}:   -DPTX=${ptx} -DSM=${sm} \
+// ${run}:   -S -emit-llvm -o - -x cuda %s \
+// ${run}:   | FileCheck -check-prefixes=${check_labels} %s
+// Verify that all builtins have correct constraints.
+// ${run}: %clang_cc1 -triple nvptx-unknown-unknown \
+// ${run}:   -target-cpu sm_60 -target-feature +ptx42 \
+// ${run}:   -DPTX=${ptx} -DSM=${sm} -fcuda-is-device -S -o /dev/null -x cuda \
+// ${run}:   -verify %s
+"""
+
+  def supported_variants(ptx, sm, results):
+    return [(ptx_, sm_) for ptx_, sm_ in results if ptx_ <= ptx and sm_ <= sm]
+
+  print(Template(run_template).substitute({
+      "run": "RUN",  # To avoid lit misinterpreting the template.
+      "ptx": ptx_version,
+      "sm": gpu_arch,
+      "check_labels": ",".join(["CHECK_PTX%d_SM%d" % (ptx_, sm_)
+                                for ptx_, sm_
+                                in supported_variants(ptx_version, gpu_arch,
+                                                      results)])
+  }))
+
+  print("""
+#if !defined(CUDA_VERSION)
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+
+typedef unsigned long long uint64_t;
+#endif
+
+// CHECK-LABEL: test_wmma_buitins
+__device__ void test_wmma_buitins(int *src, int *dst,
+                                  float *fsrc, float *fdst, int ldm) {
+""")
+
+  for (ptx, sm), tests in sorted(results.items()):
+    print()
+    print("#if (PTX >= %d) && (SM >= %d)" % (ptx, sm))
+    print(tests)
+    print("#endif // (PTX >= %d) && (SM >= %d)" % (ptx, sm))
+
+  print("}")
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--ptx", type=int, default=60)
+parser.add_argument("--gpu-arch", type=int, default=70)
+args = parser.parse_args()
+ptx_version = args.ptx
+gpu_arch = args.gpu_arch
+
+gen_tests()