[AMDGPU] Split dot2-insts feature
authorJay Foad <jay.foad@amd.com>
Tue, 16 Mar 2021 16:01:03 +0000 (16:01 +0000)
committerJay Foad <jay.foad@amd.com>
Wed, 17 Mar 2021 09:42:21 +0000 (09:42 +0000)
Split out some of the instructions predicated on the dot2-insts target
feature into a new dot7-insts, in preparation for subtargets that have
some but not all of these instructions. NFCI.

Differential Revision: https://reviews.llvm.org/D98717

clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/Basic/Targets/AMDGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-features.cl
clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/VOP3PInstructions.td

index 415d8cb..9677b1a 100644 (file)
@@ -193,13 +193,13 @@ TARGET_BUILTIN(__builtin_amdgcn_fmed3h, "hhhh", "nc", "gfx9-insts")
 // Deep learning builtins.
 //===----------------------------------------------------------------------===//
 
-TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
-TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot2-insts")
+TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
 
 //===----------------------------------------------------------------------===//
 // GFX10+ only builtins.
index a84422e..75115db 100644 (file)
@@ -183,6 +183,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
       Features["dot2-insts"] = true;
       Features["dot5-insts"] = true;
       Features["dot6-insts"] = true;
+      Features["dot7-insts"] = true;
       Features["dl-insts"] = true;
       Features["flat-address-space"] = true;
       Features["16-bit-insts"] = true;
@@ -200,6 +201,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
       Features["dot2-insts"] = true;
       Features["dot5-insts"] = true;
       Features["dot6-insts"] = true;
+      Features["dot7-insts"] = true;
       LLVM_FALLTHROUGH;
     case GK_GFX1010:
       Features["dl-insts"] = true;
@@ -227,6 +229,7 @@ bool AMDGPUTargetInfo::initFeatureMap(
       Features["dl-insts"] = true;
       Features["dot1-insts"] = true;
       Features["dot2-insts"] = true;
+      Features["dot7-insts"] = true;
       LLVM_FALLTHROUGH;
     case GK_GFX90C:
     case GK_GFX909:
index 930c537..5c90598 100644 (file)
 // GFX900: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX902: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX904: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
+// GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
+// GFX90A: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst"
 // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1031: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1032: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
-// GFX1033: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX1031: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX1032: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
+// GFX1033: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst"
 
 kernel void test() {}
index cb83c0b..e7a71b5 100644 (file)
@@ -13,8 +13,8 @@ kernel void builtins_amdgcn_dl_insts_err(
     half2 v2hA, half2 v2hB, float fC,
     short2 v2ssA, short2 v2ssB, int siA, int siB, int siC,
     ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) {
-  fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);     // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot2-insts}}
-  fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);      // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot2-insts}}
+  fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false);     // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
+  fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true);      // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}}
 
   siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
   siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true);  // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
@@ -25,12 +25,12 @@ kernel void builtins_amdgcn_dl_insts_err(
   siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false);     // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}}
   siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true);      // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}}
 
-  uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);     // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot2-insts}}
-  uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);      // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot2-insts}}
+  uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false);     // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
+  uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true);      // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}}
 
   siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false);     // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
   siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true);      // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}}
 
-  uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);     // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot2-insts}}
-  uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);      // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot2-insts}}
+  uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false);     // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
+  uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true);      // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}}
 }
index 121373e..f061dd5 100644 (file)
@@ -480,7 +480,7 @@ def FeatureDot1Insts : SubtargetFeature<"dot1-insts",
 def FeatureDot2Insts : SubtargetFeature<"dot2-insts",
   "HasDot2Insts",
   "true",
-  "Has v_dot2_f32_f16, v_dot2_i32_i16, v_dot2_u32_u16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+  "Has v_dot2_i32_i16, v_dot2_u32_u16 instructions"
 >;
 
 def FeatureDot3Insts : SubtargetFeature<"dot3-insts",
@@ -507,6 +507,12 @@ def FeatureDot6Insts : SubtargetFeature<"dot6-insts",
   "Has v_dot4c_i32_i8 instruction"
 >;
 
+def FeatureDot7Insts : SubtargetFeature<"dot7-insts",
+  "HasDot7Insts",
+  "true",
+  "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+>;
+
 def FeatureMAIInsts : SubtargetFeature<"mai-insts",
   "HasMAIInsts",
   "true",
@@ -902,6 +908,7 @@ def FeatureISAVersion9_0_6 : FeatureSet<
    FeatureDLInsts,
    FeatureDot1Insts,
    FeatureDot2Insts,
+   FeatureDot7Insts,
    FeatureSupportsSRAMECC,
    FeatureImageGather4D16Bug]>;
 
@@ -920,6 +927,7 @@ def FeatureISAVersion9_0_8 : FeatureSet<
    FeatureDot4Insts,
    FeatureDot5Insts,
    FeatureDot6Insts,
+   FeatureDot7Insts,
    FeatureMAIInsts,
    FeaturePkFmacF16Inst,
    FeatureAtomicFaddInsts,
@@ -948,6 +956,7 @@ def FeatureISAVersion9_0_A : FeatureSet<
    FeatureDot4Insts,
    FeatureDot5Insts,
    FeatureDot6Insts,
+   FeatureDot7Insts,
    Feature64BitDPP,
    FeaturePackedFP32Ops,
    FeatureMAIInsts,
@@ -1008,6 +1017,7 @@ def FeatureISAVersion10_1_1 : FeatureSet<
      FeatureDot2Insts,
      FeatureDot5Insts,
      FeatureDot6Insts,
+     FeatureDot7Insts,
      FeatureNSAEncoding,
      FeatureWavefrontSize32,
      FeatureScalarStores,
@@ -1028,6 +1038,7 @@ def FeatureISAVersion10_1_2 : FeatureSet<
      FeatureDot2Insts,
      FeatureDot5Insts,
      FeatureDot6Insts,
+     FeatureDot7Insts,
      FeatureNSAEncoding,
      FeatureWavefrontSize32,
      FeatureScalarStores,
@@ -1049,6 +1060,7 @@ def FeatureISAVersion10_3_0 : FeatureSet<
    FeatureDot2Insts,
    FeatureDot5Insts,
    FeatureDot6Insts,
+   FeatureDot7Insts,
    FeatureNSAEncoding,
    FeatureWavefrontSize32,
    FeatureShaderCyclesRegister]>;
@@ -1373,6 +1385,9 @@ def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
 def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
   AssemblerPredicate<(all_of FeatureDot6Insts)>;
 
+def HasDot7Insts : Predicate<"Subtarget->hasDot7Insts()">,
+  AssemblerPredicate<(all_of FeatureDot7Insts)>;
+
 def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
   AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
 
index dfab7cc..4b2ec01 100644 (file)
@@ -267,6 +267,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasDot4Insts(false),
     HasDot5Insts(false),
     HasDot6Insts(false),
+    HasDot7Insts(false),
     HasMAIInsts(false),
     HasPkFmacF16Inst(false),
     HasAtomicFaddInsts(false),
index fc45f7e..f28462a 100644 (file)
@@ -150,6 +150,7 @@ protected:
   bool HasDot4Insts;
   bool HasDot5Insts;
   bool HasDot6Insts;
+  bool HasDot7Insts;
   bool HasMAIInsts;
   bool HasPkFmacF16Inst;
   bool HasAtomicFaddInsts;
@@ -687,6 +688,10 @@ public:
     return HasDot6Insts;
   }
 
+  bool hasDot7Insts() const {
+    return HasDot7Insts;
+  }
+
   bool hasMAIInsts() const {
     return HasMAIInsts;
   }
index 5a3eaa6..eb5b06e 100644 (file)
@@ -10486,7 +10486,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SDLoc SL(N);
 
-  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
+  if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
     return SDValue();
 
   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
index 073e4df..880a6c6 100644 (file)
@@ -287,19 +287,24 @@ class SDot2Pat<Instruction Inst> : GCNPat <
 let IsDOT = 1 in {
 let SubtargetPredicate = HasDot2Insts in {
 
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
-  VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
-  AMDGPUfdot2, 1/*ExplicitClamp*/>;
 def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
   VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
 def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
   VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
+  VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+  AMDGPUfdot2, 1/*ExplicitClamp*/>;
 def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
   VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
 def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4",
   VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
 
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
 
 let SubtargetPredicate = HasDot1Insts in {
 
@@ -564,13 +569,18 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>;
 
 let SubtargetPredicate = HasDot2Insts in {
 
-defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
 defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
 defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
 defm V_DOT4_U32_U8  : VOP3P_Real_vi <0x29>;
 defm V_DOT8_U32_U4  : VOP3P_Real_vi <0x2b>;
 
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
 
 let SubtargetPredicate = HasDot1Insts in {
 
@@ -657,13 +667,18 @@ defm V_FMA_MIXHI_F16  : VOP3P_Real_gfx10<0x22>;
 
 let SubtargetPredicate = HasDot2Insts in {
 
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
 defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
 defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
+
+} // End SubtargetPredicate = HasDot2Insts
+
+let SubtargetPredicate = HasDot7Insts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
 defm V_DOT4_U32_U8  : VOP3P_Real_gfx10 <0x17>;
 defm V_DOT8_U32_U4  : VOP3P_Real_gfx10 <0x19>;
 
-} // End SubtargetPredicate = HasDot2Insts
+} // End SubtargetPredicate = HasDot7Insts
 
 let SubtargetPredicate = HasDot1Insts in {