[AMDGPU] Don't combine memory intrs to v3i16

author Sebastian Neubauer <sebastian.neubauer@amd.com>

Tue, 21 Jul 2020 08:28:12 +0000 (10:28 +0200)

committer Sebastian Neubauer <sebastian.neubauer@amd.com>

Wed, 22 Jul 2020 10:44:01 +0000 (12:44 +0200)
author Sebastian Neubauer <sebastian.neubauer@amd.com>
Tue, 21 Jul 2020 08:28:12 +0000 (10:28 +0200)
committer Sebastian Neubauer <sebastian.neubauer@amd.com>
Wed, 22 Jul 2020 10:44:01 +0000 (12:44 +0200)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

index 7cfe4c8b5892b6201ccf12fb44d716c886cfd875..c7f2f4ec3ca1073fd973ed0030aab0ffc85f9313 100644 (file)
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1030,12 +1030,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                             APInt DemandedElts,
                                                             int DMaskIdx) {
  
-  // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
-  if (DMaskIdx < 0 &&
-      II->getType()->getScalarSizeInBits() != 32 &&
-      DemandedElts.getActiveBits() == 3)
-    return nullptr;
-
    auto *IIVTy = cast<VectorType>(II->getType());
    unsigned VWidth = IIVTy->getNumElements();
    if (VWidth == 1)
@@ -1124,6 +1118,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
    if (!NewNumElts)
      return UndefValue::get(II->getType());
  
+  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
+  // fully supported.
+  if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
+    return nullptr;
+
    if (NewNumElts >= VWidth && DemandedElts.isMask()) {
      if (DMaskIdx >= 0)
        II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

index c2f5e2857be08e2902e1723ae381b7c5304a1cca..3d58f0608081b9144916060fb8824e4ca4ab2878 100644 (file)
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -2965,6 +2965,64 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl
  ; llvm.amdgcn.image.sample.cd.cl
  ; --------------------------------------------------------------------
  
+; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 3
+  ret half %elt0
+}
+
+; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 2
+  ret half %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: ret half %data
+define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %elt0 = extractelement <4 x half> %data, i32 1
+  ret half %elt0
+}
+
+; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32).
+; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT: ret <4 x half> %res
+define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x half> %res
+}
+
+; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: ret <4 x half> %res
+define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x half> %res
+}
+
+; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
+; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0
+; CHECK-NEXT: ret <4 x half> %res
+define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
+  %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
+  %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+  ret <4 x half> %res
+}
+
  ; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
  ; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
  ; CHECK-NEXT: ret half %data
author	Sebastian Neubauer <sebastian.neubauer@amd.com>
	Tue, 21 Jul 2020 08:28:12 +0000 (10:28 +0200)
committer	Sebastian Neubauer <sebastian.neubauer@amd.com>
	Wed, 22 Jul 2020 10:44:01 +0000 (12:44 +0200)
llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp		patch \| blob \| history
llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll		patch \| blob \| history