[AMDGPU] Use three- and five-dword result type in image ops

author Tim Renouf <tpr.llvm@botech.co.uk>
Fri, 22 Mar 2019 15:21:11 +0000 (15:21 +0000)

committer Tim Renouf <tpr.llvm@botech.co.uk>
Fri, 22 Mar 2019 15:21:11 +0000 (15:21 +0000)

diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td

index 4071adc..6905101 100644 (file)
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -178,8 +178,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
      defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
      let VDataDwords = 4 in
      defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_NoSampler_Src_Helper <op, asm, VReg_160, 0>;
    }
  }
  
@@ -412,8 +412,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
      defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
      let VDataDwords = 4 in
      defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
    }
  }
  
@@ -433,8 +433,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
      defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
      let VDataDwords = 4 in
      defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
-    let VDataDwords = 8 in
-    defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
+    let VDataDwords = 5 in
+    defm _V5 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_160>;
    }
  }
  
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 4155a01..8437e4b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4701,14 +4701,14 @@ static SDValue constructRetValue(SelectionDAG &DAG,
    EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
                             : AdjEltVT;
  
-  // Special case for v8f16. Rather than add support for this, use v4i32 to
+  // Special case for v6f16. Rather than add support for this, use v3i32 to
    // extract the data elements
-  bool V8F16Special = false;
-  if (CastVT == MVT::v8f16) {
-    CastVT = MVT::v4i32;
+  bool V6F16Special = false;
+  if (NumElts == 6) {
+    CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
      DMaskPop >>= 1;
      ReqRetNumElts >>= 1;
-    V8F16Special = true;
+    V6F16Special = true;
      AdjVT = MVT::v2i32;
    }
  
@@ -4738,7 +4738,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
      PreTFCRes = BVElts[0];
    }
  
-  if (V8F16Special)
+  if (V6F16Special)
      PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
  
    if (!IsTexFail) {
@@ -4971,9 +4971,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
        return Undef;
      }
  
-    // Have to use a power of 2 number of dwords
-    NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
-
      EVT NewVT = NumVDataDwords > 1 ?
                    EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
                  : MVT::f32;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll

index b297aca..6084789 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -22,7 +22,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
@@ -45,7 +45,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
+; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
@@ -76,7 +76,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
@@ -107,7 +107,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
@@ -138,7 +138,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
@@ -169,7 +169,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
@@ -200,7 +200,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
@@ -231,7 +231,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
@@ -262,7 +262,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
+; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
@@ -293,7 +293,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
+; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
@@ -324,7 +324,7 @@ main_body:
  ; NOPRT-NOT: v_mov_b32_e32 v1
  ; NOPRT-NOT: v_mov_b32_e32 v2
  ; NOPRT-NOT: v_mov_b32_e32 v3
-; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
+; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
  ; SIVI: buffer_store_dword v4, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
  define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
@@ -451,7 +451,7 @@ main_body:
  ; NOPRT: v_mov_b32_e32 v2, 0
  ; NOPRT-NOT: v_mov_b32_e32 v0
  ; NOPRT-NOT: v_mov_b32_e32 v1
-; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
+; GCN: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
  ; SIVI: buffer_store_dword v2, off, s[8:11], 0
  ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
  define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll

index be579b8..b05b85e 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -19,7 +19,7 @@ main_body:
  }
  
  ; GCN-LABEL: {{^}}load.v3f32.1d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16
  define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
  main_body:
    %x = extractelement <2 x i16> %coords, i32 0
@@ -57,7 +57,7 @@ main_body:
  }
  
  ; GCN-LABEL: {{^}}load.v3f32.2d:
-; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm a16
  define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
  main_body:
    %x = extractelement <2 x i16> %coords, i32 0
@@ -99,7 +99,7 @@ main_body:
  }
  
  ; GCN-LABEL: {{^}}load.v3f32.3d:
-; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
+; GCN: image_load v[0:2], v[0:1], s[0:7] dmask:0x7 unorm a16
  define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
  main_body:
    %x = extractelement <2 x i16> %coords_lo, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll

index 2ee69ac..28e747c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -15,7 +15,7 @@ main_body:
  ; GCN: v_mov_b32_e32 v2, v0
  ; GCN: v_mov_b32_e32 v3, v0
  ; GCN: v_mov_b32_e32 v4, v0
-; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
+; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
  define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
  main_body:
    %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
@@ -155,7 +155,7 @@ main_body:
  ; GCN: v_mov_b32_e32 v2, v0
  ; GCN: v_mov_b32_e32 v3, v0
  ; GCN: v_mov_b32_e32 v4, v0
-; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
+; GCN: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
  define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
  main_body:
    %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
@@ -537,7 +537,7 @@ main_body:
  }
  
  ; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe:
-; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
+; GCN: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
  define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
  main_body:
    %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
author	Tim Renouf <tpr.llvm@botech.co.uk>
	Fri, 22 Mar 2019 15:21:11 +0000 (15:21 +0000)
committer	Tim Renouf <tpr.llvm@botech.co.uk>
	Fri, 22 Mar 2019 15:21:11 +0000 (15:21 +0000)
Files changed in this commit:
llvm/lib/Target/AMDGPU/MIMGInstructions.td
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll