[AMDGPU] Unnecessary -amdgpu-scalarize-global-loads=false flag removed from min/max tests.

author Alexander Timofeev <Alexander.Timofeev@amd.com>

Thu, 19 Sep 2019 16:44:38 +0000 (16:44 +0000)

committer Alexander Timofeev <Alexander.Timofeev@amd.com>

Thu, 19 Sep 2019 16:44:38 +0000 (16:44 +0000)
author Alexander Timofeev <Alexander.Timofeev@amd.com>
Thu, 19 Sep 2019 16:44:38 +0000 (16:44 +0000)
committer Alexander Timofeev <Alexander.Timofeev@amd.com>
Thu, 19 Sep 2019 16:44:38 +0000 (16:44 +0000)
diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll

index 716caf2..b53d65e 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=pitcairn < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=pitcairn < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
  
  
  ; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
@@ -7,8 +7,10 @@
  
  ; EG: MAX_INT
  define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
    %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+  %b = load i32, i32 addrspace(1)* %gep.in, align 4
    %cmp = icmp sge i32 %a, %b
    %val = select i1 %cmp, i32 %a, i32 %b
    store i32 %val, i32 addrspace(1)* %out, align 4
@@ -27,8 +29,10 @@ define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrs
  ; EG: MAX_INT
  ; EG: MAX_INT
  define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %bptr, i32 %tid
    %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
-  %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in, align 4
    %cmp = icmp sge <4 x i32> %a, %b
    %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
    store <4 x i32> %val, <4 x i32> addrspace(1)* %out, align 4
@@ -101,8 +105,10 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %ou
  
  ; EG: MAX_INT
  define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
    %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+  %b = load i32, i32 addrspace(1)* %gep.in, align 4
    %cmp = icmp sgt i32 %a, %b
    %val = select i1 %cmp, i32 %a, i32 %b
    store i32 %val, i32 addrspace(1)* %out, align 4
@@ -125,8 +131,10 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i
  
  ; EG: MAX_UINT
  define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
    %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+  %b = load i32, i32 addrspace(1)* %gep.in, align 4
    %cmp = icmp uge i32 %a, %b
    %val = select i1 %cmp, i32 %a, i32 %b
    store i32 %val, i32 addrspace(1)* %out, align 4
@@ -182,7 +190,9 @@ define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspac
  
  ; EG: MAX_UINT
  define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
+  %a = load i32, i32 addrspace(1)* %gep.in, align 4
    %b = load i32, i32 addrspace(1)* %bptr, align 4
    %cmp = icmp ugt i32 %a, %b
    %val = select i1 %cmp, i32 %a, i32 %b
@@ -320,3 +330,9 @@ define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64
    store i64 %val, i64 addrspace(1)* %out, align 8
    ret void
  }
+
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll

index ea4b700..e56db7e 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -1,7 +1,7 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
+; RUN:  llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
+; RUN:  llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
+; RUN:  llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN:  llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG,FUNC %s
  
  ; FUNC-LABEL: {{^}}s_abs_i32:
  ; GCN: s_abs_i32
@@ -28,7 +28,9 @@ define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind
  
  ; EG: MAX_INT
  define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
-  %val = load i32, i32 addrspace(1)* %src, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in, align 4
    %neg = sub i32 0, %val
    %cond = icmp sgt i32 %val, %neg
    %res = select i1 %cond, i32 %val, i32 %neg
@@ -43,7 +45,9 @@ define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
  ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
  ; GCN: v_mul_lo_u32 v{{[0-9]+}}, [[MAX]], [[MAX]]
  define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
-  %val = load i32, i32 addrspace(1)* %src, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds i32, i32 addrspace(1)* %src, i32 %tid
+  %val = load i32, i32 addrspace(1)* %gep.in, align 4
    %neg = sub i32 0, %val
    %cond = icmp sgt i32 %val, %neg
    %res = select i1 %cond, i32 %val, i32 %neg
@@ -96,7 +100,9 @@ define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> a
    %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
    %t0 = insertelement <2 x i32> undef, i32 2, i32 0
    %t1 = insertelement <2 x i32> %t0, i32 2, i32 1
-  %val = load <2 x i32>, <2 x i32> addrspace(1)* %src, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %src, i32 %tid
+  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in, align 4
    %neg = sub <2 x i32> %z1, %val
    %cond = icmp sgt <2 x i32> %val, %neg
    %res = select <2 x i1> %cond, <2 x i32> %val, <2 x i32> %neg
@@ -178,7 +184,9 @@ define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
    %t1 = insertelement <4 x i32> %t0, i32 2, i32 1
    %t2 = insertelement <4 x i32> %t1, i32 2, i32 2
    %t3 = insertelement <4 x i32> %t2, i32 2, i32 3
-  %val = load <4 x i32>, <4 x i32> addrspace(1)* %src, align 4
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %src, i32 %tid
+  %val = load <4 x i32>, <4 x i32> addrspace(1)* %gep.in, align 4
    %neg = sub <4 x i32> %z3, %val
    %cond = icmp sgt <4 x i32> %val, %neg
    %res = select <4 x i1> %cond, <4 x i32> %val, <4 x i32> %neg
@@ -259,3 +267,8 @@ define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrs
    store volatile i1 %cond0, i1 addrspace(1)* undef
    ret void
  }
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll

index 8efc1ce..184082d 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,CIVI,GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI,CIVI,GCN %s
  
  ; GCN-LABEL: {{^}}s_abs_v2i16:
  ; GFX9: s_load_dword [[VAL:s[0-9]+]]
@@ -85,7 +85,7 @@ define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16>
  }
  
  ; GCN-LABEL: {{^}}v_abs_v2i16_2:
-; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: global_load_dword [[VAL:v[0-9]+]]
  ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
  ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
  ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0]
@@ -94,7 +94,9 @@ define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16>
    %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
    %t0 = insertelement <2 x i16> undef, i16 2, i16 0
    %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
-  %val = load <2 x i16>, <2 x i16> addrspace(1)* %src, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
+  %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4
    %neg = sub <2 x i16> %z1, %val
    %cond = icmp sgt <2 x i16> %val, %neg
    %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
@@ -129,7 +131,7 @@ define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %
  }
  
  ; GCN-LABEL: {{^}}v_abs_v4i16:
-; GFX9: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
  
  ; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
  ; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
@@ -147,7 +149,9 @@ define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
    %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
    %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
    %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
-  %val = load <4 x i16>, <4 x i16> addrspace(1)* %src, align 4
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %src, i32 %tid
+  %val = load <4 x i16>, <4 x i16> addrspace(1)* %gep.in, align 4
    %neg = sub <4 x i16> %z3, %val
    %cond = icmp sgt <4 x i16> %val, %neg
    %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
author	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Thu, 19 Sep 2019 16:44:38 +0000 (16:44 +0000)
committer	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Thu, 19 Sep 2019 16:44:38 +0000 (16:44 +0000)
llvm/test/CodeGen/AMDGPU/max.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/sminmax.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll		patch | blob | history