AMDGPU: Fix issue in shl(or) combine

author Ruiling Song <ruiling.song@amd.com>

Wed, 10 May 2023 03:07:00 +0000 (11:07 +0800)

committer Ruiling Song <ruiling.song@amd.com>

Fri, 12 May 2023 11:50:25 +0000 (19:50 +0800)
author Ruiling Song <ruiling.song@amd.com>
Wed, 10 May 2023 03:07:00 +0000 (11:07 +0800)
committer Ruiling Song <ruiling.song@amd.com>
Fri, 12 May 2023 11:50:25 +0000 (19:50 +0800)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index cbbb2f6b9f40bfd3db4af4b184dcb6c6e5ecca88..6c8aa6c9ecd94d0db9f9fb3a41768e27041f477d 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9550,6 +9550,8 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
  }
  
  // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
+// bits
  
  // This is a variant of
  // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
@@ -9584,8 +9586,14 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
    if (!CAdd)
      return SDValue();
  
-  // If the resulting offset is too large, we can't fold it into the addressing
-  // mode offset.
+  SelectionDAG &DAG = DCI.DAG;
+
+  if (N0->getOpcode() == ISD::OR &&
+      !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
+    return SDValue();
+
+  // If the resulting offset is too large, we can't fold it into the
+  // addressing mode offset.
    APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
    Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
  
@@ -9595,7 +9603,6 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
    if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
      return SDValue();
  
-  SelectionDAG &DAG = DCI.DAG;
    SDLoc SL(N);
    EVT VT = N->getValueType(0);
  
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll

index b89d9ea027868c25eee261ca07ac77aa43fdb426..670dd72ba2fb352b14032abf3b7603ad757e0c28 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -410,15 +410,12 @@ define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.a
    ret void
  }
  
-; FIXME: This or should fold into an offset on the write
  ; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
-; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
-; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
-; GCN: v_lshlrev_b32_e32 [[SCALE2:v[0-9]+]], 4, v0
-; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}
-; GCN: ds_write_b32 [[SCALE2]], v{{[0-9]+}} offset:64
+; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
+; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
  define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
-  %idx.add = or i32 %idx, 4
+  %idx.shl = shl i32 %idx, 1
+  %idx.add = or i32 %idx.shl, 1
    %shl0 = shl i32 %idx.add, 3
    %shl1 = shl i32 %idx.add, 4
    %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
@@ -427,15 +424,14 @@ define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
    store volatile i32 10, ptr addrspace(3) %ptr1
    ret void
  }
-
-; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_max_lds_offset:
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
-; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
-; GCN-DAG: v_or_b32_e32 [[ADD1:v[0-9]+]], 0x1fff0, [[SCALE1]]
-; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
-define void @shl_or_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
-  %idx.add = or i32 %idx, 8191
+; GCN-LABEL: {{^}}shl_or_ptr_not_combine_2use_lds:
+; GCN:     v_or_b32_e32 [[OR:v[0-9]+]], 1, v0
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, [[OR]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, [[OR]]
+; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}}{{$}}
+; GCN-DAG: ds_write_b32 [[SCALE1]], v{{[0-9]+}}{{$}}
+define void @shl_or_ptr_not_combine_2use_lds(i32 %idx) #0 {
+  %idx.add = or i32 %idx, 1
    %shl0 = shl i32 %idx.add, 3
    %shl1 = shl i32 %idx.add, 4
    %ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
author	Ruiling Song <ruiling.song@amd.com>
	Wed, 10 May 2023 03:07:00 +0000 (11:07 +0800)
committer	Ruiling Song <ruiling.song@amd.com>
	Fri, 12 May 2023 11:50:25 +0000 (19:50 +0800)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll		patch | blob | history