AMDGPU: Enable store clustering

author Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 15 Nov 2016 20:22:55 +0000 (20:22 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 15 Nov 2016 20:22:55 +0000 (20:22 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 15 Nov 2016 20:22:55 +0000 (20:22 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 15 Nov 2016 20:22:55 +0000 (20:22 +0000)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp

index 42c7b967f3e79d20bb07d421de997517d000fde9..f88bb69c3a7439e4b6740fcad1fcd7462360a58b 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -36,6 +36,10 @@ bool AMDGPUInstrInfo::enableClusterLoads() const {
    return true;
  }
  
+bool AMDGPUInstrInfo::enableClusterStores() const {
+  return true;
+}
+
  // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
  // the first 16 loads will be interleaved with the stores, and the next 16 will
  // be clustered as expected. It should really split into 2 16 store batches.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h

index de834f453a6734fc0c3eb1703f0b23792580d17a..46e985dc8fde9f5d083e7bccf23be7c13209f910 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -40,6 +40,7 @@ public:
    explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
  
    bool enableClusterLoads() const override;
+  bool enableClusterStores() const override;
  
    bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                 int64_t Offset1, int64_t Offset2,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 877dacd06f7ba359ac99c5852418b588e638c8b3..baf4d192c57fb9e3d1d390a261a6ccd9327715cf 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -102,7 +102,14 @@ static ScheduleDAGInstrs *
  createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
    ScheduleDAGMILive *DAG =
        new ScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
-  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
+  if (TII->enableClusterLoads())
+    DAG->addMutation(createLoadClusterDAGMutation(TII, DAG->TRI));
+
+  if (TII->enableClusterStores())
+    DAG->addMutation(createStoreClusterDAGMutation(TII, DAG->TRI));
+
    return DAG;
  }
  
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

index d86f67a1334ed4f0cb8cfe8f4d9df448b5ab5a8a..4beefb047f2211c2310b00c075bbfded657337c6 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -207,13 +207,21 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
  }
  
  ; FUNC-LABEL: {{^}}reorder_global_offsets_addr64_soffset0:
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:20{{$}}
  ; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:12{{$}}
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:28{{$}}
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:44{{$}}
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:28{{$}}
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:44{{$}}
+
+; GCN: v_mov_b32
+; GCN: v_mov_b32
+
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64{{$}}
+; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:20{{$}}
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
  ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
+; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
  define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
    %id = call i32 @llvm.amdgcn.workitem.id.x()
    %id.ext = sext i32 %id to i64
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 15 Nov 2016 20:22:55 +0000 (20:22 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 15 Nov 2016 20:22:55 +0000 (20:22 +0000)
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp		patch | blob | history
llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h		patch | blob | history
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll		patch | blob | history