From 10c911db63ece5b0a7e144b21762bb20d5de0eb1 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 1 Jul 2019 17:17:52 +0000 Subject: [PATCH] AMDGPU/GFX10: implement ds_ordered_count changes Summary: ds_ordered_count can now simultaneously operate on up to 4 dwords in a single instruction, which are taken from (and returned to) lanes 0..3 of a single VGPR. Change-Id: I19b6e7b0732b617c10a779a7f9c0303eec7dd276 Reviewers: mareko, arsenm, rampitec Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D63716 llvm-svn: 364815 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 23 +++++++++++++++++++++- .../AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll | 23 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index b6ab610..9b467a2 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -406,6 +406,7 @@ class AMDGPUDSOrderedIntrinsic : Intrinsic< llvm_i32_ty, // scope llvm_i1_ty, // isVolatile llvm_i32_ty, // ordered count index (OA index), also added to the address + // gfx10: bits 24-27 indicate the number of active threads/dwords llvm_i1_ty, // wave release, usually set to 1 llvm_i1_ty], // wave done, set to 1 for the last ordered instruction [NoCapture<0>, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 44d9987..5e6325a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5891,12 +5891,29 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue Chain = M->getOperand(0); SDValue M0 = M->getOperand(2); SDValue Value = M->getOperand(3); - unsigned OrderedCountIndex = M->getConstantOperandVal(7); + unsigned IndexOperand = M->getConstantOperandVal(7); unsigned WaveRelease = M->getConstantOperandVal(8); unsigned WaveDone = M->getConstantOperandVal(9); unsigned ShaderType; unsigned Instruction; + unsigned OrderedCountIndex = IndexOperand & 0x3f; + IndexOperand &= ~0x3f; + unsigned CountDw = 0; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { + CountDw = (IndexOperand >> 24) & 0xf; + IndexOperand &= ~(0xf << 24); + + if (CountDw < 1 || CountDw > 4) { + report_fatal_error( + "ds_ordered_count: dword count must be between 1 and 4"); + } + } + + if (IndexOperand) + report_fatal_error("ds_ordered_count: bad index operand"); + switch (IntrID) { case Intrinsic::amdgcn_ds_ordered_add: Instruction = 0; @@ -5930,6 +5947,10 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned Offset0 = OrderedCountIndex << 2; unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | (Instruction << 4); + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) + Offset1 |= (CountDw - 1) << 6; + unsigned Offset = Offset0 | (Offset1 << 8); SDValue Ops[] = { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll new file mode 100644 index 0000000..c74310f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx10.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s + +; FUNC-LABEL: {{^}}ds_ordered_add: +; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31 +; GCN-DAG: s_mov_b32 m0, +; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds +define amdgpu_kernel void @ds_ordered_add(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) { + %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 16777217, i1 true, i1 true) + store i32 %val, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}ds_ordered_add_4dw: +; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31 +; GCN-DAG: s_mov_b32 m0, +; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:49924 gds +define amdgpu_kernel void @ds_ordered_add_4dw(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) { + %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 67108865, i1 true, i1 true) + store i32 %val, i32 addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1) -- 2.7.4