From b27a538dda4caf2752ed8c6c731361ddf9458b87 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 16 May 2020 13:48:55 -0400 Subject: [PATCH] AMDGPU: Fix illegally constant folding from V_MOV_B32_sdwa This was assumed to be a simple move, and the immediate modifier operand was interpreted as a materialized immediate. Apparently the SDWA pass never produces these, but GlobalISel does emit these for some vector shuffles. --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 +++++--- .../AMDGPU/atomic_optimizations_local_pointer.ll | 22 ++++++++-------- .../AMDGPU/atomic_optimizations_pixelshader.ll | 4 +-- .../CodeGen/AMDGPU/constant-fold-imm-immreg.mir | 30 ++++++++++++++++++++++ 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index b1db1c0..0c2b5fb 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -100,8 +100,13 @@ class getVOP1Pat64 : LetDummies { multiclass VOP1Inst { - def _e32 : VOP1_Pseudo ; - def _e64 : VOP3_Pseudo .ret>; + // We only want to set this on the basic, non-SDWA or DPP forms. 
+ defvar should_mov_imm = !eq(opName, "v_mov_b32"); + + let isMoveImm = should_mov_imm in { + def _e32 : VOP1_Pseudo ; + def _e64 : VOP3_Pseudo .ret>; + } foreach _ = BoolToList.ret in def _sdwa : VOP1_SDWA_Pseudo ; @@ -144,7 +149,7 @@ let VOPAsmPrefer32Bit = 1 in { defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>; } -let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>; } // End isMoveImm = 1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 43c8653..279d1a9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -496,8 +496,8 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -735,8 +735,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -974,8 +974,8 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 
addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -2049,8 +2049,8 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -2784,8 +2784,8 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_and_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3022,8 +3022,8 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_or_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; 
GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3261,8 +3261,8 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_xor_b32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3497,8 +3497,8 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -3917,8 +3917,8 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -4340,8 +4340,8 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_max_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] 
row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 @@ -4758,8 +4758,8 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s2 ; GFX1064-NEXT: v_min_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s3, v2, 31 ; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 ; GFX1064-NEXT: v_writelane_b32 v1, s2, 16 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index a90f0a6..a409457 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -347,8 +347,8 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX1064-NEXT: v_readlane_b32 s12, v2, 31 ; GFX1064-NEXT: v_mov_b32_e32 v3, s12 ; GFX1064-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s12, v2, 15 ; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s12, v2, 15 ; GFX1064-NEXT: v_readlane_b32 s13, v2, 31 ; GFX1064-NEXT: v_writelane_b32 v1, s12, 16 ; GFX1064-NEXT: v_readlane_b32 s12, v2, 63 @@ -406,8 +406,8 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in ; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1032-NEXT: v_readlane_b32 
s10, v2, 31 -; GFX1032-NEXT: v_readlane_b32 s11, v2, 15 ; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s11, v2, 15 ; GFX1032-NEXT: v_writelane_b32 v1, s11, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s9 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index e879b1f..5baf4ac 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -965,3 +965,33 @@ body: | S_ENDPGM 0, implicit %3 ... + +# This used to incorrectly interpret V_MOV_B32_sdwa as being a move +# immediate, and interpreting the src0_modifiers field as a +# materialized immediate. + +--- +# GCN-LABEL: name: no_fold_sdwa_mov_imm +# GCN: %2:vgpr_32 = V_MOV_B32_sdwa 0, %0, 0, 5, 2, 4, implicit $exec, implicit %0(tied-def 0) +# GCN-NEXT: [[SHIFT:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec +# GCN-NEXT: S_ENDPGM 0, implicit [[SHIFT]] + +name: no_fold_sdwa_mov_imm +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + %2:vgpr_32 = V_MOV_B32_sdwa 0, %0:vgpr_32, 0, 5, 2, 4, implicit $exec, implicit %0:vgpr_32(tied-def 0) + %3:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + %4:vgpr_32 = V_LSHRREV_B32_e64 %3:vgpr_32, %2:vgpr_32, implicit $exec + S_ENDPGM 0, implicit %4 + +... -- 2.7.4