From c73df5696696327a15af2f05b30923cd66361ddc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 18 Jul 2020 10:35:40 -0400 Subject: [PATCH] AMDGPU/GlobalISel: Address some test fixmes that don't fail now --- .../AMDGPU/GlobalISel/constant-bus-restriction.ll | 505 +++++++-------------- .../GlobalISel/llvm.amdgcn.ds.ordered.add.ll | 3 +- .../GlobalISel/llvm.amdgcn.ds.ordered.swap.ll | 3 +- .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 50 +- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 140 +++++- 5 files changed, 341 insertions(+), 360 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll index ff0de0d..c815220e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll @@ -1,294 +1,182 @@ -; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=regbankselect -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=regbankselect -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s ; Make sure we don't violate the constant bus restriction -; FIXME: Make this test isa output when div.fmas works. 
- define amdgpu_ps float @fmul_s_s(float inreg %src0, float inreg %src1) { - ; GFX9-LABEL: name: fmul_s_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]] - ; GFX9: $vgpr0 = COPY [[FMUL]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fmul_s_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]] - ; GFX10: $vgpr0 = COPY [[FMUL]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fmul_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fmul_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %result = fmul float %src0, %src1 ret float %result } define amdgpu_ps float @fmul_ss(float inreg %src) { - ; GFX9-LABEL: name: fmul_ss - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY1]], [[COPY2]] - ; GFX9: $vgpr0 = COPY [[FMUL]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fmul_ss - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = 
COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY1]], [[COPY2]] - ; GFX10: $vgpr0 = COPY [[FMUL]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fmul_ss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_f32_e64 v0, s2, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fmul_ss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f32_e64 v0, s2, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %result = fmul float %src, %src ret float %result } ; Ternary operation with 3 different SGPRs define amdgpu_ps float @fma_s_s_s(float inreg %src0, float inreg %src1, float inreg %src2) { - ; GFX9-LABEL: name: fma_s_s_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY3]], [[COPY4]], [[COPY5]] - ; GFX9: $vgpr0 = COPY [[FMA]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fma_s_s_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY3]], [[COPY4]], [[COPY5]] - ; GFX10: $vgpr0 = COPY [[FMA]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: 
fma_s_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_fma_f32 v0, s2, v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fma_s_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_fma_f32 v0, s3, s2, v0 +; GFX10-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src0, float %src1, float %src2) ret float %result } ; Ternary operation with 3 identical SGPRs define amdgpu_ps float @fma_sss(float inreg %src) { - ; GFX9-LABEL: name: fma_sss - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY1]], [[COPY2]], [[COPY3]] - ; GFX9: $vgpr0 = COPY [[FMA]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fma_sss - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY1]], [[COPY2]], [[COPY3]] - ; GFX10: $vgpr0 = COPY [[FMA]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fma_sss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_fma_f32 v0, s2, s2, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fma_sss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_fma_f32 v0, s2, s2, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src, float %src, float %src) ret float %result } ; src0/1 are same SGPR define amdgpu_ps float @fma_ss_s(float inreg 
%src01, float inreg %src2) { - ; GFX9-LABEL: name: fma_ss_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]] - ; GFX9: $vgpr0 = COPY [[FMA]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fma_ss_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]] - ; GFX10: $vgpr0 = COPY [[FMA]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fma_ss_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_fma_f32 v0, s2, s2, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fma_ss_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_fma_f32 v0, s2, s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src01, float %src01, float %src2) ret float %result } ; src1/2 are same SGPR define amdgpu_ps float @fma_s_ss(float inreg %src0, float inreg %src12) { - ; GFX9-LABEL: name: fma_s_ss - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY 
[[COPY1]](s32) - ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]] - ; GFX9: $vgpr0 = COPY [[FMA]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fma_s_ss - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]] - ; GFX10: $vgpr0 = COPY [[FMA]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fma_s_ss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_fma_f32 v0, s2, v0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fma_s_ss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_fma_f32 v0, s2, s3, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src0, float %src12, float %src12) ret float %result } ; src0/2 are same SGPR define amdgpu_ps float @fma_ss_s_same_outer(float inreg %src02, float inreg %src1) { - ; GFX9-LABEL: name: fma_ss_s_same_outer - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]] - ; GFX9: $vgpr0 = COPY [[FMA]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fma_ss_s_same_outer - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: 
[[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]] - ; GFX10: $vgpr0 = COPY [[FMA]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fma_ss_s_same_outer: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_fma_f32 v0, s2, v0, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fma_ss_s_same_outer: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_fma_f32 v0, s2, s3, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src02, float %src1, float %src02) ret float %result } define amdgpu_ps float @fcmp_s_s(float inreg %src0, float inreg %src1) { - ; GFX9-LABEL: name: fcmp_s_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY2]] - ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX9: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY3]], [[COPY4]] - ; GFX9: $vgpr0 = COPY [[SELECT]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: fcmp_s_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), 
[[COPY]](s32), [[COPY2]] - ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX10: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY3]], [[COPY4]] - ; GFX10: $vgpr0 = COPY [[SELECT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: fcmp_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: fcmp_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_eq_f32_e64 s0, s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-NEXT: ; return to shader part epilog %cmp = fcmp oeq float %src0, %src1 %result = select i1 %cmp, float 1.0, float 0.0 ret float %result } define amdgpu_ps float @select_vcc_s_s(float %cmp0, float %cmp1, float inreg %src0, float inreg %src1) { - ; GFX9-LABEL: name: select_vcc_s_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) - ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] - ; GFX9: $vgpr0 = COPY [[SELECT]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: select_vcc_s_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: 
[[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32) - ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) - ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] - ; GFX10: $vgpr0 = COPY [[SELECT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: select_vcc_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: select_vcc_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, s2, vcc_lo +; GFX10-NEXT: ; return to shader part epilog %cmp = fcmp oeq float %cmp0, %cmp1 %result = select i1 %cmp, float %src0, float %src1 ret float %result } define amdgpu_ps float @select_vcc_fneg_s_s(float %cmp0, float %cmp1, float inreg %src0, float inreg %src1) { - ; GFX9-LABEL: name: select_vcc_fneg_s_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 - ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] - ; GFX9: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY2]] - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[FNEG]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) - ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] - ; GFX9: $vgpr0 = COPY [[SELECT]](s32) - ; GFX9: 
SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: select_vcc_fneg_s_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 - ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]] - ; GFX10: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY2]] - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[FNEG]](s32) - ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32) - ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]] - ; GFX10: $vgpr0 = COPY [[SELECT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: select_vcc_fneg_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -v3, vcc +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: select_vcc_fneg_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v0, s3, -v2, vcc_lo +; GFX10-NEXT: ; return to shader part epilog %cmp = fcmp oeq float %cmp0, %cmp1 %neg.src0 = fneg float %src0 %result = select i1 %cmp, float %neg.src0, float %src1 @@ -297,122 +185,73 @@ define amdgpu_ps float @select_vcc_fneg_s_s(float %cmp0, float %cmp1, float inre ; Constant bus used by vcc define amdgpu_ps float @amdgcn_div_fmas_sss(float inreg %src, float %cmp.src) { - ; GFX9-LABEL: name: amdgcn_div_fmas_sss - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $vgpr0 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = 
COPY [[C]](s32) - ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY1]](s32), [[COPY2]] - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[FCMP]](s1) - ; GFX9: $vgpr0 = COPY [[INT]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: amdgcn_div_fmas_sss - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $vgpr0 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY1]](s32), [[COPY2]] - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[FCMP]](s1) - ; GFX10: $vgpr0 = COPY [[INT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: amdgcn_div_fmas_sss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: v_div_fmas_f32 v0, v0, v0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: amdgcn_div_fmas_sss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_div_fmas_f32 v0, s2, s2, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %vcc = fcmp oeq float %cmp.src, 0.0 %result = call float @llvm.amdgcn.div.fmas.f32(float %src, float %src, float %src, i1 %vcc) ret float %result } define amdgpu_ps float 
@class_s_s(float inreg %src0, i32 inreg %src1) { - ; GFX9-LABEL: name: class_s_s - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.class), [[COPY2]](s32), [[COPY3]](s32) - ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX9: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[INT]](s1), [[COPY4]], [[COPY5]] - ; GFX9: $vgpr0 = COPY [[SELECT]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: class_s_s - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.class), [[COPY2]](s32), [[COPY3]](s32) - ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00 - ; GFX10: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) - ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) - ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[INT]](s1), [[COPY4]], [[COPY5]] - ; GFX10: $vgpr0 = COPY [[SELECT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: class_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: 
class_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_class_f32_e64 s0, s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-NEXT: ; return to shader part epilog %class = call i1 @llvm.amdgcn.class.f32(float %src0, i32 %src1) %result = select i1 %class, float 1.0, float 0.0 ret float %result } define amdgpu_ps float @div_scale_s_s_true(float inreg %src0, float inreg %src1) { - ; GFX9-LABEL: name: div_scale_s_s_true - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), -1 - ; GFX9: $vgpr0 = COPY [[INT]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: div_scale_s_s_true - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), -1 - ; GFX10: $vgpr0 = COPY [[INT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: div_scale_s_s_true: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], s2, v0, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: div_scale_s_s_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s3, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, 
float %src1, i1 true) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result } define amdgpu_ps float @div_scale_s_s_false(float inreg %src0, float inreg %src1) { - ; GFX9-LABEL: name: div_scale_s_s_false - ; GFX9: bb.1 (%ir-block.0): - ; GFX9: liveins: $sgpr2, $sgpr3 - ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX9: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), 0 - ; GFX9: $vgpr0 = COPY [[INT]](s32) - ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 - ; GFX10-LABEL: name: div_scale_s_s_false - ; GFX10: bb.1 (%ir-block.0): - ; GFX10: liveins: $sgpr2, $sgpr3 - ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 - ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3 - ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) - ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) - ; GFX10: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), 0 - ; GFX10: $vgpr0 = COPY [[INT]](s32) - ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0 +; GFX9-LABEL: div_scale_s_s_false: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: div_scale_s_s_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_div_scale_f32 v0, s0, s3, s3, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 false) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.add.ll index 
8cba08f..4193d97 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.add.ll @@ -1,5 +1,4 @@ -; FIXME: Broken SI run line -; XUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.add.ll | FileCheck -check-prefixes=GCN,FUNC %S/../llvm.amdgcn.ds.ordered.add.ll +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.add.ll | FileCheck -check-prefixes=GCN,FUNC %S/../llvm.amdgcn.ds.ordered.add.ll ; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.add.ll | FileCheck -check-prefixes=GCN,FUNC %S/../llvm.amdgcn.ds.ordered.add.ll ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.add.ll | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %S/../llvm.amdgcn.ds.ordered.add.ll ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.add.ll | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %S/../llvm.amdgcn.ds.ordered.add.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.swap.ll index 28c2c7a..e2c3b62 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.ordered.swap.ll @@ -1,5 +1,4 @@ -; FIXME: Broken SI run line -; XUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.swap.ll | FileCheck -check-prefixes=GCN,FUNC %S/../llvm.amdgcn.ds.ordered.swap.ll +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.swap.ll | FileCheck -check-prefixes=GCN,FUNC %S/../llvm.amdgcn.ds.ordered.swap.ll ; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.swap.ll | FileCheck 
-check-prefixes=GCN,FUNC %S/../llvm.amdgcn.ds.ordered.swap.ll ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.swap.ll | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %S/../llvm.amdgcn.ds.ordered.swap.ll ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.ds.ordered.swap.ll | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %S/../llvm.amdgcn.ds.ordered.swap.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 5389adf..7d116f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -687,14 +687,48 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrs ret void } -; FIXME -; define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { -; %src = load i32, i32 addrspace(1)* %in, align 4 -; %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) -; %div = sdiv i32 %bfe, 2 -; store i32 %div, i32 addrspace(1)* %out, align 4 -; ret void -; } +define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; GFX6-LABEL: simplify_demanded_bfe_sdiv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v1, -2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s2, s2, 0x100001 +; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_add_i32 s2, s2, s3 +; 
GFX6-NEXT: s_xor_b32 s2, s2, s3 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_subrev_i32_e64 v2, s[0:1], 2, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + %src = load i32, i32 addrspace(1)* %in, align 4 + %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) + %div = sdiv i32 %bfe, 2 + store i32 %div, i32 addrspace(1)* %out, align 4 + ret void +} define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_0_width: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index a8631a1..2512aaa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -15,14 +15,58 @@ entry: ret i32 %r0.val } -; FIXME: -; define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) { -; entry: -; %xor = xor <2 x i16> %a, %b -; %r0.val = xor <2 x i16> %xor, <i16 -1, i16 -1> -; %cast = bitcast <2 x i16> %r0.val to i32 -; ret i32 %cast -; } +; FIXME: fails to match +define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) { +; GFX7-LABEL: scalar_xnor_v2i16_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b32 s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NEXT: s_lshl_b32 s1, s3, 16 +; GFX7-NEXT: s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_or_b32 s1, s1, s2 
+; GFX7-NEXT: s_xor_b32 s0, s0, s1 +; GFX7-NEXT: s_xor_b32 s0, s0, -1 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: scalar_xnor_v2i16_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s5, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s1, 16 +; GFX8-NEXT: s_and_b32 s4, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s1, s2 +; GFX8-NEXT: s_and_b32 s5, s5, s2 +; GFX8-NEXT: s_and_b32 s1, s6, s2 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX900-LABEL: scalar_xnor_v2i16_one_use: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1 +; GFX900-NEXT: s_xor_b32 s0, s0, s1 +; GFX900-NEXT: s_xor_b32 s0, s0, s2 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX906-LABEL: scalar_xnor_v2i16_one_use: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1 +; GFX906-NEXT: s_xor_b32 s0, s0, s1 +; GFX906-NEXT: s_xor_b32 s0, s0, s2 +; GFX906-NEXT: ; return to shader part epilog +entry: + %xor = xor <2 x i16> %a, %b + %r0.val = xor <2 x i16> %xor, <i16 -1, i16 -1> + %cast = bitcast <2 x i16> %r0.val to i32 + ret i32 %cast +} define amdgpu_ps <2 x i32> @scalar_xnor_i32_mul_use(i32 inreg %a, i32 inreg %b) { ; GCN-LABEL: scalar_xnor_i32_mul_use: @@ -51,13 +95,79 @@ define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) { ret i64 %r0.val } -; FIXME: -; define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) { -; %xor = xor <4 x i16> %a, %b -; %ret = xor <4 x i16> %xor, <i16 -1, i16 -1, i16 -1, i16 -1> -; %cast = bitcast <4 x i16> %ret to i64 -; ret i64 %cast -; } +; FIXME: fails to match +define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) { +; GFX7-LABEL: scalar_xnor_v4i16_one_use: +; GFX7: 
; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, s8 +; GFX7-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NEXT: s_lshl_b32 s1, s3, 16 +; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s3, s4, s8 +; GFX7-NEXT: s_lshl_b32 s2, s5, 16 +; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: s_lshl_b32 s3, s7, 16 +; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_or_b32 s3, s3, s4 +; GFX7-NEXT: s_mov_b32 s4, -1 +; GFX7-NEXT: s_mov_b32 s5, s4 +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: scalar_xnor_v4i16_one_use: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: s_lshr_b32 s5, s0, 16 +; GFX8-NEXT: s_and_b32 s7, s5, s4 +; GFX8-NEXT: s_lshr_b32 s5, s1, 16 +; GFX8-NEXT: s_and_b32 s6, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s1, s4 +; GFX8-NEXT: s_and_b32 s1, s5, s4 +; GFX8-NEXT: s_lshr_b32 s5, s2, 16 +; GFX8-NEXT: s_and_b32 s8, s2, s4 +; GFX8-NEXT: s_and_b32 s9, s5, s4 +; GFX8-NEXT: s_lshr_b32 s5, s3, 16 +; GFX8-NEXT: s_and_b32 s2, s3, s4 +; GFX8-NEXT: s_and_b32 s3, s5, s4 +; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_mov_b32 s5, s4 +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5] +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s0, s3, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s1, s7, 16 +; GFX8-NEXT: s_and_b32 s2, s6, s4 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX900-LABEL: scalar_xnor_v4i16_one_use: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX900-NEXT: s_mov_b32 s5, s4 +; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] 
+; GFX900-NEXT: ; return to shader part epilog +; +; GFX906-LABEL: scalar_xnor_v4i16_one_use: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX906-NEXT: s_mov_b32 s5, s4 +; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX906-NEXT: ; return to shader part epilog + %xor = xor <4 x i16> %a, %b + %ret = xor <4 x i16> %xor, <i16 -1, i16 -1, i16 -1, i16 -1> + %cast = bitcast <4 x i16> %ret to i64 + ret i64 %cast +} define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) { ; GCN-LABEL: scalar_xnor_i64_mul_use: -- 2.7.4