-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=regbankselect -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=regbankselect -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
; Make sure we don't violate the constant bus restriction
-; FIXME: Make this test isa output when div.fmas works.
-
; Binary fmul of two distinct SGPR inreg inputs. The GFX9 checks copy s3 into
; a VGPR before the VALU multiply, while the GFX10 checks read both SGPRs
; directly in the e64 form.
define amdgpu_ps float @fmul_s_s(float inreg %src0, float inreg %src1) {
- ; GFX9-LABEL: name: fmul_s_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]]
- ; GFX9: $vgpr0 = COPY [[FMUL]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fmul_s_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]]
- ; GFX10: $vgpr0 = COPY [[FMUL]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fmul_s_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fmul_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mul_f32_e64 v0, s2, s3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %result = fmul float %src0, %src1
 ret float %result
}
; fmul with the SAME SGPR used for both operands: a single constant-bus value,
; so both targets' checks read s2 for both sources in the e64 form.
define amdgpu_ps float @fmul_ss(float inreg %src) {
- ; GFX9-LABEL: name: fmul_ss
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY1]], [[COPY2]]
- ; GFX9: $vgpr0 = COPY [[FMUL]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fmul_ss
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY1]], [[COPY2]]
- ; GFX10: $vgpr0 = COPY [[FMUL]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fmul_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mul_f32_e64 v0, s2, s2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fmul_ss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mul_f32_e64 v0, s2, s2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %result = fmul float %src, %src
 ret float %result
}
; Ternary operation with 3 different SGPRs
; Ternary fma with three distinct SGPRs. GFX9 checks move two operands to
; VGPRs; GFX10 checks move only one (s4), keeping s2/s3 as direct SGPR reads.
define amdgpu_ps float @fma_s_s_s(float inreg %src0, float inreg %src1, float inreg %src2) {
- ; GFX9-LABEL: name: fma_s_s_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY3]], [[COPY4]], [[COPY5]]
- ; GFX9: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fma_s_s_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY3]], [[COPY4]], [[COPY5]]
- ; GFX10: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fma_s_s_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_fma_f32 v0, s2, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_s_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_fma_f32 v0, s3, s2, v0
+; GFX10-NEXT: ; return to shader part epilog
 %result = call float @llvm.fma.f32(float %src0, float %src1, float %src2)
 ret float %result
}
; Ternary operation with 3 identical SGPRs
; Ternary fma with one SGPR used for all three operands: a single constant-bus
; value, so both targets' checks read s2 three times with no VGPR copies.
define amdgpu_ps float @fma_sss(float inreg %src) {
- ; GFX9-LABEL: name: fma_sss
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY1]], [[COPY2]], [[COPY3]]
- ; GFX9: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fma_sss
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY1]], [[COPY2]], [[COPY3]]
- ; GFX10: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fma_sss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_fma_f32 v0, s2, s2, s2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_sss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_fma_f32 v0, s2, s2, s2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %result = call float @llvm.fma.f32(float %src, float %src, float %src)
 ret float %result
}
; src0/1 are same SGPR
; fma where src0 and src1 share one SGPR (two distinct constant-bus values
; total). GFX9 checks still copy s3 to a VGPR; GFX10 reads both directly.
define amdgpu_ps float @fma_ss_s(float inreg %src01, float inreg %src2) {
- ; GFX9-LABEL: name: fma_ss_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]]
- ; GFX9: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fma_ss_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]]
- ; GFX10: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fma_ss_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_fma_f32 v0, s2, s2, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_ss_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_fma_f32 v0, s2, s2, s3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %result = call float @llvm.fma.f32(float %src01, float %src01, float %src2)
 ret float %result
}
; src1/2 are same SGPR
; fma where src1 and src2 share one SGPR. GFX9 checks copy the shared s3 to a
; single VGPR used twice; GFX10 reads s2 and s3 directly.
define amdgpu_ps float @fma_s_ss(float inreg %src0, float inreg %src12) {
- ; GFX9-LABEL: name: fma_s_ss
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]]
- ; GFX9: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fma_s_ss
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]]
- ; GFX10: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fma_s_ss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_fma_f32 v0, s2, v0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_s_ss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_fma_f32 v0, s2, s3, s3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %result = call float @llvm.fma.f32(float %src0, float %src12, float %src12)
 ret float %result
}
; src0/2 are same SGPR
; fma where src0 and src2 (the "outer" operands) share one SGPR. GFX9 checks
; copy only the middle operand (s3) to a VGPR; GFX10 reads all three directly.
define amdgpu_ps float @fma_ss_s_same_outer(float inreg %src02, float inreg %src1) {
- ; GFX9-LABEL: name: fma_ss_s_same_outer
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]]
- ; GFX9: $vgpr0 = COPY [[FMA]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fma_ss_s_same_outer
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[FMA:%[0-9]+]]:vgpr(s32) = G_FMA [[COPY2]], [[COPY3]], [[COPY4]]
- ; GFX10: $vgpr0 = COPY [[FMA]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fma_ss_s_same_outer:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_fma_f32 v0, s2, v0, s2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fma_ss_s_same_outer:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_fma_f32 v0, s2, s3, s2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %result = call float @llvm.fma.f32(float %src02, float %src1, float %src02)
 ret float %result
}
; fcmp oeq on two SGPRs feeding a select of 1.0/0.0. GFX9 checks compare via
; vcc with one operand copied to a VGPR; GFX10 compares both SGPRs into s0
; using the e64 form.
define amdgpu_ps float @fcmp_s_s(float inreg %src0, float inreg %src1) {
- ; GFX9-LABEL: name: fcmp_s_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY2]]
- ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; GFX9: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY3]], [[COPY4]]
- ; GFX9: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: fcmp_s_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY2]]
- ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; GFX10: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY3]], [[COPY4]]
- ; GFX10: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: fcmp_s_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: fcmp_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_eq_f32_e64 s0, s2, s3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-NEXT: ; return to shader part epilog
 %cmp = fcmp oeq float %src0, %src1
 %result = select i1 %cmp, float 1.0, float 0.0
 ret float %result
}
; select between two SGPR values under a VGPR-vs-VGPR compare. The cndmask
; consumes vcc plus the SGPR data; GFX9 copies both SGPRs to VGPRs first,
; GFX10's e64 form keeps s2 as a direct operand.
define amdgpu_ps float @select_vcc_s_s(float %cmp0, float %cmp1, float inreg %src0, float inreg %src1) {
- ; GFX9-LABEL: name: select_vcc_s_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]]
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32)
- ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]]
- ; GFX9: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: select_vcc_s_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]]
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
- ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32)
- ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]]
- ; GFX10: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: select_vcc_s_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: select_vcc_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, s2, vcc_lo
+; GFX10-NEXT: ; return to shader part epilog
 %cmp = fcmp oeq float %cmp0, %cmp1
 %result = select i1 %cmp, float %src0, float %src1
 ret float %result
}
; Like select_vcc_s_s but the true operand is an fneg of an SGPR; the checks
; fold the negation as a source modifier (-v2) on v_cndmask_b32_e64.
; NOTE(review): the function body was left unterminated in this chunk (no ret
; or closing brace before the next define at @amdgcn_div_fmas_sss), which is
; invalid IR -- restored the return of the %result that the body computes.
define amdgpu_ps float @select_vcc_fneg_s_s(float %cmp0, float %cmp1, float inreg %src0, float inreg %src1) {
- ; GFX9-LABEL: name: select_vcc_fneg_s_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
- ; GFX9: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX9: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]]
- ; GFX9: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY2]]
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[FNEG]](s32)
- ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32)
- ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]]
- ; GFX9: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: select_vcc_fneg_s_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
- ; GFX10: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; GFX10: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY]](s32), [[COPY1]]
- ; GFX10: [[FNEG:%[0-9]+]]:sgpr(s32) = G_FNEG [[COPY2]]
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[FNEG]](s32)
- ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY3]](s32)
- ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[FCMP]](s1), [[COPY4]], [[COPY5]]
- ; GFX10: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: select_vcc_fneg_s_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -v3, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: select_vcc_fneg_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v0, s3, -v2, vcc_lo
+; GFX10-NEXT: ; return to shader part epilog
 %cmp = fcmp oeq float %cmp0, %cmp1
 %neg.src0 = fneg float %src0
 %result = select i1 %cmp, float %neg.src0, float %src1
 ret float %result
}
; Constant bus used by vcc
; llvm.amdgcn.div.fmas reads VCC implicitly (its i1 operand), so VCC occupies
; a constant-bus slot. The GFX9 checks copy s2 to a VGPR and insert "s_nop 2"
; wait states between the VCC write and the div_fmas read; GFX10 reads s2
; directly for all three sources.
define amdgpu_ps float @amdgcn_div_fmas_sss(float inreg %src, float %cmp.src) {
- ; GFX9-LABEL: name: amdgcn_div_fmas_sss
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $vgpr0
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GFX9: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY1]](s32), [[COPY2]]
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[FCMP]](s1)
- ; GFX9: $vgpr0 = COPY [[INT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: amdgcn_div_fmas_sss
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $vgpr0
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GFX10: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(oeq), [[COPY1]](s32), [[COPY2]]
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[FCMP]](s1)
- ; GFX10: $vgpr0 = COPY [[INT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: amdgcn_div_fmas_sss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_nop 2
+; GFX9-NEXT: v_div_fmas_f32 v0, v0, v0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: amdgcn_div_fmas_sss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX10-NEXT: v_div_fmas_f32 v0, s2, s2, s2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %vcc = fcmp oeq float %cmp.src, 0.0
 %result = call float @llvm.amdgcn.div.fmas.f32(float %src, float %src, float %src, i1 %vcc)
 ret float %result
}
; llvm.amdgcn.class with both operands in SGPRs, feeding a select of 1.0/0.0.
; GFX9 checks move s3 to a VGPR and compare via vcc; GFX10 uses the e64 form
; reading both SGPRs into s0.
define amdgpu_ps float @class_s_s(float inreg %src0, i32 inreg %src1) {
- ; GFX9-LABEL: name: class_s_s
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.class), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX9: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; GFX9: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX9: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; GFX9: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GFX9: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[INT]](s1), [[COPY4]], [[COPY5]]
- ; GFX9: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: class_s_s
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.class), [[COPY2]](s32), [[COPY3]](s32)
- ; GFX10: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
- ; GFX10: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
- ; GFX10: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
- ; GFX10: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[INT]](s1), [[COPY4]], [[COPY5]]
- ; GFX10: $vgpr0 = COPY [[SELECT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: class_s_s:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: class_s_s:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cmp_class_f32_e64 s0, s2, s3
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX10-NEXT: ; return to shader part epilog
 %class = call i1 @llvm.amdgcn.class.f32(float %src0, i32 %src1)
 %result = select i1 %class, float 1.0, float 0.0
 ret float %result
}
; llvm.amdgcn.div.scale with both inputs in SGPRs and the i1 operand true.
; Note s2 appears twice in the check lines (as src0 and the repeated operand);
; GFX9 copies s3 to a VGPR while GFX10 reads all sources directly.
define amdgpu_ps float @div_scale_s_s_true(float inreg %src0, float inreg %src1) {
- ; GFX9-LABEL: name: div_scale_s_s_true
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), -1
- ; GFX9: $vgpr0 = COPY [[INT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: div_scale_s_s_true
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), -1
- ; GFX10: $vgpr0 = COPY [[INT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: div_scale_s_s_true:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], s2, v0, s2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: div_scale_s_s_true:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s3, s2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 true)
 %result = extractvalue { float, i1 } %div.scale, 0
 ret float %result
}
; llvm.amdgcn.div.scale with the i1 operand false; here s3 is the repeated
; source in the checks (GFX9 copies it to a VGPR used twice).
; NOTE(review): removed a stray "ret i32 %r0.val" that appeared after the
; float return in this chunk -- a second return of a mismatched type is
; invalid IR and looks like residue of a dropped sibling function.
define amdgpu_ps float @div_scale_s_s_false(float inreg %src0, float inreg %src1) {
- ; GFX9-LABEL: name: div_scale_s_s_false
- ; GFX9: bb.1 (%ir-block.0):
- ; GFX9: liveins: $sgpr2, $sgpr3
- ; GFX9: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX9: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX9: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX9: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX9: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), 0
- ; GFX9: $vgpr0 = COPY [[INT]](s32)
- ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0
- ; GFX10-LABEL: name: div_scale_s_s_false
- ; GFX10: bb.1 (%ir-block.0):
- ; GFX10: liveins: $sgpr2, $sgpr3
- ; GFX10: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2
- ; GFX10: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
- ; GFX10: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; GFX10: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; GFX10: [[INT:%[0-9]+]]:vgpr(s32), [[INT1:%[0-9]+]]:vcc(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY2]](s32), [[COPY3]](s32), 0
- ; GFX10: $vgpr0 = COPY [[INT]](s32)
- ; GFX10: SI_RETURN_TO_EPILOG implicit $vgpr0
+; GFX9-LABEL: div_scale_s_s_false:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s2
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: div_scale_s_s_false:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s3, s3, s2
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ; return to shader part epilog
 %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 false)
 %result = extractvalue { float, i1 } %div.scale, 0
 ret float %result
}
-; FIXME:
-; define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
-; entry:
-; %xor = xor <2 x i16> %a, %b
-; %r0.val = xor <2 x i16> %xor, <i16 -1, i16 -1>
-; %cast = bitcast <2 x i16> %r0.val to i32
-; ret i32 %cast
-; }
+; FIXME: fails to match
; xnor of <2 x i16>: xor then xor with all-ones, bitcast to i32.
; NOTE(review): the check prefixes here (GFX7/GFX8/GFX900/GFX906) do not match
; this chunk's visible RUN lines (GFX9/GFX10) -- this hunk presumably belongs
; to a different test file (an xnor test) whose RUN lines are outside this
; view; confirm before relying on these prefixes.
+define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
+; GFX7-LABEL: scalar_xnor_v2i16_one_use:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 s4, 0xffff
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, s4
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: s_lshl_b32 s1, s3, 16
+; GFX7-NEXT: s_and_b32 s2, s2, s4
+; GFX7-NEXT: s_or_b32 s1, s1, s2
+; GFX7-NEXT: s_xor_b32 s0, s0, s1
+; GFX7-NEXT: s_xor_b32 s0, s0, -1
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: scalar_xnor_v2i16_one_use:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s2, 0xffff
+; GFX8-NEXT: s_lshr_b32 s5, s0, 16
+; GFX8-NEXT: s_lshr_b32 s6, s1, 16
+; GFX8-NEXT: s_and_b32 s4, s0, s2
+; GFX8-NEXT: s_and_b32 s0, s1, s2
+; GFX8-NEXT: s_and_b32 s5, s5, s2
+; GFX8-NEXT: s_and_b32 s1, s6, s2
+; GFX8-NEXT: s_mov_b32 s3, s2
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_and_b32 s0, s0, s2
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX900-LABEL: scalar_xnor_v2i16_one_use:
+; GFX900: ; %bb.0: ; %entry
+; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1
+; GFX900-NEXT: s_xor_b32 s0, s0, s1
+; GFX900-NEXT: s_xor_b32 s0, s0, s2
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX906-LABEL: scalar_xnor_v2i16_one_use:
+; GFX906: ; %bb.0: ; %entry
+; GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1
+; GFX906-NEXT: s_xor_b32 s0, s0, s1
+; GFX906-NEXT: s_xor_b32 s0, s0, s2
+; GFX906-NEXT: ; return to shader part epilog
+entry:
+ %xor = xor <2 x i16> %a, %b
+ %r0.val = xor <2 x i16> %xor, <i16 -1, i16 -1>
+ %cast = bitcast <2 x i16> %r0.val to i32
+ ret i32 %cast
+}
define amdgpu_ps <2 x i32> @scalar_xnor_i32_mul_use(i32 inreg %a, i32 inreg %b) {
; GCN-LABEL: scalar_xnor_i32_mul_use:
ret i64 %r0.val
}
-; FIXME:
-; define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) {
-; %xor = xor <4 x i16> %a, %b
-; %ret = xor <4 x i16> %xor, <i16 -1, i16 -1, i16 -1, i16 -1>
-; %cast = bitcast <4 x i16> %ret to i64
-; ret i64 %cast
-; }
+; FIXME: fails to match
; xnor of <4 x i16>: xor then xor with all-ones, bitcast to i64.
; NOTE(review): as with scalar_xnor_v2i16_one_use above, the GFX7/GFX8/
; GFX900/GFX906 prefixes do not match this chunk's visible RUN lines
; (GFX9/GFX10); this hunk presumably belongs to a separate xnor test file.
+define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) {
+; GFX7-LABEL: scalar_xnor_v4i16_one_use:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s8, 0xffff
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
+; GFX7-NEXT: s_and_b32 s0, s0, s8
+; GFX7-NEXT: s_or_b32 s0, s1, s0
+; GFX7-NEXT: s_lshl_b32 s1, s3, 16
+; GFX7-NEXT: s_and_b32 s2, s2, s8
+; GFX7-NEXT: s_or_b32 s1, s1, s2
+; GFX7-NEXT: s_and_b32 s3, s4, s8
+; GFX7-NEXT: s_lshl_b32 s2, s5, 16
+; GFX7-NEXT: s_or_b32 s2, s2, s3
+; GFX7-NEXT: s_lshl_b32 s3, s7, 16
+; GFX7-NEXT: s_and_b32 s4, s6, s8
+; GFX7-NEXT: s_or_b32 s3, s3, s4
+; GFX7-NEXT: s_mov_b32 s4, -1
+; GFX7-NEXT: s_mov_b32 s5, s4
+; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: scalar_xnor_v4i16_one_use:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: s_lshr_b32 s5, s0, 16
+; GFX8-NEXT: s_and_b32 s7, s5, s4
+; GFX8-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NEXT: s_and_b32 s6, s0, s4
+; GFX8-NEXT: s_and_b32 s0, s1, s4
+; GFX8-NEXT: s_and_b32 s1, s5, s4
+; GFX8-NEXT: s_lshr_b32 s5, s2, 16
+; GFX8-NEXT: s_and_b32 s8, s2, s4
+; GFX8-NEXT: s_and_b32 s9, s5, s4
+; GFX8-NEXT: s_lshr_b32 s5, s3, 16
+; GFX8-NEXT: s_and_b32 s2, s3, s4
+; GFX8-NEXT: s_and_b32 s3, s5, s4
+; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_mov_b32 s5, s4
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[4:5]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[4:5]
+; GFX8-NEXT: s_and_b32 s1, s2, s4
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_lshl_b32 s1, s7, 16
+; GFX8-NEXT: s_and_b32 s2, s6, s4
+; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX900-LABEL: scalar_xnor_v4i16_one_use:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1
+; GFX900-NEXT: s_mov_b32 s5, s4
+; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX900-NEXT: ; return to shader part epilog
+;
+; GFX906-LABEL: scalar_xnor_v4i16_one_use:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1
+; GFX906-NEXT: s_mov_b32 s5, s4
+; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX906-NEXT: ; return to shader part epilog
+ %xor = xor <4 x i16> %a, %b
+ %ret = xor <4 x i16> %xor, <i16 -1, i16 -1, i16 -1, i16 -1>
+ %cast = bitcast <4 x i16> %ret to i64
+ ret i64 %cast
+}
define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b) {
; GCN-LABEL: scalar_xnor_i64_mul_use: