>;
} // End isCommutable = 1
+let isCommutable = 1 in {
+
defm V_MAC_LEGACY_F32 : VOP2Inst <vop2<0x6>, "v_mac_legacy_f32",
VOP_F32_F32_F32
>;
-let isCommutable = 1 in {
-
defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7>, "v_mul_legacy_f32",
VOP_F32_F32_F32, int_AMDGPU_mul
>;
VOP_F32_F32_F32, fmul
>;
-
defm V_MUL_I32_I24 : VOP2Inst <vop2<0x9>, "v_mul_i32_i24",
VOP_I32_I32_I32, AMDGPUmul_i24
>;
defm V_BFM_B32 : VOP2Inst <vop2<0x1e>, "v_bfm_b32",
VOP_I32_I32_I32, AMDGPUbfm>;
+
+let isCommutable = 1 in {
defm V_MAC_F32 : VOP2Inst <vop2<0x1f>, "v_mac_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
defm V_MADMK_F32 : VOP2Inst <vop2<0x20>, "v_madmk_f32", VOP_F32_F32_F32>;
+
+let isCommutable = 1 in {
defm V_MADAK_F32 : VOP2Inst <vop2<0x21>, "v_madak_f32", VOP_F32_F32_F32>;
+} // End isCommutable = 1
+
+
defm V_BCNT_U32_B32 : VOP2Inst <vop2<0x22>, "v_bcnt_u32_b32", VOP_I32_I32_I32>;
defm V_MBCNT_LO_U32_B32 : VOP2Inst <vop2<0x23>, "v_mbcnt_lo_u32_b32",
+
VOP_I32_I32_I32
>;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <vop2<0x24>, "v_mbcnt_hi_u32_b32",
// VOP3 Instructions
//===----------------------------------------------------------------------===//
+let isCommutable = 1 in {
defm V_MAD_LEGACY_F32 : VOP3Inst <vop3<0x140>, "v_mad_legacy_f32",
VOP_F32_F32_F32_F32
>;
+
defm V_MAD_F32 : VOP3Inst <vop3<0x141>, "v_mad_f32",
VOP_F32_F32_F32_F32, fmad
>;
+
defm V_MAD_I32_I24 : VOP3Inst <vop3<0x142>, "v_mad_i32_i24",
VOP_I32_I32_I32_I32, AMDGPUmad_i24
>;
defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143>, "v_mad_u32_u24",
VOP_I32_I32_I32_I32, AMDGPUmad_u24
>;
+} // End isCommutable = 1
defm V_CUBEID_F32 : VOP3Inst <vop3<0x144>, "v_cubeid_f32",
VOP_F32_F32_F32_F32
defm V_BFI_B32 : VOP3Inst <vop3<0x14a>, "v_bfi_b32",
VOP_I32_I32_I32_I32, AMDGPUbfi
>;
+
+let isCommutable = 1 in {
defm V_FMA_F32 : VOP3Inst <vop3<0x14b>, "v_fma_f32",
VOP_F32_F32_F32_F32, fma
>;
defm V_FMA_F64 : VOP3Inst <vop3<0x14c>, "v_fma_f64",
VOP_F64_F64_F64_F64, fma
>;
+} // End isCommutable = 1
+
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>;
defm V_ALIGNBIT_B32 : VOP3Inst <vop3<0x14e>, "v_alignbit_b32",
VOP_I32_I32_I32_I32
// Double precision division pre-scale.
defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e>, "v_div_scale_f64", []>;
+let isCommutable = 1 in {
defm V_DIV_FMAS_F32 : VOP3Inst <vop3<0x16f>, "v_div_fmas_f32",
VOP_F32_F32_F32_F32, AMDGPUdiv_fmas
>;
defm V_DIV_FMAS_F64 : VOP3Inst <vop3<0x170>, "v_div_fmas_f64",
VOP_F64_F64_F64_F64, AMDGPUdiv_fmas
>;
+} // End isCommutable = 1
+
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "v_mqsad_u8", []>;
+
defm V_TRIG_PREOP_F64 : VOP3Inst <
vop3<0x174>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop
>;
defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8",
VOP_I32_I32_I32
>;
+
+let isCommutable = 1 in {
defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32",
VOP_I64_I32_I32_I64
>;
defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32",
VOP_I64_I32_I32_I64
>;
+} // End isCommutable = 1
// Remaining instructions:
// FLAT_*
ret void
}
+; FIXME: Should use SGPR for literal.
+; FUNC-LABEL: @commute_add_lit_fabs_f32
+; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %x = load float addrspace(1)* %gep.0
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %z = fadd float 1024.0, %x.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_add_fabs_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %z = fadd float %x, %y.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %y.fneg = fsub float -0.000000e+00, %y
+ %z = fmul float %x, %y.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_fneg_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+ %z = fmul float %x, %y.fabs.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; There's no reason to commute this.
+; FUNC-LABEL: @commute_mul_fabs_x_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %z = fmul float %x.fabs, %y.fabs
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @commute_mul_fabs_x_fneg_fabs_y_f32
+; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
+; SI-NEXT: buffer_store_dword [[REG]]
+define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.r600.read.tidig.x() #1
+ %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
+ %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1
+ %x = load float addrspace(1)* %gep.0
+ %y = load float addrspace(1)* %gep.1
+ %x.fabs = call float @llvm.fabs.f32(float %x) #1
+ %y.fabs = call float @llvm.fabs.f32(float %y) #1
+ %y.fabs.fneg = fsub float -0.000000e+00, %y.fabs
+ %z = fmul float %x.fabs, %y.fabs.fneg
+ store float %z, float addrspace(1)* %out
+ ret void
+}
+
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
+
; FUNC-LABEL: {{^}}fma_f32:
; SI: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; EG: FMA {{\*? *}}[[RES]]
define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
float addrspace(1)* %in2, float addrspace(1)* %in3) {
- %r0 = load float addrspace(1)* %in1
- %r1 = load float addrspace(1)* %in2
- %r2 = load float addrspace(1)* %in3
- %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
- store float %r3, float addrspace(1)* %out
- ret void
+ %r0 = load float addrspace(1)* %in1
+ %r1 = load float addrspace(1)* %in2
+ %r2 = load float addrspace(1)* %in3
+ %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %out
+ ret void
}
; FUNC-LABEL: {{^}}fma_v2f32:
; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
- %r0 = load <2 x float> addrspace(1)* %in1
- %r1 = load <2 x float> addrspace(1)* %in2
- %r2 = load <2 x float> addrspace(1)* %in3
- %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
- store <2 x float> %r3, <2 x float> addrspace(1)* %out
- ret void
+ %r0 = load <2 x float> addrspace(1)* %in1
+ %r1 = load <2 x float> addrspace(1)* %in2
+ %r2 = load <2 x float> addrspace(1)* %in3
+ %r3 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2)
+ store <2 x float> %r3, <2 x float> addrspace(1)* %out
+ ret void
}
; FUNC-LABEL: {{^}}fma_v4f32:
; EG-DAG: FMA {{\*? *}}[[RES]].W
define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
<4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
- %r0 = load <4 x float> addrspace(1)* %in1
- %r1 = load <4 x float> addrspace(1)* %in2
- %r2 = load <4 x float> addrspace(1)* %in3
- %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
- store <4 x float> %r3, <4 x float> addrspace(1)* %out
- ret void
+ %r0 = load <4 x float> addrspace(1)* %in1
+ %r1 = load <4 x float> addrspace(1)* %in2
+ %r2 = load <4 x float> addrspace(1)* %in3
+ %r3 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %r0, <4 x float> %r1, <4 x float> %r2)
+ store <4 x float> %r3, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
+; SI: v_fma_f32 {{v[0-9]+}}, 2.0, {{v[0-9]+}}, {{v[0-9]+}}
+define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %in.a.gep, align 4
+ %b = load float addrspace(1)* %in.b.gep, align 4
+
+ %fma = call float @llvm.fma.f32(float %a, float 2.0, float %b)
+ store float %fma, float addrspace(1)* %out.gep, align 4
+ ret void
+}
+
+; FUNC-LABEL: @fma_commute_mul_s_f32
+define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+ %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %in.a.gep = getelementptr float addrspace(1)* %in.a, i32 %tid
+ %in.b.gep = getelementptr float addrspace(1)* %in.b, i32 %tid
+ %out.gep = getelementptr float addrspace(1)* %out, i32 %tid
+
+ %a = load float addrspace(1)* %in.a.gep, align 4
+ %c = load float addrspace(1)* %in.b.gep, align 4
+
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ store float %fma, float addrspace(1)* %out.gep, align 4
+ ret void
}
; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
; CHECK: buffer_store_dword [[RESULT]]
define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32
; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
-; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]]
; CHECK: buffer_store_dword [[RESULT]]
define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
; XUN: llc -march=r600 -mcpu=rv770 -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.umad24(i32, i32, i32) nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; FUNC-LABEL: {{^}}test_umad24:
; SI: v_mad_u32_u24
ret void
}
+; FUNC-LABEL: {{^}}commute_umad24:
+; SI-DAG: buffer_load_dword [[SRC0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DAG: buffer_load_dword [[SRC2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:0x4
+; SI: v_mad_u32_u24 [[RESULT:v[0-9]+]], 4, [[SRC0]], [[SRC2]]
+; SI: buffer_store_dword [[RESULT]]
+define void @commute_umad24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+ %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
+ %out.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %src0.gep = getelementptr i32 addrspace(1)* %out, i32 %tid
+ %src2.gep = getelementptr i32 addrspace(1)* %src0.gep, i32 1
+
+ %src0 = load i32 addrspace(1)* %src0.gep, align 4
+ %src2 = load i32 addrspace(1)* %src2.gep, align 4
+ %mad = call i32 @llvm.AMDGPU.umad24(i32 %src0, i32 4, i32 %src2) nounwind readnone
+ store i32 %mad, i32 addrspace(1)* %out.gep, align 4
+ ret void
+}
+
; SI-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_imm_a:
; SI: s_load_dword [[SGPR:s[0-9]+]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
; SI: buffer_store_dword [[RESULT]]
define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
%fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1