// TODO: This could be better with wider vectors that will be split to v2f16,
// and to consider uses since there aren't that many packed operations.
- if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
+ isTypeLegal(MVT::v2f16)) {
SDLoc SL(N);
SDValue NewElts[2];
SDValue Lo = N0.getOperand(0);
SDValue Hi = N0.getOperand(1);
+ EVT EltVT = Lo.getValueType();
+
if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
for (unsigned I = 0; I != 2; ++I) {
SDValue Op = N0.getOperand(I);
- EVT EltVT = Op.getValueType();
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
CFP->getValueAPF());
} else if (Op.isUndef()) {
- // This would ordinarily be folded to a qNaN. Since this may be half
- // of a packed operation, it may be cheaper to use a 0.
- NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
+ // Handled below based on what the other operand is.
+ NewElts[I] = Op;
} else {
NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
}
}
+ // If one half is undef, and one is constant, prefer a splat vector rather
+ // than the normal qNaN. If it's a register, prefer 0.0 since that's
+ // cheaper to use and may be free with a packed operation.
+ if (NewElts[0].isUndef()) {
+ if (isa<ConstantFPSDNode>(NewElts[1]))
+ NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
+ NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
+ }
+
+ if (NewElts[1].isUndef()) {
+ NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
+ NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
+ }
+
return DAG.getBuildVector(VT, SL, NewElts);
}
}
ret void
}
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+ %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
+ %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
+
+ store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+ %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
+ %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
+
+ store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+ ret void
+}
+
; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
; GCN: v_add_f32_e32 [[A:v[0-9]+]]
; GCN: v_add_f32_e32 [[B:v[0-9]+]]
}
; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_setpc_b64
+; GFX89: s_waitcnt
+; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
%vec = insertelement <2 x half> undef, half %val, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
ret <2 x half> %canonicalized
}
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 1.0
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
+ %vec = insertelement <2 x half> undef, half 1.0, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 1.0
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
+ %vec = insertelement <2 x half> undef, half 1.0, i32 0
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
+ %vec = insertelement <2 x half> undef, half 16.0, i32 1
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+ ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
+ %vec = insertelement <2 x half> undef, half 16.0, i32 0
+ %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+ ret <2 x half> %canonicalized
+}
+
; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
; GFX9: s_waitcnt
; GFX9-DAG: v_max_f16_e32 v0, v0, v0