From 805bc02c2b9500e6ae62dac5a075eb732ac83597 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Mon, 26 Jan 2015 18:42:16 +0000
Subject: [PATCH] Model sqrtsd as a binary operation with one source operand tied to the destination (PR14221)

This patch fixes the following miscompile:

define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
  %0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
  %a0 = extractelement <2 x double> %0, i32 0
  %conv = fptrunc double %a0 to float
  %a1 = extractelement <2 x double> %0, i32 1
  %conv3 = fptrunc double %a1 to float
  tail call void @callee2(float %conv, float %conv3) nounwind
  ret void
}

Current codegen:

sqrtsd %xmm0, %xmm1 ## high element of %xmm1 is undef here
xorps %xmm0, %xmm0
cvtsd2ss %xmm1, %xmm0
shufpd $1, %xmm1, %xmm1
cvtsd2ss %xmm1, %xmm1 ## operating on undef value
jmp _callee

This is a continuation of http://llvm.org/viewvc/llvm-project?view=revision&revision=224624 ( http://reviews.llvm.org/D6330 )
which was itself a continuation of r167064 ( http://llvm.org/viewvc/llvm-project?view=revision&revision=167064 ).

All of these patches are partial fixes for PR14221 ( http://llvm.org/bugs/show_bug.cgi?id=14221 );
this should be the final patch needed to resolve that bug.

Differential Revision: http://reviews.llvm.org/D6885

llvm-svn: 227111
---
 llvm/lib/Target/X86/X86InstrSSE.td          | 31 +++++++++++++++++------------
 llvm/test/CodeGen/X86/sse_partial_update.ll | 23 +++++++++++++++++++++
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index d86e6c2..a1215f9 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -3647,8 +3647,10 @@ let Predicates = [HasAVX] in {
 }
 
 /// sse2_fp_unop_s - SSE2 unops in scalar form.
+// FIXME: Combine the following sse2 classes with the sse1 classes above.
+// The only usage of these is for SQRT[S/P]D. See sse12_fp_binop* for example.
multiclass sse2_fp_unop_s opc, string OpcodeStr, - SDNode OpNode, Intrinsic F64Int, OpndItins itins> { + SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SDr : SDI, XD, Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>; -let isCodeGenOnly = 1 in { - def SDr_Int : SDI, - Sched<[itins.Sched]>; - def SDm_Int : SDI, - Sched<[itins.Sched.Folded]>; -} + let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in { + def SDr_Int : + SDI, Sched<[itins.Sched]>; + + let mayLoad = 1, hasSideEffects = 0 in + def SDm_Int : + SDI, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } // isCodeGenOnly, Constraints } /// sse2_fp_unop_p - SSE2 unops in vector forms. @@ -3732,8 +3736,7 @@ let Predicates = [HasAVX] in { // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, - SSE_SQRTSD>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement @@ -3812,6 +3815,8 @@ let Predicates = [UseSSE1] in { (RCPSSr_Int VR128:$src, VR128:$src)>; def : Pat<(int_x86_sse_sqrt_ss VR128:$src), (SQRTSSr_Int VR128:$src, VR128:$src)>; + def : Pat<(int_x86_sse2_sqrt_sd VR128:$src), + (SQRTSDr_Int VR128:$src, VR128:$src)>; } // There is no f64 version of the reciprocal approximation instructions. 
diff --git a/llvm/test/CodeGen/X86/sse_partial_update.ll b/llvm/test/CodeGen/X86/sse_partial_update.ll
index a1de259..377c3b7 100644
--- a/llvm/test/CodeGen/X86/sse_partial_update.ll
+++ b/llvm/test/CodeGen/X86/sse_partial_update.ll
@@ -67,3 +67,26 @@ entry:
   ret void
 }
 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
+entry:
+; CHECK-LABEL: sqrtsd:
+; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: cvtsd2ss %xmm0
+; CHECK-NEXT: shufpd
+; CHECK-NEXT: cvtsd2ss %xmm0
+; CHECK-NEXT: movap
+; CHECK-NEXT: jmp
+
+  %0 = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
+  %a0 = extractelement <2 x double> %0, i32 0
+  %conv = fptrunc double %a0 to float
+  %a1 = extractelement <2 x double> %0, i32 1
+  %conv3 = fptrunc double %a1 to float
+  tail call void @callee2(float %conv, float %conv3) nounwind
+  ret void
+}
+
+declare void @callee2(float, float)
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
-- 
2.7.4