From 0deab8a54fd8d83853740eb751ddba967ad514f7 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sun, 24 May 2020 14:14:43 +0100
Subject: [PATCH] [LV] Either get invariant condition OR vector condition.

Currently we unconditionally get the first lane of the condition
operand, even if we later use the full vector condition. This can
result in some unnecessary instructions being generated.

Suggested as follow-up in D80219.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 10 ++--
 llvm/test/Transforms/LoopVectorize/X86/optsize.ll  | 60 ++++++++++------------
 .../LoopVectorize/float-minmax-instruction-flag.ll | 21 ++++----
 3 files changed, 43 insertions(+), 48 deletions(-)
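
A minimal IR-level sketch of the effect (hypothetical values %cmp, %a,
%b; the authoritative sequences are in the test updates below). For a
loop-varying condition, the widened select previously came with an
unconditional lane-0 extract that was left dead:

  ; before: lane 0 is extracted even though the select consumes the
  ; full vector condition, leaving a dead instruction
  %cmp = icmp eq <4 x i8> %wide.load, zeroinitializer
  %lane0 = extractelement <4 x i1> %cmp, i32 0   ; unused
  %sel = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b

  ; after: the extract is only emitted for loop-invariant conditions
  %cmp = icmp eq <4 x i8> %wide.load, zeroinitializer
  %sel = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
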
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5fc2f86..6c2a3e4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4422,15 +4422,15 @@ void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
   // loop. This means that we can't just use the original 'cond' value.
   // We have to take the 'vectorized' value and pick the first lane.
   // Instcombine will make this a no-op.
-
-  auto *ScalarCond = State.get(Operands.getOperand(0), {0, 0});
+  auto *InvarCond =
+      InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
 
   for (unsigned Part = 0; Part < UF; ++Part) {
-    Value *Cond = State.get(Operands.getOperand(0), Part);
+    Value *Cond =
+        InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
     Value *Op0 = State.get(Operands.getOperand(1), Part);
     Value *Op1 = State.get(Operands.getOperand(2), Part);
-    Value *Sel =
-        Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
+    Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
     addMetadata(Sel, &I);
   }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
index 214e100..ad72f90 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -27,13 +27,12 @@ define i32 @foo_optsize() #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
-; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -42,8 +41,8 @@ define i32 @foo_optsize() #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
 ; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
@@ -69,13 +68,12 @@ define i32 @foo_optsize() #0 {
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
 ; AUTOVF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef)
 ; AUTOVF-NEXT:    [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AUTOVF-NEXT:    [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0
-; AUTOVF-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8>
-; AUTOVF-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
-; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]])
+; AUTOVF-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8>
+; AUTOVF-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
+; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP2]])
 ; AUTOVF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 32
-; AUTOVF-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
-; AUTOVF-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AUTOVF-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
+; AUTOVF-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; AUTOVF:       middle.block:
 ; AUTOVF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; AUTOVF:       scalar.ph:
@@ -84,8 +82,8 @@ define i32 @foo_optsize() #0 {
 ; AUTOVF:       for.body:
 ; AUTOVF-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; AUTOVF-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; AUTOVF-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
 ; AUTOVF-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
 ; AUTOVF-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; AUTOVF-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
@@ -133,13 +131,12 @@ define i32 @foo_minsize() #1 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8>
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
-; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP6:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> , <64 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -148,8 +145,8 @@ define i32 @foo_minsize() #1 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
 ; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
 ; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
@@ -175,13 +172,12 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
 ; AUTOVF-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef)
 ; AUTOVF-NEXT:    [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
-; AUTOVF-NEXT:    [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0
-; AUTOVF-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8>
-; AUTOVF-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
-; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]])
+; AUTOVF-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8>
+; AUTOVF-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
+; AUTOVF-NEXT:    call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP6]], <32 x i8>* [[TMP7]], i32 1, <32 x i1> [[TMP2]])
 ; AUTOVF-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 32
-; AUTOVF-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
-; AUTOVF-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; AUTOVF-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224
+; AUTOVF-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
 ; AUTOVF:       middle.block:
 ; AUTOVF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; AUTOVF:       scalar.ph:
@@ -190,8 +186,8 @@ define i32 @foo_minsize() #1 {
 ; AUTOVF:       for.body:
 ; AUTOVF-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; AUTOVF-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; AUTOVF-NEXT:    [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AUTOVF-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
 ; AUTOVF-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
 ; AUTOVF-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
 ; AUTOVF-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
index 3f18a97..142e4af 100644
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -55,7 +55,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]]
@@ -63,24 +63,23 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> undef, <4 x i32>
-; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP6]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP6]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP5]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32>
 ; CHECK-NEXT:    [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[RDX_MINMAX_SELECT3:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 65536, 65536
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -93,7 +92,7 @@ define float @minloopattr(float* nocapture readonly %arg) #0 {
 ; CHECK-NEXT:    [[T8:%.*]] = icmp eq i64 [[T7]], 65537
 ; CHECK-NEXT:    br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop !2
 ; CHECK:       out:
-; CHECK-NEXT:    [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[T6_LCSSA]]
 ;
 top:
-- 
2.7.4