From dfc1aee25b68c9819b4a8a868be784110c6e751e Mon Sep 17 00:00:00 2001
From: Martin Storsjo
Date: Mon, 7 Oct 2019 08:21:37 +0000
Subject: [PATCH] Revert "[SLP] avoid reduction transform on patterns that the
 backend can load-combine"

This reverts SVN r373833, as it caused a failed assert "Non-zero loop
cost expected" on building numerous projects, see PR43582 for details
and reproduction samples.

llvm-svn: 373882
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h  |  10 --
 llvm/lib/Analysis/TargetTransformInfo.cpp         |  53 -------
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp   |  15 +-
 .../Transforms/SLPVectorizer/X86/bad-reduction.ll | 156 +++++++--------------
 4 files changed, 55 insertions(+), 179 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 67e62f6..6da2d7f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1129,16 +1129,6 @@ private:
   /// Returns -1 if the cost is unknown.
   int getInstructionThroughput(const Instruction *I) const;
 
-  /// Given an input value that is an element of an 'or' reduction, check if the
-  /// reduction is composed of narrower loaded values. Assuming that a
-  /// legal-sized reduction of shifted/zexted loaded values can be load combined
-  /// in the backend, create a relative cost that accounts for the removal of
-  /// the intermediate ops and replacement by a single wide load.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  /// may not be necessary.
-  Optional<int> getLoadCombineCost(unsigned Opcode,
-                                   ArrayRef<const Value *> Args) const;
-
   /// The abstract base class used to type erase specific TTI
   /// implementations.
   class Concept;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6730aa8..f3d20ce 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -571,64 +571,11 @@ TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
   return OpInfo;
 }
 
-Optional<int>
-TargetTransformInfo::getLoadCombineCost(unsigned Opcode,
-                                        ArrayRef<const Value *> Args) const {
-  if (Opcode != Instruction::Or)
-    return llvm::None;
-  if (Args.empty())
-    return llvm::None;
-
-  // Look past the reduction to find a source value. Arbitrarily follow the
-  // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-constant.
-  const Value *ZextLoad = Args.front();
-  while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-         match(ZextLoad, m_Shl(m_Value(), m_Constant())))
-    ZextLoad = cast<Instruction>(ZextLoad)->getOperand(0);
-
-  // Check if the input to the reduction is an extended load.
-  Value *LoadPtr;
-  if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
-    return llvm::None;
-
-  // Require that the total load bit width is a legal integer type.
-  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
-  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *WideType = ZextLoad->getType();
-  Type *EltType = LoadPtr->getType()->getPointerElementType();
-  unsigned WideWidth = WideType->getIntegerBitWidth();
-  unsigned EltWidth = EltType->getIntegerBitWidth();
-  if (!isTypeLegal(WideType) || WideWidth % EltWidth != 0)
-    return llvm::None;
-
-  // Calculate relative cost: {narrow load+zext+shl+or} are assumed to be
-  // removed and replaced by a single wide load.
-  // FIXME: This is not accurate for the larger pattern where we replace
-  //        multiple narrow load sequences with just 1 wide load. We could
-  //        remove the addition of the wide load cost here and expect the caller
-  //        to make an adjustment for that.
-  int Cost = 0;
-  Cost -= getMemoryOpCost(Instruction::Load, EltType, 0, 0);
-  Cost -= getCastInstrCost(Instruction::ZExt, WideType, EltType);
-  Cost -= getArithmeticInstrCost(Instruction::Shl, WideType);
-  Cost -= getArithmeticInstrCost(Instruction::Or, WideType);
-  Cost += getMemoryOpCost(Instruction::Load, WideType, 0, 0);
-  return Cost;
-}
-
-
 int TargetTransformInfo::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
    OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
     OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) const {
-  // Check if we can match this instruction as part of a larger pattern.
-  Optional<int> LoadCombineCost = getLoadCombineCost(Opcode, Args);
-  if (LoadCombineCost)
-    return LoadCombineCost.getValue();
-
-  // Fallback to implementation-specific overrides or base class.
   int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                              Opd1PropInfo, Opd2PropInfo, Args);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ad12646..99428c6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6499,19 +6499,10 @@ private:
 
     int ScalarReduxCost = 0;
     switch (ReductionData.getKind()) {
-    case RK_Arithmetic: {
-      // Note: Passing in the reduction operands allows the cost model to match
-      // load combining patterns for this reduction.
-      auto *ReduxInst = cast<Instruction>(ReductionRoot);
-      SmallVector<Value *, 4> OperandList;
-      for (Value *Operand : ReduxInst->operands())
-        OperandList.push_back(Operand);
-      ScalarReduxCost = TTI->getArithmeticInstrCost(ReductionData.getOpcode(),
-          ScalarTy, TargetTransformInfo::OK_AnyValue,
-          TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
-          TargetTransformInfo::OP_None, OperandList);
+    case RK_Arithmetic:
+      ScalarReduxCost =
+          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
       break;
-    }
     case RK_Min:
     case RK_Max:
     case RK_UMin:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
index c44a852..e3452e1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -15,37 +15,31 @@ define i64 @load_bswap(%v8i8* %p) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
-; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
-; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
-; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
 ; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
 ; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
 ; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
 ; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
 ; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
 ; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
 ; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
-; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
-; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
-; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
 ; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
 ; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
 ; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
-; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
-; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
-; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
-; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
-; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
-; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
-; CHECK-NEXT: ret i64 [[OR01234567]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
+; CHECK-NEXT: ret i64 [[OP_EXTRA]]
 ;
 %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
 %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -103,38 +97,18 @@ define i64 @load_bswap_nop_shift(%v8i8* %p) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
-; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
-; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
-; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
-; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
-; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
-; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
-; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
-; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
-; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
-; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
-; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
-; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
-; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
-; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
-; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
-; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
-; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
-; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
-; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
-; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
-; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
-; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
-; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
-; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
-; CHECK-NEXT: ret i64 [[OR01234567]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: ret i64 [[TMP5]]
 ;
 %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
 %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -194,36 +168,30 @@ define i64 @load64le(i8* %arg) {
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
 ; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
-; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
-; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
-; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
 ; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
 ; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
 ; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
 ; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
 ; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
 ; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
-; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
-; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
-; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
 ; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
 ; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
 ; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
-; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
-; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
-; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
-; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
-; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
-; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
-; CHECK-NEXT: ret i64 [[O7]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
+; CHECK-NEXT: ret i64 [[OP_EXTRA]]
 ;
 %g1 = getelementptr inbounds i8, i8* %arg, i64 1
 %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -279,38 +247,18 @@ define i64 @load64le_nop_shift(i8* %arg) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
-; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
-; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
-; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
-; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
-; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
-; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
-; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
-; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
-; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
-; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
-; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
-; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
-; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
-; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
-; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
-; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
-; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
-; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
-; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
-; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
-; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
-; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
-; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
-; CHECK-NEXT: ret i64 [[O7]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: ret i64 [[TMP5]]
 ;
 %g1 = getelementptr inbounds i8, i8* %arg, i64 1
 %g2 = getelementptr inbounds i8, i8* %arg, i64 2
-- 
2.7.4
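
Note: for reference, the scalar idiom that the reverted cost model protected from vectorization is byte-wise load combining, as in the test's load64le. The following C sketch is illustrative only (it is not part of the patch, and the function name is made up); once fully unrolled it yields the load+zext+shl+or chain in the test, which the backend can fold into a single 64-bit load:

#include <stdint.h>

/* Bytewise little-endian load: each byte is loaded, zero-extended,
 * shifted into position, and or'ed into the accumulator. The reverted
 * heuristic gave this pattern a low scalar cost so that SLP would leave
 * it for the backend's load combining instead of vectorizing it. */
static uint64_t load64le_sketch(const uint8_t *p) {
  uint64_t v = 0;
  for (int i = 0; i < 8; ++i)
    v |= (uint64_t)p[i] << (8 * i);
  return v;
}

With the revert, SLP again vectorizes this pattern (as the updated CHECK lines show); the load-combine-aware costing was dropped because of the assert failure tracked in PR43582.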