From 16a72a0f87487f2a07bb2a4101c79e4d311151a0 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 3 Dec 2022 16:08:58 +0000
Subject: [PATCH] [AArch64] Enable the select optimize pass for AArch64

This enables the select optimize pass for out-of-order AArch64 cores. It
is trying to solve a problem that is difficult for the compiler to fix.
The criteria for when a csel is better or worse than a branch depend
heavily on whether the branch is well predicted and on the amount of ILP
in the loop (as well as other criteria like the core in question and the
relative performance of the branch predictor). The pass seems to do a
decent job though, with the inner-loop heuristics being well implemented
and doing a better job than I had expected in general, even without PGO
information.

I've been doing quite a bit of benchmarking. The headline numbers are
these for SPEC2017 on a Neoverse N1:
500.perlbench_r   -0.12%
502.gcc_r          0.02%
505.mcf_r          6.02%
520.omnetpp_r      0.32%
523.xalancbmk_r    0.20%
525.x264_r         0.02%
531.deepsjeng_r    0.00%
541.leela_r       -0.09%
548.exchange2_r    0.00%
557.xz_r          -0.20%

Running benchmarks with a combination of the llvm-test-suite plus several
versions of SPEC gave between a 0.2% and 0.4% geomean improvement
depending on the core/run. The instruction count went down by 0.1% too,
which is a good sign, but the results can be a little noisy. Some issues
from other benchmarks I had run were improved in
rGca78b5601466f8515f5f958ef8e63d787d9d812e. In summary, well-predicted
branches will see an improvement, badly predicted branches may get worse,
and on average performance seems to be a little better overall.

This patch enables the pass for AArch64 under -O3 for cores that will
benefit from it, i.e. not for in-order cores, which do not fit the
"Assume infinite resources that allow to fully exploit the available
instruction-level parallelism" cost model that the pass uses. It uses a
subtarget feature to specify when the pass should be enabled, and that
feature is also enabled under cpu=generic, as the performance increases
for out-of-order cores seem larger than any decreases for in-order cores,
which were minor.

Differential Revision: https://reviews.llvm.org/D138990
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h  |   7 +
 .../llvm/Analysis/TargetTransformInfoImpl.h       |   2 +
 llvm/lib/Analysis/TargetTransformInfo.cpp         |   4 +
 llvm/lib/CodeGen/SelectOptimize.cpp               |   4 +
 llvm/lib/Target/AArch64/AArch64.td                |  62 +++--
 llvm/lib/Target/AArch64/AArch64TargetMachine.cpp  |   8 +
 .../Target/AArch64/AArch64TargetTransformInfo.h   |   2 +
 llvm/test/CodeGen/AArch64/O3-pipeline.ll          |   4 +
 llvm/test/CodeGen/AArch64/selectopt.ll            | 263 +++++++++++++++++++++
 9 files changed, 337 insertions(+), 19 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/selectopt.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5222309..1da65da 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -805,6 +805,9 @@ public:
   MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                bool IsZeroCmp) const;
 
+  /// Should the Select Optimization pass be enabled and ran.
+  bool enableSelectOptimize() const;
+
   /// Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const; @@ -1683,6 +1686,7 @@ public: virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; + virtual bool enableSelectOptimize() = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; @@ -2173,6 +2177,9 @@ public: bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } + bool enableSelectOptimize() override { + return Impl.enableSelectOptimize(); + } bool enableMaskedInterleavedAccessVectorization() override { return Impl.enableMaskedInterleavedAccessVectorization(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index ec98bc4..2031f00 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -359,6 +359,8 @@ public: return {}; } + bool enableSelectOptimize() const { return true; } + bool enableInterleavedAccessVectorization() const { return false; } bool enableMaskedInterleavedAccessVectorization() const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c66e03a..7459ce1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -547,6 +547,10 @@ TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp); } +bool TargetTransformInfo::enableSelectOptimize() const { + return TTIImpl->enableSelectOptimize(); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index e50d0b4..ad73e76 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -243,6 +243,10 @@ bool SelectOptimize::runOnFunction(Function &F) { return false; TTI = &getAnalysis().getTTI(F); + + if (!TTI->enableSelectOptimize()) + return false; + DT = &getAnalysis().getDomTree(); LI = &getAnalysis().getLoopInfo(); BPI.reset(new BranchProbabilityInfo(F, *LI)); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index a9c2509..2d47be2 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -209,6 +209,10 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature< "predictable-select-expensive", "PredictableSelectIsExpensive", "true", "Prefer likely predicted branches over selects">; +def FeatureEnableSelectOptimize : SubtargetFeature< + "enable-select-opt", "EnableSelectOptimize", "true", + "Enable the select optimize pass for select loop heuristics">; + def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", "HasCustomCheapAsMoveHandling", "true", "Use custom handling of cheap instructions">; @@ -743,6 +747,7 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", FeatureFuseAdrpAdd, FeatureFuseLiterals, FeaturePostRAScheduler, + FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive]>; def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", @@ -750,36 +755,42 @@ def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", 
FeatureFuseAES, FeatureFuseAddress, FeatureFuseAdrpAdd, - FeatureFuseLiterals]>; + FeatureFuseLiterals, + FeatureEnableSelectOptimize]>; def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureFuseLiterals]>; + FeatureFuseLiterals, + FeatureEnableSelectOptimize]>; def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", [ FeatureFuseAES, - FeatureFuseAdrpAdd]>; + FeatureFuseAdrpAdd, + FeatureEnableSelectOptimize]>; def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", [ FeatureFuseAES, - FeatureFuseAdrpAdd]>; + FeatureFuseAdrpAdd, + FeatureEnableSelectOptimize]>; def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast]>; + FeatureLSLFast, + FeatureEnableSelectOptimize]>; def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", [ FeatureCmpBccFusion, FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureLSLFast]>; + FeatureLSLFast, + FeatureEnableSelectOptimize]>; def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", [ @@ -787,7 +798,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", "CortexA78C", @@ -796,7 +808,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", "Cortex-A710 ARM processors", [ @@ -804,7 +817,8 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715", "Cortex-A715 ARM processors", [ @@ -812,7 +826,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715", FeaturePostRAScheduler, FeatureCmpBccFusion, FeatureLSLFast, - FeatureFuseAdrpAdd]>; + FeatureFuseAdrpAdd, + FeatureEnableSelectOptimize]>; def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", "CortexR82", @@ -825,7 +840,8 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", "Cortex-X2 ARM processors", [ @@ -833,14 +849,16 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", "Cortex-X3 ARM processors", [ FeatureLSLFast, FeatureFuseAdrpAdd, FeatureFuseAES, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", "Fujitsu A64FX processors", [ @@ -1024,34 +1042,39 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1 FeatureFuseAES, 
FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2", "Neoverse N2 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB", "Neoverse 512-TVB ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1", "Neoverse V1 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", "Neoverse V2 ARM processors", [ FeatureFuseAES, FeatureLSLFast, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ @@ -1262,7 +1285,8 @@ def ProcessorFeatures { // FeatureFuseAdrpAdd is enabled under Generic to allow linker merging // optimizations. def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic, - [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler]>; + [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, + FeatureEnableSelectOptimize]>; def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53, [TuneA35]>; def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53, diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 997b662..b622c93 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -132,6 +132,11 @@ static cl::opt cl::init(false)); static cl::opt + EnableSelectOpt("aarch64-select-opt", cl::Hidden, + cl::desc("Enable select to branch optimizations"), + cl::init(true)); + +static cl::opt BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true), cl::desc("Relax out of range conditional branches")); @@ -587,6 +592,9 @@ void AArch64PassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); + if (getOptLevel() == CodeGenOpt::Aggressive && EnableSelectOpt) + addPass(createSelectOptimizePass()); + addPass(createAArch64StackTaggingPass( /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None)); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index c96e064..e309117 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -388,6 +388,8 @@ public: int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const; /// @} + + bool enableSelectOptimize() { return ST->enableSelectOptimize(); } }; } // end namespace llvm diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 3ea1d12..90cf49e 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -66,6 +66,10 @@ ; CHECK-NEXT: Expand reduction intrinsics ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: TLS Variable Hoist +; CHECK-NEXT: Lazy Branch Probability Analysis +; 
CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Optimize selects ; CHECK-NEXT: Stack Safety Analysis ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll new file mode 100644 index 0000000..46ac585 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/selectopt.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=generic -S < %s | FileCheck %s --check-prefix=CHECKOO +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a55 -S < %s | FileCheck %s --check-prefix=CHECKII +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a510 -S < %s | FileCheck %s --check-prefix=CHECKII +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -S < %s | FileCheck %s --check-prefix=CHECKOO +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-n1 -S < %s | FileCheck %s --check-prefix=CHECKOO +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a710 -S < %s | FileCheck %s --check-prefix=CHECKOO +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s --check-prefix=CHECKOO + +%struct.st = type { i32, i64, ptr, ptr, i16, ptr, ptr, i64, i64 } + +; This test has a select at the end of if.then, which is better transformed to a branch on OoO cores. + +define void @replace(ptr nocapture noundef %newst, ptr noundef %t, ptr noundef %h, i64 noundef %c, i64 noundef %rc, i64 noundef %ma, i64 noundef %n) { +; CHECKOO-LABEL: @replace( +; CHECKOO-NEXT: entry: +; CHECKOO-NEXT: [[T1:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[NEWST:%.*]], i64 0, i32 2 +; CHECKOO-NEXT: store ptr [[T:%.*]], ptr [[T1]], align 8 +; CHECKOO-NEXT: [[H3:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 3 +; CHECKOO-NEXT: store ptr [[H:%.*]], ptr [[H3]], align 8 +; CHECKOO-NEXT: [[ORG_C:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 8 +; CHECKOO-NEXT: store i64 [[C:%.*]], ptr [[ORG_C]], align 8 +; CHECKOO-NEXT: [[C6:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 1 +; CHECKOO-NEXT: store i64 [[C]], ptr [[C6]], align 8 +; CHECKOO-NEXT: [[FLOW:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 7 +; CHECKOO-NEXT: store i64 [[RC:%.*]], ptr [[FLOW]], align 8 +; CHECKOO-NEXT: [[CONV:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECKOO-NEXT: store i32 [[CONV]], ptr [[NEWST]], align 8 +; CHECKOO-NEXT: [[FLOW10:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 1, i32 7 +; CHECKOO-NEXT: [[TMP0:%.*]] = load i64, ptr [[FLOW10]], align 8 +; CHECKOO-NEXT: [[FLOW12:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 2, i32 7 +; CHECKOO-NEXT: [[TMP1:%.*]] = load i64, ptr [[FLOW12]], align 8 +; CHECKOO-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECKOO-NEXT: [[CONV15:%.*]] = select i1 [[CMP13]], i64 2, i64 3 +; CHECKOO-NEXT: [[CMP16_NOT149:%.*]] = icmp sgt i64 [[CONV15]], [[MA:%.*]] +; CHECKOO-NEXT: br i1 [[CMP16_NOT149]], label [[WHILE_END:%.*]], label [[LAND_RHS:%.*]] +; CHECKOO: land.rhs: +; CHECKOO-NEXT: [[CMP_0151:%.*]] = phi i64 [ [[CMP_1:%.*]], [[IF_END87:%.*]] ], [ [[CONV15]], [[ENTRY:%.*]] ] +; CHECKOO-NEXT: [[POS_0150:%.*]] = phi i64 [ [[CMP_0151]], [[IF_END87]] ], [ 1, [[ENTRY]] ] +; CHECKOO-NEXT: [[SUB:%.*]] = add nsw i64 [[CMP_0151]], -1 
+; CHECKOO-NEXT: [[FLOW19:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 7 +; CHECKOO-NEXT: [[TMP2:%.*]] = load i64, ptr [[FLOW19]], align 8 +; CHECKOO-NEXT: [[CMP20:%.*]] = icmp sgt i64 [[TMP2]], [[RC]] +; CHECKOO-NEXT: br i1 [[CMP20]], label [[WHILE_BODY:%.*]], label [[WHILE_END]] +; CHECKOO: while.body: +; CHECKOO-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]] +; CHECKOO-NEXT: [[T24:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 2 +; CHECKOO-NEXT: [[TMP3:%.*]] = load ptr, ptr [[T24]], align 8 +; CHECKOO-NEXT: [[SUB25:%.*]] = add nsw i64 [[POS_0150]], -1 +; CHECKOO-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]] +; CHECKOO-NEXT: [[T27:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 2 +; CHECKOO-NEXT: store ptr [[TMP3]], ptr [[T27]], align 8 +; CHECKOO-NEXT: [[H30:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 3 +; CHECKOO-NEXT: [[TMP4:%.*]] = load ptr, ptr [[H30]], align 8 +; CHECKOO-NEXT: [[H33:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 3 +; CHECKOO-NEXT: store ptr [[TMP4]], ptr [[H33]], align 8 +; CHECKOO-NEXT: [[C36:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 1 +; CHECKOO-NEXT: [[TMP5:%.*]] = load i64, ptr [[C36]], align 8 +; CHECKOO-NEXT: [[C39:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 1 +; CHECKOO-NEXT: store i64 [[TMP5]], ptr [[C39]], align 8 +; CHECKOO-NEXT: [[TMP6:%.*]] = load i64, ptr [[C36]], align 8 +; CHECKOO-NEXT: [[ORG_C45:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 8 +; CHECKOO-NEXT: store i64 [[TMP6]], ptr [[ORG_C45]], align 8 +; CHECKOO-NEXT: [[FLOW51:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 7 +; CHECKOO-NEXT: store i64 [[TMP2]], ptr [[FLOW51]], align 8 +; CHECKOO-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX18]], align 8 +; CHECKOO-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX26]], align 8 +; CHECKOO-NEXT: store ptr [[T]], ptr [[T24]], align 8 +; CHECKOO-NEXT: store ptr [[H]], ptr [[H30]], align 8 +; CHECKOO-NEXT: store i64 [[C]], ptr [[C36]], align 8 +; CHECKOO-NEXT: [[ORG_C69:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 8 +; CHECKOO-NEXT: store i64 [[C]], ptr [[ORG_C69]], align 8 +; CHECKOO-NEXT: store i64 [[RC]], ptr [[FLOW19]], align 8 +; CHECKOO-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX18]], align 8 +; CHECKOO-NEXT: [[MUL:%.*]] = shl nsw i64 [[CMP_0151]], 1 +; CHECKOO-NEXT: [[ADD:%.*]] = or i64 [[MUL]], 1 +; CHECKOO-NEXT: [[CMP77_NOT:%.*]] = icmp sgt i64 [[ADD]], [[MA]] +; CHECKOO-NEXT: br i1 [[CMP77_NOT]], label [[IF_END87]], label [[IF_THEN:%.*]] +; CHECKOO: if.then: +; CHECKOO-NEXT: [[SUB79:%.*]] = add nsw i64 [[MUL]], -1 +; CHECKOO-NEXT: [[FLOW81:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB79]], i32 7 +; CHECKOO-NEXT: [[TMP8:%.*]] = load i64, ptr [[FLOW81]], align 8 +; CHECKOO-NEXT: [[FLOW83:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[MUL]], i32 7 +; CHECKOO-NEXT: [[TMP9:%.*]] = load i64, ptr [[FLOW83]], align 8 +; CHECKOO-NEXT: [[CMP84:%.*]] = icmp slt i64 [[TMP8]], [[TMP9]] +; CHECKOO-NEXT: [[SPEC_SELECT_FROZEN:%.*]] = freeze i1 [[CMP84]] +; CHECKOO-NEXT: br i1 [[SPEC_SELECT_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]] +; CHECKOO: select.false: +; CHECKOO-NEXT: br label 
[[SELECT_END]] +; CHECKOO: select.end: +; CHECKOO-NEXT: [[SPEC_SELECT:%.*]] = phi i64 [ [[ADD]], [[IF_THEN]] ], [ [[MUL]], [[SELECT_FALSE]] ] +; CHECKOO-NEXT: br label [[IF_END87]] +; CHECKOO: if.end87: +; CHECKOO-NEXT: [[CMP_1]] = phi i64 [ [[MUL]], [[WHILE_BODY]] ], [ [[SPEC_SELECT]], [[SELECT_END]] ] +; CHECKOO-NEXT: [[CMP16_NOT:%.*]] = icmp sgt i64 [[CMP_1]], [[MA]] +; CHECKOO-NEXT: br i1 [[CMP16_NOT]], label [[WHILE_END]], label [[LAND_RHS]] +; CHECKOO: while.end: +; CHECKOO-NEXT: ret void +; +; CHECKII-LABEL: @replace( +; CHECKII-NEXT: entry: +; CHECKII-NEXT: [[T1:%.*]] = getelementptr inbounds [[STRUCT_ST:%.*]], ptr [[NEWST:%.*]], i64 0, i32 2 +; CHECKII-NEXT: store ptr [[T:%.*]], ptr [[T1]], align 8 +; CHECKII-NEXT: [[H3:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 3 +; CHECKII-NEXT: store ptr [[H:%.*]], ptr [[H3]], align 8 +; CHECKII-NEXT: [[ORG_C:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 8 +; CHECKII-NEXT: store i64 [[C:%.*]], ptr [[ORG_C]], align 8 +; CHECKII-NEXT: [[C6:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 1 +; CHECKII-NEXT: store i64 [[C]], ptr [[C6]], align 8 +; CHECKII-NEXT: [[FLOW:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 0, i32 7 +; CHECKII-NEXT: store i64 [[RC:%.*]], ptr [[FLOW]], align 8 +; CHECKII-NEXT: [[CONV:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECKII-NEXT: store i32 [[CONV]], ptr [[NEWST]], align 8 +; CHECKII-NEXT: [[FLOW10:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 1, i32 7 +; CHECKII-NEXT: [[TMP0:%.*]] = load i64, ptr [[FLOW10]], align 8 +; CHECKII-NEXT: [[FLOW12:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 2, i32 7 +; CHECKII-NEXT: [[TMP1:%.*]] = load i64, ptr [[FLOW12]], align 8 +; CHECKII-NEXT: [[CMP13:%.*]] = icmp sgt i64 [[TMP0]], [[TMP1]] +; CHECKII-NEXT: [[CONV15:%.*]] = select i1 [[CMP13]], i64 2, i64 3 +; CHECKII-NEXT: [[CMP16_NOT149:%.*]] = icmp sgt i64 [[CONV15]], [[MA:%.*]] +; CHECKII-NEXT: br i1 [[CMP16_NOT149]], label [[WHILE_END:%.*]], label [[LAND_RHS:%.*]] +; CHECKII: land.rhs: +; CHECKII-NEXT: [[CMP_0151:%.*]] = phi i64 [ [[CMP_1:%.*]], [[IF_END87:%.*]] ], [ [[CONV15]], [[ENTRY:%.*]] ] +; CHECKII-NEXT: [[POS_0150:%.*]] = phi i64 [ [[CMP_0151]], [[IF_END87]] ], [ 1, [[ENTRY]] ] +; CHECKII-NEXT: [[SUB:%.*]] = add nsw i64 [[CMP_0151]], -1 +; CHECKII-NEXT: [[FLOW19:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 7 +; CHECKII-NEXT: [[TMP2:%.*]] = load i64, ptr [[FLOW19]], align 8 +; CHECKII-NEXT: [[CMP20:%.*]] = icmp sgt i64 [[TMP2]], [[RC]] +; CHECKII-NEXT: br i1 [[CMP20]], label [[WHILE_BODY:%.*]], label [[WHILE_END]] +; CHECKII: while.body: +; CHECKII-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]] +; CHECKII-NEXT: [[T24:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 2 +; CHECKII-NEXT: [[TMP3:%.*]] = load ptr, ptr [[T24]], align 8 +; CHECKII-NEXT: [[SUB25:%.*]] = add nsw i64 [[POS_0150]], -1 +; CHECKII-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]] +; CHECKII-NEXT: [[T27:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 2 +; CHECKII-NEXT: store ptr [[TMP3]], ptr [[T27]], align 8 +; CHECKII-NEXT: [[H30:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 3 +; CHECKII-NEXT: [[TMP4:%.*]] = load ptr, ptr [[H30]], align 8 +; CHECKII-NEXT: [[H33:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr 
[[NEWST]], i64 [[SUB25]], i32 3 +; CHECKII-NEXT: store ptr [[TMP4]], ptr [[H33]], align 8 +; CHECKII-NEXT: [[C36:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 1 +; CHECKII-NEXT: [[TMP5:%.*]] = load i64, ptr [[C36]], align 8 +; CHECKII-NEXT: [[C39:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 1 +; CHECKII-NEXT: store i64 [[TMP5]], ptr [[C39]], align 8 +; CHECKII-NEXT: [[TMP6:%.*]] = load i64, ptr [[C36]], align 8 +; CHECKII-NEXT: [[ORG_C45:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 8 +; CHECKII-NEXT: store i64 [[TMP6]], ptr [[ORG_C45]], align 8 +; CHECKII-NEXT: [[FLOW51:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB25]], i32 7 +; CHECKII-NEXT: store i64 [[TMP2]], ptr [[FLOW51]], align 8 +; CHECKII-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX18]], align 8 +; CHECKII-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX26]], align 8 +; CHECKII-NEXT: store ptr [[T]], ptr [[T24]], align 8 +; CHECKII-NEXT: store ptr [[H]], ptr [[H30]], align 8 +; CHECKII-NEXT: store i64 [[C]], ptr [[C36]], align 8 +; CHECKII-NEXT: [[ORG_C69:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB]], i32 8 +; CHECKII-NEXT: store i64 [[C]], ptr [[ORG_C69]], align 8 +; CHECKII-NEXT: store i64 [[RC]], ptr [[FLOW19]], align 8 +; CHECKII-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX18]], align 8 +; CHECKII-NEXT: [[MUL:%.*]] = shl nsw i64 [[CMP_0151]], 1 +; CHECKII-NEXT: [[ADD:%.*]] = or i64 [[MUL]], 1 +; CHECKII-NEXT: [[CMP77_NOT:%.*]] = icmp sgt i64 [[ADD]], [[MA]] +; CHECKII-NEXT: br i1 [[CMP77_NOT]], label [[IF_END87]], label [[IF_THEN:%.*]] +; CHECKII: if.then: +; CHECKII-NEXT: [[SUB79:%.*]] = add nsw i64 [[MUL]], -1 +; CHECKII-NEXT: [[FLOW81:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[SUB79]], i32 7 +; CHECKII-NEXT: [[TMP8:%.*]] = load i64, ptr [[FLOW81]], align 8 +; CHECKII-NEXT: [[FLOW83:%.*]] = getelementptr inbounds [[STRUCT_ST]], ptr [[NEWST]], i64 [[MUL]], i32 7 +; CHECKII-NEXT: [[TMP9:%.*]] = load i64, ptr [[FLOW83]], align 8 +; CHECKII-NEXT: [[CMP84:%.*]] = icmp slt i64 [[TMP8]], [[TMP9]] +; CHECKII-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP84]], i64 [[ADD]], i64 [[MUL]] +; CHECKII-NEXT: br label [[IF_END87]] +; CHECKII: if.end87: +; CHECKII-NEXT: [[CMP_1]] = phi i64 [ [[MUL]], [[WHILE_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECKII-NEXT: [[CMP16_NOT:%.*]] = icmp sgt i64 [[CMP_1]], [[MA]] +; CHECKII-NEXT: br i1 [[CMP16_NOT]], label [[WHILE_END]], label [[LAND_RHS]] +; CHECKII: while.end: +; CHECKII-NEXT: ret void +; +entry: + %t1 = getelementptr inbounds %struct.st, ptr %newst, i64 0, i32 2 + store ptr %t, ptr %t1, align 8 + %h3 = getelementptr inbounds %struct.st, ptr %newst, i64 0, i32 3 + store ptr %h, ptr %h3, align 8 + %org_c = getelementptr inbounds %struct.st, ptr %newst, i64 0, i32 8 + store i64 %c, ptr %org_c, align 8 + %c6 = getelementptr inbounds %struct.st, ptr %newst, i64 0, i32 1 + store i64 %c, ptr %c6, align 8 + %flow = getelementptr inbounds %struct.st, ptr %newst, i64 0, i32 7 + store i64 %rc, ptr %flow, align 8 + %conv = trunc i64 %n to i32 + store i32 %conv, ptr %newst, align 8 + %flow10 = getelementptr inbounds %struct.st, ptr %newst, i64 1, i32 7 + %0 = load i64, ptr %flow10, align 8 + %flow12 = getelementptr inbounds %struct.st, ptr %newst, i64 2, i32 7 + %1 = load i64, ptr %flow12, align 8 + %cmp13 = icmp sgt i64 %0, %1 + %conv15 = select i1 %cmp13, i64 2, i64 3 + %cmp16.not149 = icmp sgt i64 %conv15, %ma + br i1 %cmp16.not149, 
label %while.end, label %land.rhs + +land.rhs: ; preds = %entry, %if.end87 + %cmp.0151 = phi i64 [ %cmp.1, %if.end87 ], [ %conv15, %entry ] + %pos.0150 = phi i64 [ %cmp.0151, %if.end87 ], [ 1, %entry ] + %sub = add nsw i64 %cmp.0151, -1 + %flow19 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub, i32 7 + %2 = load i64, ptr %flow19, align 8 + %cmp20 = icmp sgt i64 %2, %rc + br i1 %cmp20, label %while.body, label %while.end + +while.body: ; preds = %land.rhs + %arrayidx18 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub + %t24 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub, i32 2 + %3 = load ptr, ptr %t24, align 8 + %sub25 = add nsw i64 %pos.0150, -1 + %arrayidx26 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub25 + %t27 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub25, i32 2 + store ptr %3, ptr %t27, align 8 + %h30 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub, i32 3 + %4 = load ptr, ptr %h30, align 8 + %h33 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub25, i32 3 + store ptr %4, ptr %h33, align 8 + %c36 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub, i32 1 + %5 = load i64, ptr %c36, align 8 + %c39 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub25, i32 1 + store i64 %5, ptr %c39, align 8 + %6 = load i64, ptr %c36, align 8 + %org_c45 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub25, i32 8 + store i64 %6, ptr %org_c45, align 8 + %flow51 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub25, i32 7 + store i64 %2, ptr %flow51, align 8 + %7 = load i32, ptr %arrayidx18, align 8 + store i32 %7, ptr %arrayidx26, align 8 + store ptr %t, ptr %t24, align 8 + store ptr %h, ptr %h30, align 8 + store i64 %c, ptr %c36, align 8 + %org_c69 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub, i32 8 + store i64 %c, ptr %org_c69, align 8 + store i64 %rc, ptr %flow19, align 8 + store i32 %conv, ptr %arrayidx18, align 8 + %mul = shl nsw i64 %cmp.0151, 1 + %add = or i64 %mul, 1 + %cmp77.not = icmp sgt i64 %add, %ma + br i1 %cmp77.not, label %if.end87, label %if.then + +if.then: ; preds = %while.body + %sub79 = add nsw i64 %mul, -1 + %flow81 = getelementptr inbounds %struct.st, ptr %newst, i64 %sub79, i32 7 + %8 = load i64, ptr %flow81, align 8 + %flow83 = getelementptr inbounds %struct.st, ptr %newst, i64 %mul, i32 7 + %9 = load i64, ptr %flow83, align 8 + %cmp84 = icmp slt i64 %8, %9 + %spec.select = select i1 %cmp84, i64 %add, i64 %mul + br label %if.end87 + +if.end87: ; preds = %if.then, %while.body + %cmp.1 = phi i64 [ %mul, %while.body ], [ %spec.select, %if.then ] + %cmp16.not = icmp sgt i64 %cmp.1, %ma + br i1 %cmp16.not, label %while.end, label %land.rhs + +while.end: ; preds = %land.rhs, %if.end87, %entry + ret void +} -- 2.7.4