From 4c4ff13d3c813712a80e030ae0a38ca475df74d9 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Thu, 14 Mar 2019 11:14:13 +0000 Subject: [PATCH] [ARM][ParallelDSP] Enable multiple uses of loads When choosing whether a pair of loads can be combined into a single wide load, we check that the load only has a sext user and that sext also only has one user. But this can prevent the transformation in the cases when parallel macs use the same loaded data multiple times. To enable this, we need to fix up any other uses after creating the wide load: generating a trunc and a shift + trunc pair to recreate the narrow values. We also need to keep a record of which loads have already been widened. Differential Revision: https://reviews.llvm.org/D59215 llvm-svn: 356132 --- llvm/lib/Target/ARM/ARMParallelDSP.cpp | 199 +++++++++------- .../CodeGen/ARM/ParallelDSP/multi-use-loads.ll | 251 +++++++++++++++++++++ llvm/test/CodeGen/ARM/ParallelDSP/smlad0.ll | 1 + .../CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll | 217 ++++++++++++++++++ 4 files changed, 584 insertions(+), 84 deletions(-) create mode 100644 llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll create mode 100644 llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 9730c32..9b770dd 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -53,7 +53,7 @@ namespace { using OpChainList = SmallVector, 8>; using ReductionList = SmallVector; using ValueList = SmallVector; - using MemInstList = SmallVector; + using MemInstList = SmallVector; using PMACPair = std::pair; using PMACPairList = SmallVector; using Instructions = SmallVector; @@ -113,6 +113,21 @@ namespace { Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }; }; + class WidenedLoad { + LoadInst *NewLd = nullptr; + SmallVector Loads; + + public: + WidenedLoad(SmallVectorImpl &Lds, LoadInst *Wide) + : NewLd(Wide) { + for (auto *I : Lds) + Loads.push_back(I); + } + LoadInst *getLoad() { + return NewLd; + } + }; + class ARMParallelDSP : public LoopPass { ScalarEvolution *SE; AliasAnalysis *AA; @@ -123,13 +138,17 @@ namespace { const DataLayout *DL; Module *M; std::map LoadPairs; - std::map> SequentialLoads; + std::map> WideLoads; - bool RecordSequentialLoads(BasicBlock *Header); + bool RecordSequentialLoads(BasicBlock *BB); bool InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); + LoadInst* CreateLoadIns(IRBuilder &IRB, + SmallVectorImpl &Loads, + IntegerType *LoadTy); void CreateParallelMACPairs(Reduction &R); - Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, + Instruction *CreateSMLADCall(SmallVectorImpl &VecLd0, + SmallVectorImpl &VecLd1, Instruction *Acc, bool Exchange, Instruction *InsertAfter); @@ -202,7 +221,6 @@ namespace { } LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - bool Changes = false; LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); @@ -212,7 +230,7 @@ namespace { return false; } - Changes = MatchSMLAD(F); + bool Changes = MatchSMLAD(F); return Changes; } }; @@ -225,7 +243,6 @@ namespace { // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template static bool IsNarrowSequence(Value *V, ValueList &VL) { - LLVM_DEBUG(dbgs() << "Is narrow sequence? 
"; V->dump()); ConstantInt *CInt; if (match(V, m_ConstantInt(CInt))) { @@ -244,38 +261,25 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) { } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) { // TODO: we need to implement sadd16/sadd8 for this, which enables to // also do the rewrite for smlad8.ll, but it is unsupported for now. - LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump()); return false; } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) { - if (cast(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) { - LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " << - cast(I)->getSrcTy()->getIntegerBitWidth() << "\n"); + if (cast(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; - } if (match(Val, m_Load(m_Value()))) { - LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump()); VL.push_back(Val); VL.push_back(I); return true; } } - LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump()); return false; } template static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, const DataLayout &DL, ScalarEvolution &SE) { - if (!MemOp0->isSimple() || !MemOp1->isSimple()) { - LLVM_DEBUG(dbgs() << "No, not touching volatile access\n"); - return false; - } - if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) { - LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n"); + if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) return true; - } - LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n"); return false; } @@ -284,19 +288,14 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, if (!Ld0 || !Ld1) return false; - LLVM_DEBUG(dbgs() << "Are consecutive loads:\n"; + if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1) + return false; + + LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n"; dbgs() << "Ld0:"; Ld0->dump(); dbgs() << "Ld1:"; Ld1->dump(); ); - if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) { - LLVM_DEBUG(dbgs() << "No, load has more than one use.\n"); - return false; - } - - if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1) - return false; - VecMem.clear(); VecMem.push_back(Ld0); VecMem.push_back(Ld1); @@ -305,17 +304,16 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, /// Iterate through the block and record base, offset pairs of loads as well as /// maximal sequences of sequential loads. 
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) { +bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) { SmallVector Loads; - for (auto &I : *Header) { + for (auto &I : *BB) { auto *Ld = dyn_cast(&I); - if (!Ld) + if (!Ld || !Ld->isSimple() || + !Ld->hasOneUse() || !isa(Ld->user_back())) continue; Loads.push_back(Ld); } - std::map BaseLoads; - for (auto *Ld0 : Loads) { for (auto *Ld1 : Loads) { if (Ld0 == Ld1) @@ -323,17 +321,18 @@ bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) { if (AreSequentialAccesses(Ld0, Ld1, *DL, *SE)) { LoadPairs[Ld0] = Ld1; - if (BaseLoads.count(Ld0)) { - LoadInst *Base = BaseLoads[Ld0]; - BaseLoads[Ld1] = Base; - SequentialLoads[Base].push_back(Ld1); - } else { - BaseLoads[Ld1] = Ld0; - SequentialLoads[Ld0].push_back(Ld1); - } + break; } } } + + LLVM_DEBUG(if (!LoadPairs.empty()) { + dbgs() << "Consecutive load pairs:\n"; + for (auto &MapIt : LoadPairs) { + LLVM_DEBUG(dbgs() << *MapIt.first << ", " + << *MapIt.second << "\n"); + } + }); return LoadPairs.size() > 1; } @@ -362,12 +361,11 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { if (!Ld0 || !Ld1 || !Ld2 || !Ld3) return false; - LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n" - << "\t Ld0: " << *Ld0 << "\n" - << "\t Ld1: " << *Ld1 << "\n" - << "and operands " << x + 2 << ":\n" - << "\t Ld2: " << *Ld2 << "\n" - << "\t Ld3: " << *Ld3 << "\n"); + LLVM_DEBUG(dbgs() << "Loads:\n" + << " - " << *Ld0 << "\n" + << " - " << *Ld1 << "\n" + << " - " << *Ld2 << "\n" + << " - " << *Ld3 << "\n"); if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { @@ -416,11 +414,6 @@ void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) { assert(PMul0 != PMul1 && "expected different chains"); - LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n"; - dbgs() << "- "; Mul0->dump(); - dbgs() << "- "; Mul1->dump()); - - LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n"); if (CanPair(PMul0, PMul1)) { Paired.insert(Mul0); Paired.insert(Mul1); @@ -441,9 +434,8 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) { dbgs() << "- "; PMul0->Root->dump(); dbgs() << "- "; PMul1->Root->dump()); - auto *VecLd0 = cast(PMul0->VecLd[0]); - auto *VecLd1 = cast(PMul1->VecLd[0]); - Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter); + Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange, + InsertAfter); InsertAfter = Acc; } @@ -499,14 +491,12 @@ static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header, static void AddMACCandidate(OpChainList &Candidates, Instruction *Mul, Value *MulOp0, Value *MulOp1) { - LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump()); assert(Mul->getOpcode() == Instruction::Mul && "expected mul instruction"); ValueList LHS; ValueList RHS; if (IsNarrowSequence<16>(MulOp0, LHS) && IsNarrowSequence<16>(MulOp1, RHS)) { - LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump()); Candidates.push_back(make_unique(Mul, LHS, RHS)); } } @@ -514,7 +504,7 @@ static void AddMACCandidate(OpChainList &Candidates, static void MatchParallelMACSequences(Reduction &R, OpChainList &Candidates) { Instruction *Acc = R.AccIntAdd; - LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc); + LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc << "\n"); // Returns false to signal the search should be stopped. 
std::function Match = @@ -687,32 +677,81 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) { return Changed; } -static LoadInst *CreateLoadIns(IRBuilder &IRB, LoadInst &BaseLoad, - Type *LoadTy) { - const unsigned AddrSpace = BaseLoad.getPointerAddressSpace(); - - Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(), +LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder &IRB, + SmallVectorImpl &Loads, + IntegerType *LoadTy) { + assert(Loads.size() == 2 && "currently only support widening two loads"); + + const unsigned AddrSpace = Loads[0]->getPointerAddressSpace(); + Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(), LoadTy->getPointerTo(AddrSpace)); - return IRB.CreateAlignedLoad(LoadTy, VecPtr, BaseLoad.getAlignment()); + LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, + Loads[0]->getAlignment()); + // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc. + Instruction *SExt0 = dyn_cast(Loads[0]->user_back()); + Instruction *SExt1 = dyn_cast(Loads[1]->user_back()); + + assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) && + "Loads should have a single, extending, user"); + + std::function MoveAfter = + [&](Instruction* Source, Instruction* Sink) -> void { + if (DT->dominates(Source, Sink) || + Source->getParent() != Sink->getParent() || + isa(Source) || isa(Sink)) + return; + + Sink->moveAfter(Source); + for (auto &U : Sink->uses()) + MoveAfter(Sink, cast(U.getUser())); + }; + + // From the wide load, create two values that equal the original two loads. + Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType()); + SExt0->setOperand(0, Bottom); + if (auto *I = dyn_cast(Bottom)) { + I->moveAfter(WideLoad); + MoveAfter(I, SExt0); + } + + IntegerType *Ld1Ty = cast(Loads[1]->getType()); + Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth()); + Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); + if (auto *I = dyn_cast(Top)) + MoveAfter(WideLoad, I); + + Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty); + SExt1->setOperand(0, Trunc); + if (auto *I = dyn_cast(Trunc)) + MoveAfter(I, SExt1); + + WideLoads.emplace(std::make_pair(Loads[0], + make_unique(Loads, WideLoad))); + return WideLoad; } -Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, +Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl &VecLd0, + SmallVectorImpl &VecLd1, Instruction *Acc, bool Exchange, Instruction *InsertAfter) { LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n" - << "- " << *VecLd0 << "\n" - << "- " << *VecLd1 << "\n" + << "- " << *VecLd0[0] << "\n" + << "- " << *VecLd0[1] << "\n" + << "- " << *VecLd1[0] << "\n" + << "- " << *VecLd1[1] << "\n" << "- " << *Acc << "\n" - << "Exchange: " << Exchange << "\n"); + << "- Exchange: " << Exchange << "\n"); IRBuilder Builder(InsertAfter->getParent(), ++BasicBlock::iterator(InsertAfter)); // Replace the reduction chain with an intrinsic call - Type *Ty = IntegerType::get(M->getContext(), 32); - LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty); - LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty); - Value* Args[] = { NewLd0, NewLd1, Acc }; + IntegerType *Ty = IntegerType::get(M->getContext(), 32); + LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ? + WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty); + LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ? 
+ WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty); + Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; if (Exchange) SMLAD = Acc->getType()->isIntegerTy(32) ? @@ -740,7 +779,6 @@ bool BinOpChain::AreSymmetrical(BinOpChain *Other) { } const unsigned Pairs = VL0.size(); - LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n"); for (unsigned i = 0; i < Pairs; ++i) { const Value *V0 = VL0[i]; @@ -748,24 +786,17 @@ bool BinOpChain::AreSymmetrical(BinOpChain *Other) { const auto *Inst0 = dyn_cast(V0); const auto *Inst1 = dyn_cast(V1); - LLVM_DEBUG(dbgs() << "Pair " << i << ":\n"; - dbgs() << "mul1: "; V0->dump(); - dbgs() << "mul2: "; V1->dump()); - if (!Inst0 || !Inst1) return false; - if (Inst0->isSameOperationAs(Inst1)) { - LLVM_DEBUG(dbgs() << "OK: same operation found!\n"); + if (Inst0->isSameOperationAs(Inst1)) continue; - } const APInt *C0, *C1; if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1)) return false; } - LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n"); return true; }; diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll new file mode 100644 index 0000000..3c19068 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -0,0 +1,251 @@ +; RUN: llc -O3 -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s | FileCheck %s + +; CHECK-LABEL: add_user +; CHECK: %for.body +; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]! +; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]] +define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +entry: + %cmp24 = icmp sgt i32 %arg, 0 + br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %.pre = load i16, i16* %arg3, align 2 + %.pre27 = load i16, i16* %arg2, align 2 + br label %for.body + +for.cond.cleanup: + %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] + %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] + %res = add i32 %mac1.0.lcssa, %count.final + ret i32 %res + +for.body: + %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] + %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025 + %0 = load i16, i16* %arrayidx, align 2 + %add = add nuw nsw i32 %i.025, 1 + %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add + %1 = load i16, i16* %arrayidx1, align 2 + %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025 + %2 = load i16, i16* %arrayidx3, align 2 + %conv = sext i16 %2 to i32 + %conv4 = sext i16 %0 to i32 + %count.next = add i32 %conv4, %count + %mul = mul nsw i32 %conv, %conv4 + %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add + %3 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %3 to i32 + %conv8 = sext i16 %1 to i32 + %mul9 = mul nsw i32 %conv7, %conv8 + %add10 = add i32 %mul, %mac1.026 + %add11 = add i32 %mul9, %add10 + %exitcond = icmp ne i32 %add, %arg + br i1 %exitcond, label %for.body, label %for.cond.cleanup +} + +; CHECK-LABEL: mul_bottom_user +; CHECK: %for.body +; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]! 
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: sxth [[SXT:r[0-9]+]], [[A]] +; CHECK: mul [[COUNT:r[0-9]+]], [[SXT]], [[COUNT]] +define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +entry: + %cmp24 = icmp sgt i32 %arg, 0 + br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %.pre = load i16, i16* %arg3, align 2 + %.pre27 = load i16, i16* %arg2, align 2 + br label %for.body + +for.cond.cleanup: + %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] + %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] + %res = add i32 %mac1.0.lcssa, %count.final + ret i32 %res + +for.body: + %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] + %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025 + %0 = load i16, i16* %arrayidx, align 2 + %add = add nuw nsw i32 %i.025, 1 + %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add + %1 = load i16, i16* %arrayidx1, align 2 + %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025 + %2 = load i16, i16* %arrayidx3, align 2 + %conv = sext i16 %2 to i32 + %conv4 = sext i16 %0 to i32 + %mul = mul nsw i32 %conv, %conv4 + %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add + %3 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %3 to i32 + %conv8 = sext i16 %1 to i32 + %mul9 = mul nsw i32 %conv7, %conv8 + %add10 = add i32 %mul, %mac1.026 + %add11 = add i32 %mul9, %add10 + %count.next = mul i32 %conv4, %count + %exitcond = icmp ne i32 %add, %arg + br i1 %exitcond, label %for.body, label %for.cond.cleanup +} + +; CHECK-LABEL: mul_top_user +; CHECK: %for.body +; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]! 
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: asr.w [[ASR:[rl0-9]+]], [[ASR]], #16 +; CHECK: mul [[COUNT:[rl0-9]+]], [[ASR]], [[COUNT]] +define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +entry: + %cmp24 = icmp sgt i32 %arg, 0 + br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %.pre = load i16, i16* %arg3, align 2 + %.pre27 = load i16, i16* %arg2, align 2 + br label %for.body + +for.cond.cleanup: + %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] + %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] + %res = add i32 %mac1.0.lcssa, %count.final + ret i32 %res + +for.body: + %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] + %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025 + %0 = load i16, i16* %arrayidx, align 2 + %add = add nuw nsw i32 %i.025, 1 + %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add + %1 = load i16, i16* %arrayidx1, align 2 + %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025 + %2 = load i16, i16* %arrayidx3, align 2 + %conv = sext i16 %2 to i32 + %conv4 = sext i16 %0 to i32 + %mul = mul nsw i32 %conv, %conv4 + %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add + %3 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %3 to i32 + %conv8 = sext i16 %1 to i32 + %mul9 = mul nsw i32 %conv7, %conv8 + %add10 = add i32 %mul, %mac1.026 + %add11 = add i32 %mul9, %add10 + %count.next = mul i32 %conv7, %count + %exitcond = icmp ne i32 %add, %arg + br i1 %exitcond, label %for.body, label %for.cond.cleanup +} + +; CHECK-LABEL: and_user +; CHECK: %for.body +; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]! +; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]! 
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: uxth [[UXT:r[0-9]+]], [[A]] +; CHECK: mul [[MUL:r[0-9]+]], [[UXT]], [[MUL]] +define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +entry: + %cmp24 = icmp sgt i32 %arg, 0 + br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %.pre = load i16, i16* %arg3, align 2 + %.pre27 = load i16, i16* %arg2, align 2 + br label %for.body + +for.cond.cleanup: + %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] + %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] + %res = add i32 %mac1.0.lcssa, %count.final + ret i32 %res + +for.body: + %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] + %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025 + %0 = load i16, i16* %arrayidx, align 2 + %add = add nuw nsw i32 %i.025, 1 + %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add + %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025 + %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add + %1 = load i16, i16* %arrayidx1, align 2 + %2 = load i16, i16* %arrayidx3, align 2 + %conv = sext i16 %2 to i32 + %conv4 = sext i16 %0 to i32 + %bottom = and i32 %conv4, 65535 + %mul = mul nsw i32 %conv, %conv4 + %3 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %3 to i32 + %conv8 = sext i16 %1 to i32 + %mul9 = mul nsw i32 %conv7, %conv8 + %add10 = add i32 %mul, %mac1.026 + %add11 = add i32 %mul9, %add10 + %count.next = mul i32 %bottom, %count + %exitcond = icmp ne i32 %add, %arg + br i1 %exitcond, label %for.body, label %for.cond.cleanup +} + +; CHECK-LABEL: multi_uses +; CHECK: %for.body +; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]! +; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]! 
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]] +; CHECK: sxth [[SXT:r[0-9]+]], [[A]] +; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]] +; CHECK: mul [[MUL:r[0-9]+]], [[EOR]], [[SXT]] +; CHECK: lsl.w [[SHIFT]], [[MUL]], #16 +define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { +entry: + %cmp24 = icmp sgt i32 %arg, 0 + br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %.pre = load i16, i16* %arg3, align 2 + %.pre27 = load i16, i16* %arg2, align 2 + br label %for.body + +for.cond.cleanup: + %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ] + %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ] + %res = add i32 %mac1.0.lcssa, %count.final + ret i32 %res + +for.body: + %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ] + %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025 + %0 = load i16, i16* %arrayidx, align 2 + %add = add nuw nsw i32 %i.025, 1 + %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add + %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025 + %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add + %1 = load i16, i16* %arrayidx1, align 2 + %2 = load i16, i16* %arrayidx3, align 2 + %conv = sext i16 %2 to i32 + %conv4 = sext i16 %0 to i32 + %bottom = and i32 %conv4, 65535 + %mul = mul nsw i32 %conv, %conv4 + %3 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %3 to i32 + %conv8 = sext i16 %1 to i32 + %mul9 = mul nsw i32 %conv7, %conv8 + %add10 = add i32 %mul, %mac1.026 + %shl = shl i32 %conv4, 16 + %add11 = add i32 %mul9, %add10 + %xor = xor i32 %bottom, %count + %count.next = mul i32 %xor, %shl + %exitcond = icmp ne i32 %add, %arg + br i1 %exitcond, label %for.body, label %for.cond.cleanup +} diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad0.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad0.ll index 477f565..d3bf51f 100644 --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad0.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad0.ll @@ -210,3 +210,4 @@ for.body: %exitcond = icmp ne i32 %add, %arg br i1 %exitcond, label %for.body, label %for.cond.cleanup } + diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll new file mode 100644 index 0000000..cb51bc5 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll @@ -0,0 +1,217 @@ +; RUN: llc -O3 -mtriple=thumbv7em %s -o - | FileCheck %s +; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s + +; Test that the duplicate loads are removed, which allows parallel dsp to find +; the parallel operations. 
+ +define void @unroll_n_jam_smlad(i32* %res, i16* %A, i16* %B, i32 %N, i32 %idx) { +entry: + %xtraiter306.i = and i32 %N, 3 + %unroll_iter310.i = sub i32 %N, %xtraiter306.i + %arrayidx.us.i117.i = getelementptr inbounds i32, i32* %res, i32 %idx + store i32 0, i32* %arrayidx.us.i117.i, align 4 + %mul.us.i118.i = mul i32 %idx, %N + %inc11.us.i.i = or i32 %idx, 1 + %arrayidx.us.i117.1.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.i + store i32 0, i32* %arrayidx.us.i117.1.i, align 4 + %mul.us.i118.1.i = mul i32 %inc11.us.i.i, %N + %inc11.us.i.1.i = or i32 %idx, 2 + %arrayidx.us.i117.2.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.1.i + store i32 0, i32* %arrayidx.us.i117.2.i, align 4 + %mul.us.i118.2.i = mul i32 %inc11.us.i.1.i, %N + %inc11.us.i.2.i = or i32 %idx, 3 + %arrayidx.us.i117.3.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.2.i + store i32 0, i32* %arrayidx.us.i117.3.i, align 4 + %mul.us.i118.3.i = mul i32 %inc11.us.i.2.i, %N + %inc11.us.i.3.i = add i32 %idx, 4 + br label %for.body + +; CHECK: %for.body +; CHECK: smlad +; CHECK: smlad +; CHECK: smlad +; CHECK: smlad +; CHECK: smlad +; CHECK: smlad +; CHECK: smlad +; CHECK: smlad + +for.body: + %A3 = phi i32 [ %add9.us.i.3361.i, %for.body ], [ 0, %entry ] + %j.026.us.i.i = phi i32 [ %inc.us.i.3362.i, %for.body ], [ 0, %entry ] + %A4 = phi i32 [ %add9.us.i.1.3.i, %for.body ], [ 0, %entry ] + %A5 = phi i32 [ %add9.us.i.2.3.i, %for.body ], [ 0, %entry ] + %A6 = phi i32 [ %add9.us.i.3.3.i, %for.body ], [ 0, %entry ] + %niter335.i = phi i32 [ %niter335.nsub.3.i, %for.body ], [ %unroll_iter310.i, %entry ] + %add.us.i.i = add i32 %j.026.us.i.i, %mul.us.i118.i + %arrayidx4.us.i.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.i + %A7 = load i16, i16* %arrayidx4.us.i.i, align 2 + %conv.us.i.i = sext i16 %A7 to i32 + %arrayidx5.us.i.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i + %A8 = load i16, i16* %arrayidx5.us.i.i, align 2 + %conv6.us.i.i = sext i16 %A8 to i32 + %mul7.us.i.i = mul nsw i32 %conv6.us.i.i, %conv.us.i.i + %add9.us.i.i = add nsw i32 %mul7.us.i.i, %A3 + %inc.us.i.i = or i32 %j.026.us.i.i, 1 + %add.us.i.1.i = add i32 %j.026.us.i.i, %mul.us.i118.1.i + %arrayidx4.us.i.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.i + %A9 = load i16, i16* %arrayidx4.us.i.1.i, align 2 + %conv.us.i.1.i = sext i16 %A9 to i32 + %arrayidx5.us.i.1.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i + %B0 = load i16, i16* %arrayidx5.us.i.1.i, align 2 + %conv6.us.i.1.i = sext i16 %B0 to i32 + %mul7.us.i.1.i = mul nsw i32 %conv6.us.i.1.i, %conv.us.i.1.i + %add9.us.i.1.i = add nsw i32 %mul7.us.i.1.i, %A4 + %inc.us.i.1.i = or i32 %j.026.us.i.i, 1 + %add.us.i.2.i = add i32 %j.026.us.i.i, %mul.us.i118.2.i + %arrayidx4.us.i.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.i + %B1 = load i16, i16* %arrayidx4.us.i.2.i, align 2 + %conv.us.i.2.i = sext i16 %B1 to i32 + %arrayidx5.us.i.2.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i + %B2 = load i16, i16* %arrayidx5.us.i.2.i, align 2 + %conv6.us.i.2.i = sext i16 %B2 to i32 + %mul7.us.i.2.i = mul nsw i32 %conv6.us.i.2.i, %conv.us.i.2.i + %add9.us.i.2.i = add nsw i32 %mul7.us.i.2.i, %A5 + %inc.us.i.2.i = or i32 %j.026.us.i.i, 1 + %add.us.i.3.i = add i32 %j.026.us.i.i, %mul.us.i118.3.i + %arrayidx4.us.i.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.i + %B3 = load i16, i16* %arrayidx4.us.i.3.i, align 2 + %conv.us.i.3.i = sext i16 %B3 to i32 + %arrayidx5.us.i.3.i = getelementptr inbounds i16, i16* %B, i32 
%j.026.us.i.i + %B4 = load i16, i16* %arrayidx5.us.i.3.i, align 2 + %conv6.us.i.3.i = sext i16 %B4 to i32 + %mul7.us.i.3.i = mul nsw i32 %conv6.us.i.3.i, %conv.us.i.3.i + %add9.us.i.3.i = add nsw i32 %mul7.us.i.3.i, %A6 + %inc.us.i.3.i = or i32 %j.026.us.i.i, 1 + %add.us.i.1337.i = add i32 %inc.us.i.i, %mul.us.i118.i + %arrayidx4.us.i.1338.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1337.i + %B5 = load i16, i16* %arrayidx4.us.i.1338.i, align 2 + %conv.us.i.1339.i = sext i16 %B5 to i32 + %arrayidx5.us.i.1340.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.i + %B6 = load i16, i16* %arrayidx5.us.i.1340.i, align 2 + %conv6.us.i.1341.i = sext i16 %B6 to i32 + %mul7.us.i.1342.i = mul nsw i32 %conv6.us.i.1341.i, %conv.us.i.1339.i + %add9.us.i.1343.i = add nsw i32 %mul7.us.i.1342.i, %add9.us.i.i + %inc.us.i.1344.i = or i32 %j.026.us.i.i, 2 + %add.us.i.1.1.i = add i32 %inc.us.i.1.i, %mul.us.i118.1.i + %arrayidx4.us.i.1.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.1.i + %B7 = load i16, i16* %arrayidx4.us.i.1.1.i, align 2 + %conv.us.i.1.1.i = sext i16 %B7 to i32 + %arrayidx5.us.i.1.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.i + %B6.dup = load i16, i16* %arrayidx5.us.i.1.1.i, align 2 + %conv6.us.i.1.1.i = sext i16 %B6.dup to i32 + %mul7.us.i.1.1.i = mul nsw i32 %conv6.us.i.1.1.i, %conv.us.i.1.1.i + %add9.us.i.1.1.i = add nsw i32 %mul7.us.i.1.1.i, %add9.us.i.1.i + %inc.us.i.1.1.i = or i32 %j.026.us.i.i, 2 + %add.us.i.2.1.i = add i32 %inc.us.i.2.i, %mul.us.i118.2.i + %arrayidx4.us.i.2.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.1.i + %B9 = load i16, i16* %arrayidx4.us.i.2.1.i, align 2 + %conv.us.i.2.1.i = sext i16 %B9 to i32 + %arrayidx5.us.i.2.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.i + %B6.dup.i = load i16, i16* %arrayidx5.us.i.2.1.i, align 2 + %conv6.us.i.2.1.i = sext i16 %B6.dup.i to i32 + %mul7.us.i.2.1.i = mul nsw i32 %conv6.us.i.2.1.i, %conv.us.i.2.1.i + %add9.us.i.2.1.i = add nsw i32 %mul7.us.i.2.1.i, %add9.us.i.2.i + %inc.us.i.2.1.i = or i32 %j.026.us.i.i, 2 + %add.us.i.3.1.i = add i32 %inc.us.i.3.i, %mul.us.i118.3.i + %arrayidx4.us.i.3.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.1.i + %B11 = load i16, i16* %arrayidx4.us.i.3.1.i, align 2 + %conv.us.i.3.1.i = sext i16 %B11 to i32 + %arrayidx5.us.i.3.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.i + %B6.dup.i.i = load i16, i16* %arrayidx5.us.i.3.1.i, align 2 + %conv6.us.i.3.1.i = sext i16 %B6.dup.i.i to i32 + %mul7.us.i.3.1.i = mul nsw i32 %conv6.us.i.3.1.i, %conv.us.i.3.1.i + %add9.us.i.3.1.i = add nsw i32 %mul7.us.i.3.1.i, %add9.us.i.3.i + %inc.us.i.3.1.i = or i32 %j.026.us.i.i, 2 + %add.us.i.2346.i = add i32 %inc.us.i.1344.i, %mul.us.i118.i + %arrayidx4.us.i.2347.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2346.i + %B13 = load i16, i16* %arrayidx4.us.i.2347.i, align 2 + %conv.us.i.2348.i = sext i16 %B13 to i32 + %arrayidx5.us.i.2349.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1344.i + %B14 = load i16, i16* %arrayidx5.us.i.2349.i, align 2 + %conv6.us.i.2350.i = sext i16 %B14 to i32 + %mul7.us.i.2351.i = mul nsw i32 %conv6.us.i.2350.i, %conv.us.i.2348.i + %add9.us.i.2352.i = add nsw i32 %mul7.us.i.2351.i, %add9.us.i.1343.i + %inc.us.i.2353.i = or i32 %j.026.us.i.i, 3 + %add.us.i.1.2.i = add i32 %inc.us.i.1.1.i, %mul.us.i118.1.i + %arrayidx4.us.i.1.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.2.i + %B15 = load i16, i16* %arrayidx4.us.i.1.2.i, align 2 + %conv.us.i.1.2.i = sext i16 %B15 to i32 + 
%arrayidx5.us.i.1.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.1.i + %B14.dup = load i16, i16* %arrayidx5.us.i.1.2.i, align 2 + %conv6.us.i.1.2.i = sext i16 %B14.dup to i32 + %mul7.us.i.1.2.i = mul nsw i32 %conv6.us.i.1.2.i, %conv.us.i.1.2.i + %add9.us.i.1.2.i = add nsw i32 %mul7.us.i.1.2.i, %add9.us.i.1.1.i + %inc.us.i.1.2.i = or i32 %j.026.us.i.i, 3 + %add.us.i.2.2.i = add i32 %inc.us.i.2.1.i, %mul.us.i118.2.i + %arrayidx4.us.i.2.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.2.i + %B17 = load i16, i16* %arrayidx4.us.i.2.2.i, align 2 + %conv.us.i.2.2.i = sext i16 %B17 to i32 + %arrayidx5.us.i.2.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.1.i + %B14.dup.i = load i16, i16* %arrayidx5.us.i.2.2.i, align 2 + %conv6.us.i.2.2.i = sext i16 %B14.dup.i to i32 + %mul7.us.i.2.2.i = mul nsw i32 %conv6.us.i.2.2.i, %conv.us.i.2.2.i + %add9.us.i.2.2.i = add nsw i32 %mul7.us.i.2.2.i, %add9.us.i.2.1.i + %inc.us.i.2.2.i = or i32 %j.026.us.i.i, 3 + %add.us.i.3.2.i = add i32 %inc.us.i.3.1.i, %mul.us.i118.3.i + %arrayidx4.us.i.3.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.2.i + %B19 = load i16, i16* %arrayidx4.us.i.3.2.i, align 2 + %conv.us.i.3.2.i = sext i16 %B19 to i32 + %arrayidx5.us.i.3.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.1.i + %B14.dup.i.i = load i16, i16* %arrayidx5.us.i.3.2.i, align 2 + %conv6.us.i.3.2.i = sext i16 %B14.dup.i.i to i32 + %mul7.us.i.3.2.i = mul nsw i32 %conv6.us.i.3.2.i, %conv.us.i.3.2.i + %add9.us.i.3.2.i = add nsw i32 %mul7.us.i.3.2.i, %add9.us.i.3.1.i + %inc.us.i.3.2.i = or i32 %j.026.us.i.i, 3 + %add.us.i.3355.i = add i32 %inc.us.i.2353.i, %mul.us.i118.i + %arrayidx4.us.i.3356.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3355.i + %B21 = load i16, i16* %arrayidx4.us.i.3356.i, align 2 + %conv.us.i.3357.i = sext i16 %B21 to i32 + %arrayidx5.us.i.3358.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2353.i + %B22 = load i16, i16* %arrayidx5.us.i.3358.i, align 2 + %conv6.us.i.3359.i = sext i16 %B22 to i32 + %mul7.us.i.3360.i = mul nsw i32 %conv6.us.i.3359.i, %conv.us.i.3357.i + %add9.us.i.3361.i = add nsw i32 %mul7.us.i.3360.i, %add9.us.i.2352.i + %inc.us.i.3362.i = add i32 %j.026.us.i.i, 4 + %add.us.i.1.3.i = add i32 %inc.us.i.1.2.i, %mul.us.i118.1.i + %arrayidx4.us.i.1.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.3.i + %B23 = load i16, i16* %arrayidx4.us.i.1.3.i, align 2 + %conv.us.i.1.3.i = sext i16 %B23 to i32 + %arrayidx5.us.i.1.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.2.i + %B22.dup = load i16, i16* %arrayidx5.us.i.1.3.i, align 2 + %conv6.us.i.1.3.i = sext i16 %B22.dup to i32 + %mul7.us.i.1.3.i = mul nsw i32 %conv6.us.i.1.3.i, %conv.us.i.1.3.i + %add9.us.i.1.3.i = add nsw i32 %mul7.us.i.1.3.i, %add9.us.i.1.2.i + %add.us.i.2.3.i = add i32 %inc.us.i.2.2.i, %mul.us.i118.2.i + %arrayidx4.us.i.2.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.3.i + %B25 = load i16, i16* %arrayidx4.us.i.2.3.i, align 2 + %conv.us.i.2.3.i = sext i16 %B25 to i32 + %arrayidx5.us.i.2.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.2.i + %B22.dup.i = load i16, i16* %arrayidx5.us.i.2.3.i, align 2 + %conv6.us.i.2.3.i = sext i16 %B22.dup.i to i32 + %mul7.us.i.2.3.i = mul nsw i32 %conv6.us.i.2.3.i, %conv.us.i.2.3.i + %add9.us.i.2.3.i = add nsw i32 %mul7.us.i.2.3.i, %add9.us.i.2.2.i + %add.us.i.3.3.i = add i32 %inc.us.i.3.2.i, %mul.us.i118.3.i + %arrayidx4.us.i.3.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.3.i + %B27 = load i16, i16* 
%arrayidx4.us.i.3.3.i, align 2 + %conv.us.i.3.3.i = sext i16 %B27 to i32 + %arrayidx5.us.i.3.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.2.i + %B22.dup.i.i = load i16, i16* %arrayidx5.us.i.3.3.i, align 2 + %conv6.us.i.3.3.i = sext i16 %B22.dup.i.i to i32 + %mul7.us.i.3.3.i = mul nsw i32 %conv6.us.i.3.3.i, %conv.us.i.3.3.i + %add9.us.i.3.3.i = add nsw i32 %mul7.us.i.3.3.i, %add9.us.i.3.2.i + %niter335.nsub.3.i = add i32 %niter335.i, -4 + %niter335.ncmp.3.i = icmp eq i32 %niter335.nsub.3.i, 0 + br i1 %niter335.ncmp.3.i, label %exit, label %for.body + +exit: + %arrayidx.out.i = getelementptr inbounds i32, i32* %res, i32 0 + store i32 %add9.us.i.3361.i, i32* %arrayidx.out.i, align 4 + %arrayidx.out.1.i = getelementptr inbounds i32, i32* %res, i32 1 + store i32 %add9.us.i.1.3.i, i32* %arrayidx.out.1.i, align 4 + %arrayidx.out.2.i = getelementptr inbounds i32, i32* %res, i32 2 + store i32 %add9.us.i.2.3.i, i32* %arrayidx.out.2.i, align 4 + %arrayidx.out.3.i = getelementptr inbounds i32, i32* %res, i32 3 + store i32 %add9.us.i.3.3.i, i32* %arrayidx.out.3.i, align 4 + ret void +} -- 2.7.4
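
The commit message above describes the multi-use fix-up only in prose. The following is a minimal, hedged LLVM IR sketch of that rewrite: the function name, value names and the surrounding arithmetic are invented for illustration and are not taken from the patch; only the wide-load / trunc / lshr+trunc / smlad shape mirrors what the pass produces.

; Hedged sketch: names and the extra arithmetic are assumptions, not from the patch.
define i32 @widen_sketch(i16* %a, i16* %b, i32 %acc) {
entry:
  ; Two consecutive i16 loads from %a (and likewise from %b) are replaced by one i32 load each,
  ; keeping the original 2-byte alignment.
  %a.wide.ptr = bitcast i16* %a to i32*
  %a.wide = load i32, i32* %a.wide.ptr, align 2
  %b.wide.ptr = bitcast i16* %b to i32*
  %b.wide = load i32, i32* %b.wide.ptr, align 2
  ; Users of the first (bottom) narrow load are repointed at a plain trunc of the wide value.
  %a.lo = trunc i32 %a.wide to i16
  %a.lo.sext = sext i16 %a.lo to i32
  ; Users of the second (top) narrow load get an lshr by the narrow bit width, then a trunc.
  %a.top = lshr i32 %a.wide, 16
  %a.hi = trunc i32 %a.top to i16
  %a.hi.sext = sext i16 %a.hi to i32
  ; Extra, non-mul uses keep working on the recreated narrow values...
  %extra = add i32 %a.lo.sext, %a.hi.sext
  ; ...while the widened loads feed the parallel MAC intrinsic directly.
  %mac = call i32 @llvm.arm.smlad(i32 %a.wide, i32 %b.wide, i32 %acc)
  %sum = add i32 %mac, %extra
  ret i32 %sum
}

declare i32 @llvm.arm.smlad(i32, i32, i32)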