From bc6c8c4bbbee24fbcd288479651f2ffd9528342a Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 11 Mar 2020 17:01:47 +0000
Subject: [PATCH] [Matrix] Add remark propagation along the inlined-at chain.

This patch adds support for propagating matrix expressions along the
inlined-at chain and emitting remarks at the traversed function scopes.

To motivate this new behavior, consider the example below. Without the
remark 'up-leveling', we would only get remarks in load.h and store.h,
but we cannot generate a remark describing the full expression in
toplevel.cpp, which is the place where the user has the best chance of
spotting/fixing potential problems.

With this patch, we generate a remark for the load in load.h, one for
the store in store.h and one for the complete expression in
toplevel.cpp. For a bigger example, please see remarks-inlining.ll.

load.h:
template <typename Ty, unsigned R, unsigned C>
Matrix<Ty, R, C> load(Ty *Ptr) {
  Matrix<Ty, R, C> Result;
  Result.value = *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
  return Result;
}

store.h:
template <typename Ty, unsigned R, unsigned C>
void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
  *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr) = M1.value;
}

toplevel.cpp
void test(double *A, double *B, double *C) {
  store(add(load(A), load(B)), C);
}

For a given function, we traverse the inlined-at chain for each matrix
instruction (= instructions with shape information). We collect the
matrix instructions in each DISubprogram we visit. This produces a
mapping of DISubprogram -> (list of matrix instructions visible in the
subprogram). We then generate remarks using the list of instructions
for each subprogram in the inlined-at chain.

Note that the list of instructions for a subprogram includes the
instructions from its own subprograms recursively. For example, using
the example above, for the subprogram 'test' this includes the inlined
functions 'load' and 'store'. This allows surfacing the remarks at a
level useful to users.

Please note that the current approach may create a lot of extra
remarks. Additional heuristics to cut off the traversal can be
implemented in the future. For example, it might make sense to stop
'up-leveling' once all matrix instructions are at the same debug
location.

Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D73600
---
 .../Transforms/Scalar/LowerMatrixIntrinsics.cpp | 193 +++++++++++++--------
 .../LowerMatrixIntrinsics/remarks-inlining.ll   | 166 ++++++++++++++++++
 .../Transforms/LowerMatrixIntrinsics/remarks.ll |  14 +-
 3 files changed, 296 insertions(+), 77 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 87b3ef8..5efd3ff 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -10,8 +10,6 @@
 //
 // TODO:
 //  * Implement multiply & add fusion
-//  * Add remark, summarizing the available matrix optimization opportunities
-//    (WIP).
 //
 //===----------------------------------------------------------------------===//
 
@@ -25,6 +23,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -50,6 +49,14 @@ static cl::opt<bool> AllowContractEnabled(
     cl::desc("Allow the use of FMAs if available and profitable. This may "
              "result in different results, due to less rounding error."));
 
+/// Helper function to either return Scope, if it is a subprogram or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+  if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+    return Subprogram;
+  return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
 namespace {
 
 // Given an element poitner \p BasePtr to the start of a (sub) matrix, compute
@@ -574,7 +581,7 @@ public:
       }
     }
 
-    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
+    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
     RemarkGen.emitRemarks();
 
     for (Instruction *Inst : reverse(ToRemove))
@@ -950,6 +957,9 @@ public:
     /// part of.
     const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
 
+    /// Set of matrix expressions in the scope of a given DISubprogram.
+    const SmallSetVector<Value *, 32> &ExprsInSubprogram;
+
     /// Leaf node of the expression to linearize.
     Value *Leaf;
 
@@ -960,9 +970,10 @@ public:
     ExprLinearizer(const DataLayout &DL,
                    const MapVector &Inst2ColumnMatrix,
                    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+                   const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                    Value *Leaf)
         : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
-          Shared(Shared), Leaf(Leaf) {}
+          Shared(Shared), ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
 
     void indent(unsigned N) {
       LineLength += N;
@@ -996,10 +1007,8 @@ public:
       return V;
     }
 
-    /// Returns true if \p V is a matrix value.
-    bool isMatrix(Value *V) const {
-      return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
-    }
+    /// Returns true if \p V is a matrix value in the given subprogram.
+    bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
 
     /// If \p V is a matrix value, print its shape as as NumRows x NumColumns to
     /// \p SS.
@@ -1191,60 +1200,69 @@ public:
   /// Generate remarks for matrix operations in a function. To generate remarks
   /// for matrix expressions, the following approach is used:
-  /// 1. Collect leafs of matrix expressions (done in
-  ///    RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
-  ///    instructions without other matrix users (like stores).
-  ///
-  /// 2. For each leaf, create a remark containing a linearizied version of the
-  ///    matrix expression.
-  ///
-  /// TODO:
-  ///  * Summarize number of vector instructions generated for each expression.
-  ///  * Propagate matrix remarks up the inlining chain.
+  /// 1. Use the inlined-at debug information to group matrix operations to the
+  ///    DISubprograms they are contained in.
+  /// 2. Collect leaves of matrix expressions (done in
+  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression
+  ///    mapping. Leaves are lowered matrix instructions without other matrix
+  ///    users (like stores) in the current subprogram.
+  /// 3. For each leaf, create a remark containing a linearized version of the
+  ///    matrix expression. The expression is linearized by a recursive
+  ///    bottom-up traversal of the matrix operands, starting at a leaf. Note
+  ///    that multiple leaves can share sub-expressions. Shared subexpressions
+  ///    are explicitly marked as shared().
   struct RemarkGenerator {
     const MapVector &Inst2ColumnMatrix;
     OptimizationRemarkEmitter &ORE;
+    Function &Func;
    const DataLayout &DL;
 
    RemarkGenerator(const MapVector &Inst2ColumnMatrix,
-                   OptimizationRemarkEmitter &ORE, const DataLayout &DL)
-       : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
-
-   /// Return all leafs of matrix expressions. Those are instructions in
-   /// Inst2ColumnMatrix returing void. Currently that should only include
-   /// stores.
- SmallVector getExpressionLeaves() { + OptimizationRemarkEmitter &ORE, Function &Func) + : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func), + DL(Func.getParent()->getDataLayout()) {} + + /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are + /// instructions in Inst2ColumnMatrix returning void or without any users in + /// \p ExprsInSubprogram. Currently that should only include stores. + SmallVector + getExpressionLeaves(const SmallSetVector &ExprsInSubprogram) { SmallVector Leaves; - for (auto &KV : Inst2ColumnMatrix) - if (KV.first->getType()->isVoidTy()) - Leaves.push_back(KV.first); - + for (auto *Expr : ExprsInSubprogram) + if (Expr->getType()->isVoidTy() || + !any_of(Expr->users(), [&ExprsInSubprogram](User *U) { + return ExprsInSubprogram.count(U); + })) + Leaves.push_back(Expr); return Leaves; } /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf - /// to all visited expressions in \p Shared. + /// to all visited expressions in \p Shared. Limit the matrix operations to + /// the ones in \p ExprsInSubprogram. void collectSharedInfo(Value *Leaf, Value *V, + const SmallSetVector &ExprsInSubprogram, DenseMap> &Shared) { - if (Inst2ColumnMatrix.find(V) == Inst2ColumnMatrix.end()) + if (!ExprsInSubprogram.count(V)) return; auto I = Shared.insert({V, {}}); I.first->second.insert(Leaf); for (Value *Op : cast(V)->operand_values()) - collectSharedInfo(Leaf, Op, Shared); + collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared); return; } /// Calculate the number of exclusive and shared op counts for expression /// starting at \p V. Expressions used multiple times are counted once. + /// Limit the matrix operations to the ones in \p ExprsInSubprogram. std::pair sumOpInfos(Value *Root, SmallPtrSetImpl &ReusedExprs, - DenseMap> &Shared) { - auto CM = Inst2ColumnMatrix.find(Root); - if (CM == Inst2ColumnMatrix.end()) + const SmallSetVector &ExprsInSubprogram, + DenseMap> &Shared) const { + if (!ExprsInSubprogram.count(Root)) return {}; // Already counted this expression. Stop. @@ -1255,13 +1273,14 @@ public: OpInfoTy Count; auto I = Shared.find(Root); + auto CM = Inst2ColumnMatrix.find(Root); if (I->second.size() == 1) Count = CM->second.getOpInfo(); else SharedCount = CM->second.getOpInfo(); for (Value *Op : cast(Root)->operand_values()) { - auto C = sumOpInfos(Op, ReusedExprs, Shared); + auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared); Count += C.first; SharedCount += C.second; } @@ -1272,49 +1291,83 @@ public: if (!ORE.allowExtraAnalysis(DEBUG_TYPE)) return; - // Find leafs of matrix expressions. - auto Leaves = getExpressionLeaves(); - - DenseMap> Shared; - - for (Value *Leaf : Leaves) - collectSharedInfo(Leaf, Leaf, Shared); - - // Generate remarks for each leaf. 
- for (auto *L : Leaves) { - SmallPtrSet ReusedExprs; - OpInfoTy Counts, SharedCounts; - std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared); - - OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", - cast(L)->getDebugLoc(), - cast(L)->getParent()); - - Rem << "Lowered with "; - Rem << ore::NV("NumStores", Counts.NumStores) << " stores, " - << ore::NV("NumLoads", Counts.NumLoads) << " loads, " - << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops"; - - if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 || - SharedCounts.NumComputeOps > 0) { - Rem << ",\nadditionally " - << ore::NV("NumStores", SharedCounts.NumStores) << " stores, " - << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, " - << ore::NV("NumFPOps", SharedCounts.NumComputeOps) - << " compute ops" - << " are shared with other expressions"; + // Map matrix operations to their containting subprograms, by traversing + // the inlinedAt chain. If the function does not have a DISubprogram, we + // only map them to the containing function. + MapVector> Subprog2Exprs; + for (auto &KV : Inst2ColumnMatrix) { + if (Func.getSubprogram()) { + auto *I = cast(KV.first); + DILocation *Context = I->getDebugLoc(); + while (Context) { + auto I = + Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}}); + I.first->second.push_back(KV.first); + Context = DebugLoc(Context).getInlinedAt(); + } + } else { + auto I = Subprog2Exprs.insert({nullptr, {}}); + I.first->second.push_back(KV.first); } + } + for (auto &KV : Subprog2Exprs) { + SmallSetVector ExprsInSubprogram(KV.second.begin(), + KV.second.end()); + auto Leaves = getExpressionLeaves(ExprsInSubprogram); + + DenseMap> Shared; + for (Value *Leaf : Leaves) + collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared); + + // Generate remarks for each leaf. 
+ for (auto *L : Leaves) { + + DebugLoc Loc = cast(L)->getDebugLoc(); + DILocation *Context = cast(L)->getDebugLoc(); + while (Context) { + if (getSubprogram(Context->getScope()) == KV.first) { + Loc = Context; + break; + } + Context = DebugLoc(Context).getInlinedAt(); + } + + SmallPtrSet ReusedExprs; + OpInfoTy Counts, SharedCounts; + std::tie(Counts, SharedCounts) = + sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared); + + OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc, + cast(L)->getParent()); + + Rem << "Lowered with "; + Rem << ore::NV("NumStores", Counts.NumStores) << " stores, " + << ore::NV("NumLoads", Counts.NumLoads) << " loads, " + << ore::NV("NumComputeOps", Counts.NumComputeOps) + << " compute ops"; + + if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 || + SharedCounts.NumComputeOps > 0) { + Rem << ",\nadditionally " + << ore::NV("NumStores", SharedCounts.NumStores) << " stores, " + << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, " + << ore::NV("NumFPOps", SharedCounts.NumComputeOps) + << " compute ops" + << " are shared with other expressions"; + } - Rem << ("\n" + linearize(L, Shared, DL)); - ORE.emit(Rem); + Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL)); + ORE.emit(Rem); + } } } std::string linearize(Value *L, const DenseMap> &Shared, + const SmallSetVector &ExprsInSubprogram, const DataLayout &DL) { - ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L); + ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, ExprsInSubprogram, L); Lin.linearizeExpr(L, 0, false, false); return Lin.getResult(); } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll new file mode 100644 index 0000000..f0a5212 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -0,0 +1,166 @@ +; REQUIRES: aarch64-registered-target + +; This test needs to be target specific due to the cost estimate in the output. + +; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s + +; Test the propagation of matrix expressions along to inlined-at chain. The IR +; in the test roughly corresponds to the C++ code below, with the IR containing +; references to a few more functions. 
+ +; matrix.h +; template +; struct Matrix { +; using matrix_t = Ty __attribute__((matrix_type(R, C))); +; +; matrix_t value; +; }; +; +; ; add.h +; template +; Matrix add(Matrix M1, Matrix M2) { +; Matrix Result; +; Result.value = __builtin_matrix_add(M1.value, M2.value); +; return Result; +; } +; +; load.h: +; template +; Matrix load(Ty *Ptr) { +; Matrix Result; +; Result.value = *reinterpret_cast ::matrix_t *>(Ptr); +; return Result; +; } +; +; store.h: +; template +; void store(Matrix M1, Ty *Ptr) { +; *reinterpret_cast(Ptr) = M1.value; +; } +; +; toplevel.cpp +; void test(double *A, double *B, double *C) { +; store(add(load(A), load(B)), C); +; } +; + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-apple-ios" + +; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: load(addr %A) + +; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) + +; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops +; CHECK-NEXT: load(addr %D) + +; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: load(addr %A) + +; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) + +; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: fadd( +; CHECK-NEXT: load(addr %A), +; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)), +; CHECK-NEXT: addr %C) + +; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))), +; CHECK-NEXT: addr %D) + +; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops +; CHECK-NEXT: fadd( +; CHECK-NEXT: addr %A, +; CHECK-NEXT: scalar) + +; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: scalar, +; CHECK-NEXT: addr %C) + +; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: scalar, +; CHECK-NEXT: addr %D) + +; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops +; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D)) + +define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 { +entry: + %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791 + %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793 + %c = fadd <15 x double> %a, %b, !dbg !100 + store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102 + + %load = load <2 x float>, <2 x float>* %D, !dbg !104 + %t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106 + %t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106 + store <2 x float> %t2, <2 x float>* %D, !dbg !108 + ret void +} + +declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32) +declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = 
!DIFile(filename: "load.h", directory: "/test") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!17 = !DIFile(filename: "toplevel.c", directory: "/test") +!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!18 = !DIFile(filename: "assign.h", directory: "/test") +!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + +!20 = !DIFile(filename: "add.h", directory: "/test") +!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + +!22 = !DIFile(filename: "store.h", directory: "/test") +!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + +!24 = !DIFile(filename: "transpose.h", directory: "/test") +!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + + +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !11} +!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32) +!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !{!13} +!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8) +!14 = !DILocation(line: 1, column: 27, scope: !5) + +!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795) +!3792 = !DILocation(line: 405, column: 3, scope: !16) +!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796) +!3794 = !DILocation(line: 406, column: 11, scope: !16) +!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792) +!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794) + +!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101) +!101 = !DILocation(line: 410, column: 11, scope: !16) + +!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103) +!103 = !DILocation(line: 410, column: 0, scope: !16) + +!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101) +!105 = !DILocation(line: 500, column: 11, scope: !16) + +!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101) +!107 = !DILocation(line: 510, column: 11, scope: !16) + +!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109) +!109 = !DILocation(line: 510, column: 0, scope: !16) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll index 0bca099..c83648e 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -71,8 +71,8 
@@ declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32 define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 { %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32 - %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix - %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix + %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32 + %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32 call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32 ret void } @@ -95,8 +95,8 @@ define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 { define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 { %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34 - %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix - %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix + %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34 + %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34 call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34 %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34 @@ -119,8 +119,8 @@ define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x doub define void @stackaddresses(<9 x double>* %A) !dbg !35 { %B = alloca <9 x double> %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36 - %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix - %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix + %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36 + %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36 call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36 ret void } @@ -140,7 +140,7 @@ entry: %s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22 %s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22 - %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3) + %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3), !dbg !22 store <15 x double> %t, <15 x double>* %s3, !dbg !22 ret void -- 2.7.4
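
The core of the change is the grouping of lowered matrix instructions by the
DISubprogram they are (transitively) visible in, via the inlined-at chain.
The following sketch is not part of the patch; it is a minimal, self-contained
C++ illustration of that step, assuming the matrix instructions have already
been collected. The names getSubprogramOf and groupByInlinedAtChain are
illustrative, and the per-instruction handling of a missing debug location is
a simplification (the patch checks Func.getSubprogram() once per function and
falls back to a single nullptr bucket).

// Illustrative sketch (not from the patch): group matrix instructions by the
// DISubprogram they appear in, walking each instruction's inlined-at chain.
// An instruction inlined through several functions is added to the bucket of
// every subprogram on that chain.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Same idea as the patch's getSubprogram helper: map a local scope (e.g. a
// lexical block) to its enclosing subprogram.
static DISubprogram *getSubprogramOf(DIScope *Scope) {
  if (auto *SP = dyn_cast<DISubprogram>(Scope))
    return SP;
  return cast<DILocalScope>(Scope)->getSubprogram();
}

static MapVector<DISubprogram *, SmallVector<Instruction *, 8>>
groupByInlinedAtChain(ArrayRef<Instruction *> MatrixInsts) {
  MapVector<DISubprogram *, SmallVector<Instruction *, 8>> Subprog2Insts;
  for (Instruction *I : MatrixInsts) {
    DILocation *Loc = I->getDebugLoc();
    if (!Loc) {
      // Simplification: instructions without debug info share one bucket.
      Subprog2Insts[nullptr].push_back(I);
      continue;
    }
    // Walk from the scope the instruction was written in out to the function
    // it was ultimately inlined into, recording it at every level.
    for (DILocation *Ctx = Loc; Ctx; Ctx = Ctx->getInlinedAt())
      Subprog2Insts[getSubprogramOf(Ctx->getScope())].push_back(I);
  }
  return Subprog2Insts;
}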
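
Once instructions are grouped, the remark for a leaf is emitted at the debug
location that lies in the subprogram currently being reported, which is how
the same store can be reported both at store.h:10 and at the call site in
toplevel.c. Below is a short sketch of that location selection, reusing
getSubprogramOf from the sketch above; locationInSubprogram is an illustrative
name, and in the patch this is an inline loop in RemarkGenerator::emitRemarks.

// Illustrative sketch (not from the patch): find the location on I's
// inlined-at chain that belongs to subprogram Target, falling back to the
// instruction's own location if the chain never reaches Target.
static DebugLoc locationInSubprogram(Instruction *I, DISubprogram *Target) {
  for (DILocation *Ctx = I->getDebugLoc(); Ctx; Ctx = Ctx->getInlinedAt())
    if (getSubprogramOf(Ctx->getScope()) == Target)
      return Ctx;
  return I->getDebugLoc();
}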