From bc6c8c4bbbee24fbcd288479651f2ffd9528342a Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 11 Mar 2020 17:01:47 +0000
Subject: [PATCH] [Matrix] Add remark propagation along the inlined-at chain.

This patch adds support for propagating matrix expressions along the
inlined-at chain and emitting remarks at the traversed function scopes.

To motivate this new behavior, consider the example below. Without the
remark 'up-leveling', we would only get remarks in load.h and store.h,
but we cannot generate a remark describing the full expression in
toplevel.cpp, which is the place where the user has the best chance of
spotting/fixing potential problems.

With this patch, we generate a remark for the load in load.h, one for
the store in store.h and one for the complete expression in
toplevel.cpp. For a bigger example, please see remarks-inlining.ll.

load.h:
template <typename Ty, unsigned R, unsigned C>
Matrix<Ty, R, C> load(Ty *Ptr) {
  Matrix<Ty, R, C> Result;
  Result.value = *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
  return Result;
}

store.h:
template <typename Ty, unsigned R, unsigned C>
void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
  *reinterpret_cast<typename Matrix<Ty, R, C>::matrix_t *>(Ptr) = M1.value;
}

toplevel.cpp
void test(double *A, double *B, double *C) {
  store(add(load(A), load(B)), C);
}

For a given function, we traverse the inlined-at chain for each matrix
instruction (= instructions with shape information). We collect the
matrix instructions in each DISubprogram we visit. This produces a
mapping of DISubprogram -> (list of matrix instructions visible in the
subprogram). We then generate remarks using the list of instructions
for each subprogram in the inlined-at chain.

Note that the list of instructions for a subprogram includes the
instructions from its own subprograms recursively. For example, using
the example above, for the subprogram 'test' this includes the inlined
functions 'load' and 'store'. This allows surfacing the remarks at a
level useful to users.

Please note that the current approach may create a lot of extra
remarks. Additional heuristics to cut off the traversal can be
implemented in the future. For example, it might make sense to stop
'up-leveling' once all matrix instructions are at the same debug
location.

Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D73600
---
 .../Transforms/Scalar/LowerMatrixIntrinsics.cpp | 193 +++++++++++++--------
 .../LowerMatrixIntrinsics/remarks-inlining.ll   | 166 ++++++++++++++++++
 .../Transforms/LowerMatrixIntrinsics/remarks.ll |  14 +-
 3 files changed, 296 insertions(+), 77 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 87b3ef8..5efd3ff 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -10,8 +10,6 @@
 //
 // TODO:
 //  * Implement multiply & add fusion
-//  * Add remark, summarizing the available matrix optimization opportunities
-//    (WIP).
 //
 //===----------------------------------------------------------------------===//
 
@@ -25,6 +23,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -50,6 +49,14 @@ static cl::opt<bool> AllowContractEnabled(
     cl::desc("Allow the use of FMAs if available and profitable. This may "
              "result in different results, due to less rounding error."));
 
+/// Helper function to either return Scope, if it is a subprogram or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+  if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+    return Subprogram;
+  return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
 namespace {
 
 // Given an element poitner \p BasePtr to the start of a (sub) matrix, compute
@@ -574,7 +581,7 @@ public:
       }
     }
 
-    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
+    RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
     RemarkGen.emitRemarks();
 
     for (Instruction *Inst : reverse(ToRemove))
@@ -950,6 +957,9 @@ public:
     /// part of.
     const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
 
+    /// Set of matrix expressions in the scope of a given DISubprogram.
+    const SmallSetVector<Value *, 32> &ExprsInSubprogram;
+
     /// Leaf node of the expression to linearize.
     Value *Leaf;
 
@@ -960,9 +970,10 @@ public:
     ExprLinearizer(const DataLayout &DL,
                    const MapVector &Inst2ColumnMatrix,
                    const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+                   const SmallSetVector<Value *, 32> &ExprsInSubprogram,
                    Value *Leaf)
         : Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
-          Shared(Shared), Leaf(Leaf) {}
+          Shared(Shared), ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
 
     void indent(unsigned N) {
       LineLength += N;
@@ -996,10 +1007,8 @@ public:
       return V;
     }
 
-    /// Returns true if \p V is a matrix value.
-    bool isMatrix(Value *V) const {
-      return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
-    }
+    /// Returns true if \p V is a matrix value in the given subprogram.
+    bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
 
     /// If \p V is a matrix value, print its shape as as NumRows x NumColumns to
     /// \p SS.
@@ -1191,60 +1200,69 @@ public:
   /// Generate remarks for matrix operations in a function. To generate remarks
   /// for matrix expressions, the following approach is used:
-  /// 1. Collect leafs of matrix expressions (done in
-  ///    RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
-  ///    instructions without other matrix users (like stores).
-  ///
-  /// 2. For each leaf, create a remark containing a linearizied version of the
-  ///    matrix expression.
-  ///
-  /// TODO:
-  ///  * Summarize number of vector instructions generated for each expression.
-  ///  * Propagate matrix remarks up the inlining chain.
+  /// 1. Use the inlined-at debug information to group matrix operations to the
+  ///    DISubprograms they are contained in.
+  /// 2. Collect leaves of matrix expressions (done in
+  ///    RemarkGenerator::getExpressionLeaves) for each subprogram - expression
+  ///    mapping. Leaves are lowered matrix instructions without other matrix
+  ///    users (like stores) in the current subprogram.
+  /// 3. For each leaf, create a remark containing a linearized version of the
+  ///    matrix expression. The expression is linearized by a recursive
+  ///    bottom-up traversal of the matrix operands, starting at a leaf. Note
+  ///    that multiple leaves can share sub-expressions. Shared subexpressions
+  ///    are explicitly marked as shared().
   struct RemarkGenerator {
     const MapVector &Inst2ColumnMatrix;
     OptimizationRemarkEmitter &ORE;
+    Function &Func;
    const DataLayout &DL;
 
    RemarkGenerator(const MapVector &Inst2ColumnMatrix,
-                   OptimizationRemarkEmitter &ORE, const DataLayout &DL)
-       : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
-
-   /// Return all leafs of matrix expressions. Those are instructions in
-   /// Inst2ColumnMatrix returing void. Currently that should only include
-   /// stores.
- SmallVector getExpressionLeaves() { + OptimizationRemarkEmitter &ORE, Function &Func) + : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func), + DL(Func.getParent()->getDataLayout()) {} + + /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are + /// instructions in Inst2ColumnMatrix returning void or without any users in + /// \p ExprsInSubprogram. Currently that should only include stores. + SmallVector + getExpressionLeaves(const SmallSetVector &ExprsInSubprogram) { SmallVector Leaves; - for (auto &KV : Inst2ColumnMatrix) - if (KV.first->getType()->isVoidTy()) - Leaves.push_back(KV.first); - + for (auto *Expr : ExprsInSubprogram) + if (Expr->getType()->isVoidTy() || + !any_of(Expr->users(), [&ExprsInSubprogram](User *U) { + return ExprsInSubprogram.count(U); + })) + Leaves.push_back(Expr); return Leaves; } /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf - /// to all visited expressions in \p Shared. + /// to all visited expressions in \p Shared. Limit the matrix operations to + /// the ones in \p ExprsInSubprogram. void collectSharedInfo(Value *Leaf, Value *V, + const SmallSetVector &ExprsInSubprogram, DenseMap> &Shared) { - if (Inst2ColumnMatrix.find(V) == Inst2ColumnMatrix.end()) + if (!ExprsInSubprogram.count(V)) return; auto I = Shared.insert({V, {}}); I.first->second.insert(Leaf); for (Value *Op : cast(V)->operand_values()) - collectSharedInfo(Leaf, Op, Shared); + collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared); return; } /// Calculate the number of exclusive and shared op counts for expression /// starting at \p V. Expressions used multiple times are counted once. + /// Limit the matrix operations to the ones in \p ExprsInSubprogram. std::pair sumOpInfos(Value *Root, SmallPtrSetImpl &ReusedExprs, - DenseMap> &Shared) { - auto CM = Inst2ColumnMatrix.find(Root); - if (CM == Inst2ColumnMatrix.end()) + const SmallSetVector &ExprsInSubprogram, + DenseMap> &Shared) const { + if (!ExprsInSubprogram.count(Root)) return {}; // Already counted this expression. Stop. @@ -1255,13 +1273,14 @@ public: OpInfoTy Count; auto I = Shared.find(Root); + auto CM = Inst2ColumnMatrix.find(Root); if (I->second.size() == 1) Count = CM->second.getOpInfo(); else SharedCount = CM->second.getOpInfo(); for (Value *Op : cast(Root)->operand_values()) { - auto C = sumOpInfos(Op, ReusedExprs, Shared); + auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared); Count += C.first; SharedCount += C.second; } @@ -1272,49 +1291,83 @@ public: if (!ORE.allowExtraAnalysis(DEBUG_TYPE)) return; - // Find leafs of matrix expressions. - auto Leaves = getExpressionLeaves(); - - DenseMap> Shared; - - for (Value *Leaf : Leaves) - collectSharedInfo(Leaf, Leaf, Shared); - - // Generate remarks for each leaf. 
- for (auto *L : Leaves) { - SmallPtrSet ReusedExprs; - OpInfoTy Counts, SharedCounts; - std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared); - - OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", - cast(L)->getDebugLoc(), - cast(L)->getParent()); - - Rem << "Lowered with "; - Rem << ore::NV("NumStores", Counts.NumStores) << " stores, " - << ore::NV("NumLoads", Counts.NumLoads) << " loads, " - << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops"; - - if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 || - SharedCounts.NumComputeOps > 0) { - Rem << ",\nadditionally " - << ore::NV("NumStores", SharedCounts.NumStores) << " stores, " - << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, " - << ore::NV("NumFPOps", SharedCounts.NumComputeOps) - << " compute ops" - << " are shared with other expressions"; + // Map matrix operations to their containting subprograms, by traversing + // the inlinedAt chain. If the function does not have a DISubprogram, we + // only map them to the containing function. + MapVector> Subprog2Exprs; + for (auto &KV : Inst2ColumnMatrix) { + if (Func.getSubprogram()) { + auto *I = cast(KV.first); + DILocation *Context = I->getDebugLoc(); + while (Context) { + auto I = + Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}}); + I.first->second.push_back(KV.first); + Context = DebugLoc(Context).getInlinedAt(); + } + } else { + auto I = Subprog2Exprs.insert({nullptr, {}}); + I.first->second.push_back(KV.first); } + } + for (auto &KV : Subprog2Exprs) { + SmallSetVector ExprsInSubprogram(KV.second.begin(), + KV.second.end()); + auto Leaves = getExpressionLeaves(ExprsInSubprogram); + + DenseMap> Shared; + for (Value *Leaf : Leaves) + collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared); + + // Generate remarks for each leaf. 
+ for (auto *L : Leaves) { + + DebugLoc Loc = cast(L)->getDebugLoc(); + DILocation *Context = cast(L)->getDebugLoc(); + while (Context) { + if (getSubprogram(Context->getScope()) == KV.first) { + Loc = Context; + break; + } + Context = DebugLoc(Context).getInlinedAt(); + } + + SmallPtrSet ReusedExprs; + OpInfoTy Counts, SharedCounts; + std::tie(Counts, SharedCounts) = + sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared); + + OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc, + cast(L)->getParent()); + + Rem << "Lowered with "; + Rem << ore::NV("NumStores", Counts.NumStores) << " stores, " + << ore::NV("NumLoads", Counts.NumLoads) << " loads, " + << ore::NV("NumComputeOps", Counts.NumComputeOps) + << " compute ops"; + + if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 || + SharedCounts.NumComputeOps > 0) { + Rem << ",\nadditionally " + << ore::NV("NumStores", SharedCounts.NumStores) << " stores, " + << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, " + << ore::NV("NumFPOps", SharedCounts.NumComputeOps) + << " compute ops" + << " are shared with other expressions"; + } - Rem << ("\n" + linearize(L, Shared, DL)); - ORE.emit(Rem); + Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL)); + ORE.emit(Rem); + } } } std::string linearize(Value *L, const DenseMap> &Shared, + const SmallSetVector &ExprsInSubprogram, const DataLayout &DL) { - ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L); + ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, ExprsInSubprogram, L); Lin.linearizeExpr(L, 0, false, false); return Lin.getResult(); } diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll new file mode 100644 index 0000000..f0a5212 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -0,0 +1,166 @@ +; REQUIRES: aarch64-registered-target + +; This test needs to be target specific due to the cost estimate in the output. + +; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s + +; Test the propagation of matrix expressions along to inlined-at chain. The IR +; in the test roughly corresponds to the C++ code below, with the IR containing +; references to a few more functions. 
+ +; matrix.h +; template +; struct Matrix { +; using matrix_t = Ty __attribute__((matrix_type(R, C))); +; +; matrix_t value; +; }; +; +; ; add.h +; template +; Matrix add(Matrix M1, Matrix M2) { +; Matrix Result; +; Result.value = __builtin_matrix_add(M1.value, M2.value); +; return Result; +; } +; +; load.h: +; template +; Matrix load(Ty *Ptr) { +; Matrix Result; +; Result.value = *reinterpret_cast ::matrix_t *>(Ptr); +; return Result; +; } +; +; store.h: +; template +; void store(Matrix M1, Ty *Ptr) { +; *reinterpret_cast(Ptr) = M1.value; +; } +; +; toplevel.cpp +; void test(double *A, double *B, double *C) { +; store(add(load(A), load(B)), C); +; } +; + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "aarch64-apple-ios" + +; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: load(addr %A) + +; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) + +; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops +; CHECK-NEXT: load(addr %D) + +; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: load(addr %A) + +; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops +; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5) + +; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: fadd( +; CHECK-NEXT: load(addr %A), +; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)), +; CHECK-NEXT: addr %C) + +; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))), +; CHECK-NEXT: addr %D) + +; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops +; CHECK-NEXT: fadd( +; CHECK-NEXT: addr %A, +; CHECK-NEXT: scalar) + +; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: scalar, +; CHECK-NEXT: addr %C) + +; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops +; CHECK-NEXT: store( +; CHECK-NEXT: scalar, +; CHECK-NEXT: addr %D) + +; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops +; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D)) + +define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 { +entry: + %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791 + %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793 + %c = fadd <15 x double> %a, %b, !dbg !100 + store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102 + + %load = load <2 x float>, <2 x float>* %D, !dbg !104 + %t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106 + %t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106 + store <2 x float> %t2, <2 x float>* %D, !dbg !108 + ret void +} + +declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32) +declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = 
!DIFile(filename: "load.h", directory: "/test") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!17 = !DIFile(filename: "toplevel.c", directory: "/test") +!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!18 = !DIFile(filename: "assign.h", directory: "/test") +!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + +!20 = !DIFile(filename: "add.h", directory: "/test") +!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + +!22 = !DIFile(filename: "store.h", directory: "/test") +!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + +!24 = !DIFile(filename: "transpose.h", directory: "/test") +!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) + + +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !11} +!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32) +!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) +!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!12 = !{!13} +!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8) +!14 = !DILocation(line: 1, column: 27, scope: !5) + +!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795) +!3792 = !DILocation(line: 405, column: 3, scope: !16) +!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796) +!3794 = !DILocation(line: 406, column: 11, scope: !16) +!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792) +!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794) + +!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101) +!101 = !DILocation(line: 410, column: 11, scope: !16) + +!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103) +!103 = !DILocation(line: 410, column: 0, scope: !16) + +!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101) +!105 = !DILocation(line: 500, column: 11, scope: !16) + +!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101) +!107 = !DILocation(line: 510, column: 11, scope: !16) + +!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109) +!109 = !DILocation(line: 510, column: 0, scope: !16) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll index 0bca099..c83648e 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -71,8 +71,8 
@@ declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32 define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 { %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32 - %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix - %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix + %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32 + %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32 call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32 ret void } @@ -95,8 +95,8 @@ define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 { define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 { %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34 - %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix - %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix + %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34 + %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34 call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34 %C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34 @@ -119,8 +119,8 @@ define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x doub define void @stackaddresses(<9 x double>* %A) !dbg !35 { %B = alloca <9 x double> %A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36 - %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix - %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix + %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36 + %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36 call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36 ret void } @@ -140,7 +140,7 @@ entry: %s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22 %s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22 - %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3) + %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3), !dbg !22 store <15 x double> %t, <15 x double>* %s3, !dbg !22 ret void -- 2.7.4
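
The core of the change is the grouping of lowered matrix instructions by the
DISubprogram they are (transitively) visible in, via the inlined-at chain.
The following sketch is not part of the patch; it is a minimal, self-contained
C++ illustration of that step, assuming the matrix instructions have already
been collected. The names getSubprogramOf and groupByInlinedAtChain are
illustrative, and the per-instruction handling of a missing debug location is
a simplification (the patch checks Func.getSubprogram() once per function and
falls back to a single nullptr bucket).

// Illustrative sketch (not from the patch): group matrix instructions by the
// DISubprogram they appear in, walking each instruction's inlined-at chain.
// An instruction inlined through several functions is added to the bucket of
// every subprogram on that chain.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Same idea as the patch's getSubprogram helper: map a local scope (e.g. a
// lexical block) to its enclosing subprogram.
static DISubprogram *getSubprogramOf(DIScope *Scope) {
  if (auto *SP = dyn_cast<DISubprogram>(Scope))
    return SP;
  return cast<DILocalScope>(Scope)->getSubprogram();
}

static MapVector<DISubprogram *, SmallVector<Instruction *, 8>>
groupByInlinedAtChain(ArrayRef<Instruction *> MatrixInsts) {
  MapVector<DISubprogram *, SmallVector<Instruction *, 8>> Subprog2Insts;
  for (Instruction *I : MatrixInsts) {
    DILocation *Loc = I->getDebugLoc();
    if (!Loc) {
      // Simplification: instructions without debug info share one bucket.
      Subprog2Insts[nullptr].push_back(I);
      continue;
    }
    // Walk from the scope the instruction was written in out to the function
    // it was ultimately inlined into, recording it at every level.
    for (DILocation *Ctx = Loc; Ctx; Ctx = Ctx->getInlinedAt())
      Subprog2Insts[getSubprogramOf(Ctx->getScope())].push_back(I);
  }
  return Subprog2Insts;
}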
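
Once instructions are grouped, the remark for a leaf is emitted at the debug
location that lies in the subprogram currently being reported, which is how
the same store can be reported both at store.h:10 and at the call site in
toplevel.c. Below is a short sketch of that location selection, reusing
getSubprogramOf from the sketch above; locationInSubprogram is an illustrative
name, and in the patch this is an inline loop in RemarkGenerator::emitRemarks.

// Illustrative sketch (not from the patch): find the location on I's
// inlined-at chain that belongs to subprogram Target, falling back to the
// instruction's own location if the chain never reaches Target.
static DebugLoc locationInSubprogram(Instruction *I, DISubprogram *Target) {
  for (DILocation *Ctx = I->getDebugLoc(); Ctx; Ctx = Ctx->getInlinedAt())
    if (getSubprogramOf(Ctx->getScope()) == Target)
      return Ctx;
  return I->getDebugLoc();
}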