//
// TODO:
// * Implement multiply & add fusion
-// * Add remark, summarizing the available matrix optimization opportunities
-// (WIP).
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
cl::desc("Allow the use of FMAs if available and profitable. This may "
"result in different results, due to less rounding error."));
+/// Return \p Scope itself if it is a DISubprogram; otherwise cast \p Scope to
+/// DILocalScope and return the subprogram attached to that local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+ if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+ return Subprogram;
+ return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
namespace {
// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
}
}
- RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
+ RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
RemarkGen.emitRemarks();
for (Instruction *Inst : reverse(ToRemove))
/// part of.
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
+ /// Set of matrix expressions in the scope of a given DISubprogram.
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram;
+
/// Leaf node of the expression to linearize.
Value *Leaf;
ExprLinearizer(const DataLayout &DL,
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram, // Bound to a reference member, not copied.
Value *Leaf)
: Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
- Shared(Shared), Leaf(Leaf) {}
+ Shared(Shared), ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
void indent(unsigned N) {
LineLength += N;
return V;
}
- /// Returns true if \p V is a matrix value.
- bool isMatrix(Value *V) const {
- return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
- }
+ /// Returns true if \p V is a matrix value in ExprsInSubprogram.
+ bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
/// If \p V is a matrix value, print its shape as NumRows x NumColumns to
/// \p SS.
/// Generate remarks for matrix operations in a function. To generate remarks
/// for matrix expressions, the following approach is used:
- /// 1. Collect leafs of matrix expressions (done in
- /// RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
- /// instructions without other matrix users (like stores).
- ///
- /// 2. For each leaf, create a remark containing a linearizied version of the
- /// matrix expression.
- ///
- /// TODO:
- /// * Summarize number of vector instructions generated for each expression.
- /// * Propagate matrix remarks up the inlining chain.
+ /// 1. Use the inlined-at debug information to group matrix operations to the
+ /// DISubprograms they are contained in.
+ /// 2. Collect leaves of matrix expressions (done in
+ /// RemarkGenerator::getExpressionLeaves) for each subprogram - expression
+ /// mapping. Leaves are lowered matrix instructions without other matrix
+ /// users (like stores) in the current subprogram.
+ /// 3. For each leaf, create a remark containing a linearized version of the
+ /// matrix expression. The expression is linearized by a recursive
+ /// bottom-up traversal of the matrix operands, starting at a leaf. Note
+ /// that multiple leaves can share sub-expressions. Shared subexpressions
+ /// are explicitly marked as shared().
struct RemarkGenerator {
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
OptimizationRemarkEmitter &ORE;
+ Function &Func;
const DataLayout &DL;
RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
- OptimizationRemarkEmitter &ORE, const DataLayout &DL)
- : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
-
- /// Return all leafs of matrix expressions. Those are instructions in
- /// Inst2ColumnMatrix returing void. Currently that should only include
- /// stores.
- SmallVector<Value *, 4> getExpressionLeaves() {
+ OptimizationRemarkEmitter &ORE, Function &Func)
+ : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func),
+ DL(Func.getParent()->getDataLayout()) {}
+
+ /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
+ /// the values that return void or that have no user inside
+ /// \p ExprsInSubprogram, so leaves are not necessarily stores.
+ SmallVector<Value *, 4>
+ getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
SmallVector<Value *, 4> Leaves;
- for (auto &KV : Inst2ColumnMatrix)
- if (KV.first->getType()->isVoidTy())
- Leaves.push_back(KV.first);
-
+ for (auto *Expr : ExprsInSubprogram)
+ if (Expr->getType()->isVoidTy() ||
+ !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
+ return ExprsInSubprogram.count(U);
+ }))
+ Leaves.push_back(Expr);
return Leaves;
}
/// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
- /// to all visited expressions in \p Shared.
+ /// to all visited expressions in \p Shared. The traversal does not descend
+ /// into values that are not in \p ExprsInSubprogram.
void collectSharedInfo(Value *Leaf, Value *V,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
- if (Inst2ColumnMatrix.find(V) == Inst2ColumnMatrix.end())
+ if (!ExprsInSubprogram.count(V))
return;
auto I = Shared.insert({V, {}});
I.first->second.insert(Leaf); // Record that Leaf reaches V.
for (Value *Op : cast<Instruction>(V)->operand_values()) // Values that pass the check above are treated as Instructions.
- collectSharedInfo(Leaf, Op, Shared);
+ collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
return;
}
/// Calculate the number of exclusive and shared op counts for expression
/// starting at \p V. Expressions used multiple times are counted once.
+ /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
std::pair<OpInfoTy, OpInfoTy>
sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
- DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
- auto CM = Inst2ColumnMatrix.find(Root);
- if (CM == Inst2ColumnMatrix.end())
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
+ if (!ExprsInSubprogram.count(Root))
return {};
// Already counted this expression. Stop.
OpInfoTy Count;
auto I = Shared.find(Root);
+ auto CM = Inst2ColumnMatrix.find(Root);
if (I->second.size() == 1)
Count = CM->second.getOpInfo();
else
SharedCount = CM->second.getOpInfo();
for (Value *Op : cast<Instruction>(Root)->operand_values()) {
- auto C = sumOpInfos(Op, ReusedExprs, Shared);
+ auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
Count += C.first;
SharedCount += C.second;
}
if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
return;
- // Find leafs of matrix expressions.
- auto Leaves = getExpressionLeaves();
-
- DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
-
- for (Value *Leaf : Leaves)
- collectSharedInfo(Leaf, Leaf, Shared);
-
- // Generate remarks for each leaf.
- for (auto *L : Leaves) {
- SmallPtrSet<Value *, 8> ReusedExprs;
- OpInfoTy Counts, SharedCounts;
- std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared);
-
- OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
- cast<Instruction>(L)->getDebugLoc(),
- cast<Instruction>(L)->getParent());
-
- Rem << "Lowered with ";
- Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
- << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
- << ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops";
-
- if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
- SharedCounts.NumComputeOps > 0) {
- Rem << ",\nadditionally "
- << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
- << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
- << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
- << " compute ops"
- << " are shared with other expressions";
+ // Map matrix operations to their containing subprograms, by traversing
+ // the inlinedAt chain. If the function does not have a DISubprogram, we
+ // only map them to the containing function.
+ MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
+ for (auto &KV : Inst2ColumnMatrix) {
+ if (Func.getSubprogram()) {
+ auto *I = cast<Instruction>(KV.first);
+ DILocation *Context = I->getDebugLoc();
+ while (Context) {
+ auto I =
+ Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
+ I.first->second.push_back(KV.first);
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+ } else {
+ auto I = Subprog2Exprs.insert({nullptr, {}});
+ I.first->second.push_back(KV.first);
}
+ }
+ for (auto &KV : Subprog2Exprs) {
+ SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
+ KV.second.end());
+ auto Leaves = getExpressionLeaves(ExprsInSubprogram);
+
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+ for (Value *Leaf : Leaves)
+ collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
+
+ // Generate remarks for each leaf.
+ for (auto *L : Leaves) {
+
+ DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
+ DILocation *Context = cast<Instruction>(L)->getDebugLoc();
+ while (Context) {
+ if (getSubprogram(Context->getScope()) == KV.first) {
+ Loc = Context;
+ break;
+ }
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+
+ SmallPtrSet<Value *, 8> ReusedExprs;
+ OpInfoTy Counts, SharedCounts;
+ std::tie(Counts, SharedCounts) =
+ sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
+
+ OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
+ cast<Instruction>(L)->getParent());
+
+ Rem << "Lowered with ";
+ Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+ << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+ << ore::NV("NumComputeOps", Counts.NumComputeOps)
+ << " compute ops";
+
+ if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+ SharedCounts.NumComputeOps > 0) {
+ Rem << ",\nadditionally "
+ << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+ << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+ << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+ << " compute ops"
+ << " are shared with other expressions";
+ }
- Rem << ("\n" + linearize(L, Shared, DL));
- ORE.emit(Rem);
+ Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
+ ORE.emit(Rem);
+ }
}
}
std::string
linearize(Value *L,
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
const DataLayout &DL) {
- ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L);
+ ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, ExprsInSubprogram, L);
Lin.linearizeExpr(L, 0, false, false); // L is both the linearizer's leaf and the expression it starts from.
return Lin.getResult(); // Hand back whatever the linearizer accumulated.
}
--- /dev/null
+; REQUIRES: aarch64-registered-target
+
+; This test needs to be target specific due to the cost estimate in the output.
+
+; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s
+
+; Test the propagation of matrix expressions along the inlined-at chain. The IR
+; in the test roughly corresponds to the C++ code below, with the IR containing
+; references to a few more functions.
+
+; matrix.h
+; template <typename Ty, unsigned R, unsigned C>
+; struct Matrix {
+; using matrix_t = Ty __attribute__((matrix_type(R, C)));
+;
+; matrix_t value;
+; };
+;
+; ; add.h
+; template <typename Ty, unsigned R, unsigned C>
+; Matrix<Ty, R, C> add(Matrix<Ty, R, C> M1, Matrix<Ty, R, C> M2) {
+; Matrix<Ty, R, C> Result;
+; Result.value = __builtin_matrix_add(M1.value, M2.value);
+; return Result;
+; }
+;
+; load.h:
+; template <typename Ty, unsigned R, unsigned C>
+; Matrix<Ty, R, C> load(Ty *Ptr) {
+; Matrix<Ty, R, C> Result;
+; Result.value = *reinterpret_cast <typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
+; return Result;
+; }
+;
+; store.h:
+; template <typename Ty, unsigned R, unsigned C>
+; void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
+; *reinterpret_cast<typename decltype(M1)::matrix_t *>(Ptr) = M1.value;
+; }
+;
+; toplevel.cpp
+; void test(double *A, double *B, double *C) {
+; store(add(load<double, 3, 5>(A), load<double, 3, 5>(B)), C);
+; }
+;
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "aarch64-apple-ios"
+
+; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: load(addr %A)
+
+; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
+
+; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops
+; CHECK-NEXT: load(addr %D)
+
+; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: load(addr %A)
+
+; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
+
+; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: fadd(
+; CHECK-NEXT: load(addr %A),
+; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)),
+; CHECK-NEXT: addr %C)
+
+; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))),
+; CHECK-NEXT: addr %D)
+
+; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops
+; CHECK-NEXT: fadd(
+; CHECK-NEXT: addr %A,
+; CHECK-NEXT: scalar)
+
+; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: scalar,
+; CHECK-NEXT: addr %C)
+
+; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops
+; CHECK-NEXT: store(
+; CHECK-NEXT: scalar,
+; CHECK-NEXT: addr %D)
+
+; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops
+; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D))
+
+define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 {
+entry: ; Trailing comments spell out each !dbg location followed by its inlinedAt chain.
+ %a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791 ; load.h:41:43 <- assign.h:32:43 <- toplevel.c:405:3
+ %b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793 ; load.h:41:43 <- assign.h:32:43 <- toplevel.c:406:11
+ %c = fadd <15 x double> %a, %b, !dbg !100 ; add.h:66:11 <- toplevel.c:410:11
+ store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102 ; store.h:10:11 <- toplevel.c:410:0
+
+ %load = load <2 x float>, <2 x float>* %D, !dbg !104 ; load.h:41:11 <- toplevel.c:410:11
+ %t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106 ; transpose.h:13:11 <- toplevel.c:410:11
+ %t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106 ; transpose.h:13:11 <- toplevel.c:410:11
+ store <2 x float> %t2, <2 x float>* %D, !dbg !108 ; store.h:66:11 <- toplevel.c:510:0
+ ret void
+}
+
+declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32)
+declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "load.h", directory: "/test")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!17 = !DIFile(filename: "toplevel.c", directory: "/test")
+!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!18 = !DIFile(filename: "assign.h", directory: "/test")
+!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!20 = !DIFile(filename: "add.h", directory: "/test")
+!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!22 = !DIFile(filename: "store.h", directory: "/test")
+!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+!24 = !DIFile(filename: "transpose.h", directory: "/test")
+!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+
+
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8, !8, !11}
+!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
+!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+!14 = !DILocation(line: 1, column: 27, scope: !5)
+
+!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795)
+!3792 = !DILocation(line: 405, column: 3, scope: !16)
+!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796)
+!3794 = !DILocation(line: 406, column: 11, scope: !16)
+!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792)
+!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794)
+
+!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101)
+!101 = !DILocation(line: 410, column: 11, scope: !16)
+
+!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103)
+!103 = !DILocation(line: 410, column: 0, scope: !16)
+
+!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101)
+!105 = !DILocation(line: 500, column: 11, scope: !16)
+
+!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101)
+!107 = !DILocation(line: 510, column: 11, scope: !16)
+
+!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109)
+!109 = !DILocation(line: 510, column: 0, scope: !16)
define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
- %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
- %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+ %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32
+ %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
ret void
}
define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
- %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
- %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+ %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34
+ %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
%C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
define void @stackaddresses(<9 x double>* %A) !dbg !35 {
%B = alloca <9 x double>
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
- %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
- %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
+ %R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36
+ %R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
ret void
}
%s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
%s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
- %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
+ %t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3), !dbg !22
store <15 x double> %t, <15 x double>* %s3, !dbg !22
ret void