From ea21d688dc0a420b9fc385562a46017fb39b13e5 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 22 Sep 2021 10:28:36 +0100
Subject: [PATCH] [Matrix] Emit assumption that matrix indices are valid.

The matrix extension requires the indices for matrix subscript
expression to be valid and it is UB otherwise.

extract/insertelement produce poison if the index is invalid, which
limits the optimizer to not be bale to scalarize load/extract pairs for
example, which causes very suboptimal code to be generated when using
matrix subscript expressions with variable indices for large matrixes.

This patch updates IRGen to emit assumes to for index expression to
convey the information that the index must be valid.

This also adjusts the order in which operations are emitted slightly, so
indices & assumes are added before the load of the matrix value.

Reviewed By: erichkeane

Differential Revision: https://reviews.llvm.org/D102478
---
 clang/lib/CodeGen/CGExpr.cpp                  | 22 +++++++---
 clang/lib/CodeGen/CGExprScalar.cpp            | 13 ++++--
 clang/test/CodeGen/matrix-type-operators.c    | 43 +++++++++++++++----
 .../test/CodeGenCXX/matrix-type-operators.cpp | 25 ++++++++---
 .../test/CodeGenObjC/matrix-type-operators.m  |  6 +--
 llvm/include/llvm/IR/MatrixBuilder.h          | 24 ++++++++---
 6 files changed, 99 insertions(+), 34 deletions(-)
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index e07f95c60c5f..fafaaf447234 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -35,6 +35,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
@@ -1939,10 +1940,15 @@ RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) {
     return EmitLoadOfGlobalRegLValue(LV);
 
   if (LV.isMatrixElt()) {
+    llvm::Value *Idx = LV.getMatrixIdx();
+    if (CGM.getCodeGenOpts().OptimizationLevel > 0) {
+      const auto *const MatTy = LV.getType()->getAs<ConstantMatrixType>();
+      llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+      MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
+    }
     llvm::LoadInst *Load =
         Builder.CreateLoad(LV.getMatrixAddress(), LV.isVolatileQualified());
-    return RValue::get(
-        Builder.CreateExtractElement(Load, LV.getMatrixIdx(), "matrixext"));
+    return RValue::get(Builder.CreateExtractElement(Load, Idx, "matrixext"));
   }
 
   assert(LV.isBitField() && "Unknown LValue type!");
@@ -2080,9 +2086,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst,
       return EmitStoreThroughGlobalRegLValue(Src, Dst);
 
     if (Dst.isMatrixElt()) {
-      llvm::Value *Vec = Builder.CreateLoad(Dst.getMatrixAddress());
-      Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
-                                        Dst.getMatrixIdx(), "matins");
+      llvm::Value *Idx = Dst.getMatrixIdx();
+      if (CGM.getCodeGenOpts().OptimizationLevel > 0) {
+        const auto *const MatTy = Dst.getType()->getAs<ConstantMatrixType>();
+        llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+        MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened());
+      }
+      llvm::Instruction *Load = Builder.CreateLoad(Dst.getMatrixAddress());
+      llvm::Value *Vec =
+          Builder.CreateInsertElement(Load, Src.getScalarVal(), Idx, "matins");
       Builder.CreateStore(Vec, Dst.getMatrixAddress(),
                           Dst.isVolatileQualified());
       return;
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
index fe62e63521a1..67c581b46eae 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -1775,13 +1775,18 @@ Value *ScalarExprEmitter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) {
   // integer value.
   Value *RowIdx = Visit(E->getRowIdx());
   Value *ColumnIdx = Visit(E->getColumnIdx());
+
+  const auto *MatrixTy = E->getBase()->getType()->castAs<ConstantMatrixType>();
+  unsigned NumRows = MatrixTy->getNumRows();
+  llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+  Value *Idx = MB.CreateIndex(RowIdx, ColumnIdx, NumRows);
+  if (CGF.CGM.getCodeGenOpts().OptimizationLevel > 0)
+    MB.CreateIndexAssumption(Idx, MatrixTy->getNumElementsFlattened());
+
   Value *Matrix = Visit(E->getBase());
 
   // TODO: Should we emit bounds checks with SanitizerKind::ArrayBounds?
-  llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
-  return MB.CreateExtractElement(
-      Matrix, RowIdx, ColumnIdx,
-      E->getBase()->getType()->castAs<ConstantMatrixType>()->getNumRows());
+  return Builder.CreateExtractElement(Matrix, Idx, "matrixext");
 }
 
 static int getMaskElt(llvm::ShuffleVectorInst *SVI, unsigned Idx,
diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c
index 7d43276ca72a..34ce57c0702a 100644
--- a/clang/test/CodeGen/matrix-type-operators.c
+++ b/clang/test/CodeGen/matrix-type-operators.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK %s
+// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK,OPT %s
 
 typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
 typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
@@ -506,7 +507,7 @@ void multiply_matrix_matrix_double(dx5x5_t b, dx5x5_t c) {
   // CHECK-NEXT:    [[RES:%.*]] = call <25 x double> @llvm.matrix.multiply.v25f64.v25f64.v25f64(<25 x double> [[B]], <25 x double> [[C]], i32 5, i32 5, i32 5)
   // CHECK-NEXT:    [[A_ADDR:%.*]] = bitcast [25 x double]* %a to <25 x double>*
   // CHECK-NEXT:    store <25 x double> [[RES]], <25 x double>* [[A_ADDR]], align 8
-  // CHECK-NEXT:    ret void
+  // CHECK:         ret void
   //
 
   dx5x5_t a;
@@ -531,7 +532,7 @@ typedef int ix9x9_t __attribute__((matrix_type(9, 9)));
 // CHECK-NEXT:    [[RES:%.*]] = call <81 x i32> @llvm.matrix.multiply.v81i32.v27i32.v27i32(<27 x i32> [[B]], <27 x i32> [[C]], i32 9, i32 3, i32 9)
 // CHECK-NEXT:    [[A_ADDR:%.*]] = bitcast [81 x i32]* %a to <81 x i32>*
 // CHECK-NEXT:    store <81 x i32> [[RES]], <81 x i32>* [[A_ADDR]], align 4
-// CHECK-NEXT:    ret void
+// CHECK:         ret void
 //
 void multiply_matrix_matrix_int(ix9x3_t b, ix3x9_t c) {
   ix9x9_t a;
@@ -874,6 +875,8 @@ void insert_float_matrix_idx_i_u_float(fx2x3_t b, float e, int j, unsigned k) {
   // CHECK-NEXT:    [[K_EXT:%.*]] = zext i32 [[K]] to i64
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K_EXT]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -890,6 +893,8 @@ void insert_float_matrix_idx_s_ull_float(fx2x3_t b, float e, short j, unsigned l
   // CHECK-NEXT:    [[K:%.*]] = load i64, i64* %k.addr, align 8
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4
@@ -907,6 +912,8 @@ void insert_int_idx_expr(ix9x3_t a, int i) {
   // CHECK-NEXT:    [[I2_ADD:%.*]] = add nsw i32 4, [[I2]]
   // CHECK-NEXT:    [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 18, [[ADD_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <27 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]]
   // CHECK-NEXT:    store <27 x i32> [[MATINS]], <27 x i32>* [[MAT_ADDR]], align 4
@@ -980,9 +987,11 @@ int extract_int(ix9x3_t c, unsigned long j) {
   // CHECK-LABEL: @extract_int(
   // CHECK:         [[J1:%.*]] = load i64, i64* %j.addr, align 8
   // CHECK-NEXT:    [[J2:%.*]] = load i64, i64* %j.addr, align 8
-  // CHECK-NEXT:    [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J2]], 9
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <27 x i32> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    ret i32 [[MATEXT]]
 
@@ -995,13 +1004,15 @@ double test_extract_matrix_pointer1(dx3x2_t **ptr, unsigned j) {
   // CHECK-LABEL: @test_extract_matrix_pointer1(
   // CHECK:         [[J:%.*]] = load i32, i32* %j.addr, align 4
   // CHECK-NEXT:    [[J_EXT:%.*]] = zext i32 [[J]] to i64
+  // CHECK-NEXT:    [[IDX:%.*]] = add i64 3, [[J_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[PTR:%.*]] = load [6 x double]**, [6 x double]*** %ptr.addr, align 8
   // CHECK-NEXT:    [[PTR_IDX:%.*]] = getelementptr inbounds [6 x double]*, [6 x double]** [[PTR]], i64 1
   // CHECK-NEXT:    [[PTR2:%.*]] = load [6 x double]*, [6 x double]** [[PTR_IDX]], align 8
   // CHECK-NEXT:    [[PTR2_IDX:%.*]] = getelementptr inbounds [6 x double], [6 x double]* [[PTR2]], i64 2
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = bitcast [6 x double]* [[PTR2_IDX]] to <6 x double>*
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x double>, <6 x double>* [[MAT_ADDR]], align 8
-  // CHECK-NEXT:    [[IDX:%.*]] = add i64 3, [[J_EXT]]
   // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]]
   // CHECK-NEXT:    ret double [[MATEXT]]
 
@@ -1027,13 +1038,17 @@ void insert_extract(dx5x5_t a, fx3x3_t b, unsigned long j, short k) {
   // CHECK-LABEL: @insert_extract(
   // CHECK:         [[K:%.*]] = load i16, i16* %k.addr, align 2
   // CHECK-NEXT:    [[K_EXT:%.*]] = sext i16 [[K]] to i64
-  // CHECK-NEXT:    [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[K_EXT]], 3
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], 0
-  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:    [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4
+  // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    [[J:%.*]] = load i64, i64* %j.addr, align 8
   // CHECK-NEXT:    [[IDX3:%.*]] = mul i64 [[J]], 3
   // CHECK-NEXT:    [[IDX4:%.*]] = add i64 [[IDX3]], 2
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT2:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]]
   // CHECK-NEXT:    store <9 x float> [[MATINS]], <9 x float>* [[MAT_ADDR]], align 4
@@ -1068,9 +1083,13 @@ void insert_compound_stmt_field(struct Foo *a, float f, unsigned i, unsigned j)
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:    [[MAT_PTR:%.*]] = bitcast [6 x float]* %mat to <6 x float>*
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
   // CHECK-NEXT:    [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    [[SUM:%.*]] = fadd float [[EXT]], {{.*}}
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT2:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4
   // CHECK-NEXT:    [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]]
   // CHECK-NEXT:    store <6 x float> [[INS]], <6 x float>* [[MAT_PTR]], align 4
@@ -1085,23 +1104,29 @@ void matrix_as_idx(ix9x3_t a, int i, int j, dx5x5_t b) {
   // CHECK-NEXT:  [[I1_EXT:%.*]] = sext i32 [[I1]] to i64
   // CHECK-NEXT:  [[J1:%.*]] = load i32, i32* %j.addr, align 4
   // CHECK-NEXT:  [[J1_EXT:%.*]] = sext i32 [[J1]] to i64
-  // CHECK-NEXT:  [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
   // CHECK-NEXT:  [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 9
   // CHECK-NEXT:  [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 27
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:  [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4
   // CHECK-NEXT:  [[MI1:%.*]] = extractelement <27 x i32> [[A]], i64 [[IDX1_2]]
   // CHECK-NEXT:  [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64
   // CHECK-NEXT:  [[J2:%.*]] = load i32, i32* %j.addr, align 4
   // CHECK-NEXT:  [[J2_EXT:%.*]] = sext i32 [[J2]] to i64
   // CHECK-NEXT:  [[I2:%.*]] = load i32, i32* %i.addr, align 4
   // CHECK-NEXT:  [[I2_EXT:%.*]] = sext i32 [[I2]] to i64
-  // CHECK-NEXT:  [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:  [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 9
   // CHECK-NEXT:  [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 27
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
+  // CHECK-NEXT:  [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4
   // CHECK-NEXT:  [[MI2:%.*]] = extractelement <27 x i32> [[A2]], i64 [[IDX2_2]]
   // CHECK-NEXT:  [[MI3:%.*]] = add nsw i32 [[MI2]], 2
   // CHECK-NEXT:  [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64
   // CHECK-NEXT:  [[IDX3_1:%.*]] = mul i64 [[MI3_EXT]], 5
   // CHECK-NEXT:  [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]]
+  // OPT-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 25
+  // OPT-NEXT:    call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:  [[B:%.*]] = load <25 x double>, <25 x double>* [[B_PTR:%.*]], align 8
   // CHECK-NEXT:  [[INS:%.*]] = insertelement <25 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]]
   // CHECK-NEXT:  store <25 x double> [[INS]], <25 x double>* [[B_PTR]], align 8
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
index bb1cb7474fca..a082b53f99c5 100644
--- a/clang/test/CodeGenCXX/matrix-type-operators.cpp
+++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,OPT %s
 
 typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
 using fx2x3_t = float __attribute__((matrix_type(2, 3)));
@@ -94,7 +95,7 @@ struct DoubleWrapper2 {
 
 void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
   // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add1R8MyMatrixIdLj10ELj9EE(
-  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.+}}, align 8
   // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2)
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0
   // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
@@ -109,7 +110,7 @@ void test_DoubleWrapper2_Add1(MyMatrix<double, 10, 9> &m) {
 void test_DoubleWrapper2_Add2(MyMatrix<double, 10, 9> &m) {
   // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add2R8MyMatrixIdLj10ELj9EE(
   // CHECK:       [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2)
-  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8
+  // CHECK:       [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8
   // CHECK-NEXT:  [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0
   // CHECK-NEXT:  [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer
   // CHECK-NEXT:  [[RES:%.*]] = fadd <90 x double> [[SCALAR_EMBED1]], [[MATRIX]]
@@ -219,6 +220,8 @@ void test_insert_template1(MyMatrix<unsigned, 2, 2> &Mat, unsigned e, unsigned i
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 2
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = bitcast [4 x i32]* {{.*}} to <4 x i32>*
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <4 x i32>, <4 x i32>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <4 x i32> [[MATINS]], <4 x i32>* [[MAT_ADDR]], align 4
@@ -243,6 +246,8 @@ void test_insert_template2(MyMatrix<float, 3, 8> &Mat, float e) {
   // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_EXT]], 3
   // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = bitcast [24 x float]* {{.*}} to <24 x float>*
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 24
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT:%.*]] = load <24 x float>, <24 x float>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:    [[MATINS:%.*]] = insertelement <24 x float> [[MAT]], float [[E]], i64 [[IDX2]]
   // CHECK-NEXT:    store <24 x float> [[MATINS]], <24 x float>* [[MAT_ADDR]], align 4
@@ -292,10 +297,10 @@ const double &test_matrix_subscript_reference(const double4x4 m) {
   // CHECK-NEXT:    [[REF_TMP:%.*]] = alloca double, align 8
   // CHECK-NEXT:    [[NAMELESS0:%.*]] = bitcast [16 x double]* [[M_ADDR]] to <16 x double>*
   // CHECK-NEXT:    store <16 x double> [[M:%.*]], <16 x double>* [[NAMELESS0]], align 8
-  // CHECK-NEXT:    [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8
+  // CHECK:         [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8
   // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], i64 4
   // CHECK-NEXT:    store double [[MATEXT]], double* [[REF_TMP]], align 8
-  // CHECK-NEXT:    ret double* [[REF_TMP]]
+  // CHECK:         ret double* [[REF_TMP]]
 
   return m[0][1];
 }
@@ -315,11 +320,13 @@ double extract_IntWrapper_idx(double4x4 &m, IntWrapper i, UnsignedWrapper j) {
   // CHECK-NEXT:    [[J:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} %j)
   // CHECK-NEXT:    [[J_SUB:%.*]] = sub i32 [[J]], 1
   // CHECK-NEXT:    [[J_SUB_EXT:%.*]] = zext i32 [[J_SUB]] to i64
+  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
+  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
+  // OPT-NEXT:      [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:      call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:    [[MAT_ADDR:%.*]] = load [16 x double]*, [16 x double]** %m.addr, align 8
   // CHECK-NEXT:    [[MAT_ADDR2:%.*]] = bitcast [16 x double]* [[MAT_ADDR]] to <16 x double>*
   // CHECK-NEXT:    [[MAT:%.*]] = load <16 x double>, <16 x double>* [[MAT_ADDR2]], align 8
-  // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4
-  // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]]
   // CHECK-NEXT:    [[MATEXT:%.*]]  = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
   // CHECK-NEXT:    ret double [[MATEXT]]
   return m[i + 1][j - 1];
@@ -358,6 +365,8 @@ void test_constexpr1(matrix_type<float, 4, 4> &m) {
   // CHECK-NEXT:   [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4
   // CHECK-NEXT:   [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:   [[MAT_ADDR:%.*]] = bitcast [16 x float]* %result to <16 x float>*
+  // OPT-NEXT:     [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16
+  // OPT-NEXT:     call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:   [[MAT:%.*]] = load <16 x float>, <16 x float>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:   [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]]
   // CHECK-NEXT:   store <16 x float> [[MATINS]], <16 x float>* [[MAT_ADDR]], align 4
@@ -386,6 +395,8 @@ void test_constexpr2(matrix_type<int, 5, 5> &m) {
   // CHECK-NEXT:   [[IDX1:%.*]] = mul i64 [[I2_EXT]], 5
   // CHECK-NEXT:   [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]]
   // CHECK-NEXT:   [[MAT_ADDR:%.*]] = bitcast [25 x i32]* %result to <25 x i32>*
+  // OPT-NEXT:     [[CMP:%.*]] = icmp ult i64 [[IDX2]], 25
+  // OPT-NEXT:     call void @llvm.assume(i1 [[CMP]])
   // CHECK-NEXT:   [[MAT:%.*]] = load <25 x i32>, <25 x i32>* [[MAT_ADDR]], align 4
   // CHECK-NEXT:   [[MATINS:%.*]] = insertelement <25 x i32> [[MAT]], i32 1, i64 [[IDX2]]
   // CHECK-NEXT:   store <25 x i32> [[MATINS]], <25 x i32>* [[MAT_ADDR]], align 4
diff --git a/clang/test/CodeGenObjC/matrix-type-operators.m b/clang/test/CodeGenObjC/matrix-type-operators.m
index 56ddee8b512a..8e3986e21e50 100644
--- a/clang/test/CodeGenObjC/matrix-type-operators.m
+++ b/clang/test/CodeGenObjC/matrix-type-operators.m
@@ -22,9 +22,9 @@ typedef double double4x4 __attribute__((matrix_type(4, 4)));
 // CHECK-NEXT:    [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8*
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]])
 // CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[CALL1]] to i64
-// CHECK-NEXT:    [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8
 // CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[CONV2]], 4
 // CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]]
+// CHECK-NEXT:    [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8
 // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
 // CHECK-NEXT:    ret double [[MATEXT]]
 //
@@ -49,12 +49,12 @@ __attribute__((objc_root_class))
 // CHECK-NEXT:    [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8*
 // CHECK-NEXT:    [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]])
 // CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[CALL1]] to i64
+// CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[CONV2]], 4
+// CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]]
 // CHECK-NEXT:    [[M:%.*]] = load %1*, %1** %m.addr, align 8
 // CHECK-NEXT:    [[SEL3:%.*]] = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7
 // CHECK-NEXT:    [[M_PTR:%.*]] = bitcast %1* [[M]] to i8*
 // CHECK-NEXT:    [[MAT:%.*]] = call <16 x double> bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to <16 x double> (i8*, i8*)*)(i8* [[M_PTR]], i8* [[SEL3]])
-// CHECK-NEXT:    [[IDX1:%.*]] = mul i64 [[CONV2]], 4
-// CHECK-NEXT:    [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]]
 // CHECK-NEXT:    [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]]
 // CHECK-NEXT:    ret double [[MATEXT]]
 //
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
index d5fc2d1fce71..6cc5797269e2 100644
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -231,9 +231,23 @@ public:
                : (IsUnsigned ? B.CreateUDiv(LHS, RHS) : B.CreateSDiv(LHS, RHS));
   }
 
-  /// Extracts the element at (\p RowIdx, \p ColumnIdx) from \p Matrix.
-  Value *CreateExtractElement(Value *Matrix, Value *RowIdx, Value *ColumnIdx,
-                              unsigned NumRows, Twine const &Name = "") {
+  /// Create an assumption that \p Idx is less than \p NumElements.
+  void CreateIndexAssumption(Value *Idx, unsigned NumElements,
+                             Twine const &Name = "") {
+
+    Value *NumElts =
+        B.getIntN(Idx->getType()->getScalarSizeInBits(), NumElements);
+    auto *Cmp = B.CreateICmpULT(Idx, NumElts);
+    if (auto *ConstCond = dyn_cast<ConstantInt>(Cmp))
+      assert(ConstCond->isOne() && "Index must be valid!");
+    else
+      B.CreateAssumption(Cmp);
+  }
+
+  /// Compute the index to access the element at (\p RowIdx, \p ColumnIdx) from
+  /// a matrix with \p NumRows embedded in a vector.
+  Value *CreateIndex(Value *RowIdx, Value *ColumnIdx, unsigned NumRows,
+                     Twine const &Name = "") {
 
     unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(),
                                  ColumnIdx->getType()->getScalarSizeInBits());
@@ -241,9 +255,7 @@ public:
     RowIdx = B.CreateZExt(RowIdx, IntTy);
     ColumnIdx = B.CreateZExt(ColumnIdx, IntTy);
     Value *NumRowsV = B.getIntN(MaxWidth, NumRows);
-    return B.CreateExtractElement(
-        Matrix, B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx),
-        "matext");
+    return B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx);
   }
 };
 
-- 
2.34.1