From ea21d688dc0a420b9fc385562a46017fb39b13e5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 22 Sep 2021 10:28:36 +0100 Subject: [PATCH] [Matrix] Emit assumption that matrix indices are valid. The matrix extension requires the indices for matrix subscript expressions to be valid and it is UB otherwise. extract/insertelement produce poison if the index is invalid, which prevents the optimizer from scalarizing load/extract pairs, for example, and causes very suboptimal code to be generated when using matrix subscript expressions with variable indices for large matrices. This patch updates IRGen to emit assumes for index expressions to convey the information that the index must be valid. This also adjusts the order in which operations are emitted slightly, so indices & assumes are added before the load of the matrix value. Reviewed By: erichkeane Differential Revision: https://reviews.llvm.org/D102478 --- clang/lib/CodeGen/CGExpr.cpp | 22 ++++++++++--- clang/lib/CodeGen/CGExprScalar.cpp | 13 +++++--- clang/test/CodeGen/matrix-type-operators.c | 43 +++++++++++++++++++------ clang/test/CodeGenCXX/matrix-type-operators.cpp | 25 ++++++++++---- clang/test/CodeGenObjC/matrix-type-operators.m | 6 ++-- llvm/include/llvm/IR/MatrixBuilder.h | 24 ++++++++++---- 6 files changed, 99 insertions(+), 34 deletions(-) diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index e07f95c..fafaaf4 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/MatrixBuilder.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" @@ -1939,10 +1940,15 @@ RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) { return EmitLoadOfGlobalRegLValue(LV); if (LV.isMatrixElt()) { + llvm::Value *Idx = LV.getMatrixIdx(); + if 
(CGM.getCodeGenOpts().OptimizationLevel > 0) { + const auto *const MatTy = LV.getType()->getAs<ConstantMatrixType>(); + llvm::MatrixBuilder MB(Builder); + MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened()); + } llvm::LoadInst *Load = Builder.CreateLoad(LV.getMatrixAddress(), LV.isVolatileQualified()); - return RValue::get( - Builder.CreateExtractElement(Load, LV.getMatrixIdx(), "matrixext")); + return RValue::get(Builder.CreateExtractElement(Load, Idx, "matrixext")); } assert(LV.isBitField() && "Unknown LValue type!"); @@ -2080,9 +2086,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst, return EmitStoreThroughGlobalRegLValue(Src, Dst); if (Dst.isMatrixElt()) { - llvm::Value *Vec = Builder.CreateLoad(Dst.getMatrixAddress()); - Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(), - Dst.getMatrixIdx(), "matins"); + llvm::Value *Idx = Dst.getMatrixIdx(); + if (CGM.getCodeGenOpts().OptimizationLevel > 0) { + const auto *const MatTy = Dst.getType()->getAs<ConstantMatrixType>(); + llvm::MatrixBuilder MB(Builder); + MB.CreateIndexAssumption(Idx, MatTy->getNumElementsFlattened()); + } + llvm::Instruction *Load = Builder.CreateLoad(Dst.getMatrixAddress()); + llvm::Value *Vec = + Builder.CreateInsertElement(Load, Src.getScalarVal(), Idx, "matins"); Builder.CreateStore(Vec, Dst.getMatrixAddress(), Dst.isVolatileQualified()); return; diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index fe62e63..67c581b 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1775,13 +1775,18 @@ Value *ScalarExprEmitter::VisitMatrixSubscriptExpr(MatrixSubscriptExpr *E) { // integer value. 
Value *RowIdx = Visit(E->getRowIdx()); Value *ColumnIdx = Visit(E->getColumnIdx()); + + const auto *MatrixTy = E->getBase()->getType()->castAs<ConstantMatrixType>(); + unsigned NumRows = MatrixTy->getNumRows(); + llvm::MatrixBuilder MB(Builder); + Value *Idx = MB.CreateIndex(RowIdx, ColumnIdx, NumRows); + if (CGF.CGM.getCodeGenOpts().OptimizationLevel > 0) + MB.CreateIndexAssumption(Idx, MatrixTy->getNumElementsFlattened()); + Value *Matrix = Visit(E->getBase()); // TODO: Should we emit bounds checks with SanitizerKind::ArrayBounds? - llvm::MatrixBuilder MB(Builder); - return MB.CreateExtractElement( - Matrix, RowIdx, ColumnIdx, - E->getBase()->getType()->castAs<ConstantMatrixType>()->getNumRows()); + return Builder.CreateExtractElement(Matrix, Idx, "matrixext"); } static int getMaskElt(llvm::ShuffleVectorInst *SVI, unsigned Idx, diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c index 7d43276..34ce57c 100644 --- a/clang/test/CodeGen/matrix-type-operators.c +++ b/clang/test/CodeGen/matrix-type-operators.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK %s +// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=CHECK,OPT %s typedef double dx5x5_t __attribute__((matrix_type(5, 5))); typedef float fx2x3_t __attribute__((matrix_type(2, 3))); @@ -506,7 +507,7 @@ void multiply_matrix_matrix_double(dx5x5_t b, dx5x5_t c) { // CHECK-NEXT: [[RES:%.*]] = call <25 x double> @llvm.matrix.multiply.v25f64.v25f64.v25f64(<25 x double> [[B]], <25 x double> [[C]], i32 5, i32 5, i32 5) // CHECK-NEXT: [[A_ADDR:%.*]] = bitcast [25 x double]* %a to <25 x double>* // CHECK-NEXT: store <25 x double> [[RES]], <25 x double>* [[A_ADDR]], align 8 - // CHECK-NEXT: ret void + // 
CHECK: ret void // dx5x5_t a; @@ -531,7 +532,7 @@ typedef int ix9x9_t __attribute__((matrix_type(9, 9))); // CHECK-NEXT: [[RES:%.*]] = call <81 x i32> @llvm.matrix.multiply.v81i32.v27i32.v27i32(<27 x i32> [[B]], <27 x i32> [[C]], i32 9, i32 3, i32 9) // CHECK-NEXT: [[A_ADDR:%.*]] = bitcast [81 x i32]* %a to <81 x i32>* // CHECK-NEXT: store <81 x i32> [[RES]], <81 x i32>* [[A_ADDR]], align 4 -// CHECK-NEXT: ret void +// CHECK: ret void // void multiply_matrix_matrix_int(ix9x3_t b, ix3x9_t c) { ix9x9_t a; @@ -874,6 +875,8 @@ void insert_float_matrix_idx_i_u_float(fx2x3_t b, float e, int j, unsigned k) { // CHECK-NEXT: [[K_EXT:%.*]] = zext i32 [[K]] to i64 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]] // CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4 @@ -890,6 +893,8 @@ void insert_float_matrix_idx_s_ull_float(fx2x3_t b, float e, short j, unsigned l // CHECK-NEXT: [[K:%.*]] = load i64, i64* %k.addr, align 8 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <6 x float> [[MAT]], float [[E]], i64 [[IDX2]] // CHECK-NEXT: store <6 x float> [[MATINS]], <6 x float>* [[MAT_ADDR]], align 4 @@ -907,6 +912,8 @@ void insert_int_idx_expr(ix9x3_t a, int i) { // CHECK-NEXT: [[I2_ADD:%.*]] = add nsw i32 4, [[I2]] // CHECK-NEXT: [[ADD_EXT:%.*]] = sext i32 [[I2_ADD]] to i64 // CHECK-NEXT: [[IDX2:%.*]] = add i64 18, [[ADD_EXT]] + // 
OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <27 x i32> [[MAT]], i32 [[I1]], i64 [[IDX2]] // CHECK-NEXT: store <27 x i32> [[MATINS]], <27 x i32>* [[MAT_ADDR]], align 4 @@ -980,9 +987,11 @@ int extract_int(ix9x3_t c, unsigned long j) { // CHECK-LABEL: @extract_int( // CHECK: [[J1:%.*]] = load i64, i64* %j.addr, align 8 // CHECK-NEXT: [[J2:%.*]] = load i64, i64* %j.addr, align 8 - // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J2]], 9 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[J1]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <27 x i32> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret i32 [[MATEXT]] @@ -995,13 +1004,15 @@ double test_extract_matrix_pointer1(dx3x2_t **ptr, unsigned j) { // CHECK-LABEL: @test_extract_matrix_pointer1( // CHECK: [[J:%.*]] = load i32, i32* %j.addr, align 4 // CHECK-NEXT: [[J_EXT:%.*]] = zext i32 [[J]] to i64 + // CHECK-NEXT: [[IDX:%.*]] = add i64 3, [[J_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[PTR:%.*]] = load [6 x double]**, [6 x double]*** %ptr.addr, align 8 // CHECK-NEXT: [[PTR_IDX:%.*]] = getelementptr inbounds [6 x double]*, [6 x double]** [[PTR]], i64 1 // CHECK-NEXT: [[PTR2:%.*]] = load [6 x double]*, [6 x double]** [[PTR_IDX]], align 8 // CHECK-NEXT: [[PTR2_IDX:%.*]] = getelementptr inbounds [6 x double], [6 x double]* [[PTR2]], i64 2 // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [6 x double]* [[PTR2_IDX]] to <6 x double>* // CHECK-NEXT: [[MAT:%.*]] = load <6 x double>, <6 x double>* [[MAT_ADDR]], align 8 - // CHECK-NEXT: 
[[IDX:%.*]] = add i64 3, [[J_EXT]] // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <6 x double> [[MAT]], i64 [[IDX]] // CHECK-NEXT: ret double [[MATEXT]] @@ -1027,13 +1038,17 @@ void insert_extract(dx5x5_t a, fx3x3_t b, unsigned long j, short k) { // CHECK-LABEL: @insert_extract( // CHECK: [[K:%.*]] = load i16, i16* %k.addr, align 2 // CHECK-NEXT: [[K_EXT:%.*]] = sext i16 [[K]] to i64 - // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[K_EXT]], 3 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], 0 - // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 9 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[MAT:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR:%.*]], align 4 + // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <9 x float> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: [[J:%.*]] = load i64, i64* %j.addr, align 8 // CHECK-NEXT: [[IDX3:%.*]] = mul i64 [[J]], 3 // CHECK-NEXT: [[IDX4:%.*]] = add i64 [[IDX3]], 2 + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX4]], 9 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT2:%.*]] = load <9 x float>, <9 x float>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <9 x float> [[MAT2]], float [[MATEXT]], i64 [[IDX4]] // CHECK-NEXT: store <9 x float> [[MATINS]], <9 x float>* [[MAT_ADDR]], align 4 @@ -1068,9 +1083,13 @@ void insert_compound_stmt_field(struct Foo *a, float f, unsigned i, unsigned j) // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_PTR:%.*]] = bitcast [6 x float]* %mat to <6 x float>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4 // CHECK-NEXT: [[EXT:%.*]] = extractelement <6 x float> [[MAT]], i64 
[[IDX2]] // CHECK-NEXT: [[SUM:%.*]] = fadd float [[EXT]], {{.*}} + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 6 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT2:%.*]] = load <6 x float>, <6 x float>* [[MAT_PTR]], align 4 // CHECK-NEXT: [[INS:%.*]] = insertelement <6 x float> [[MAT2]], float [[SUM]], i64 [[IDX2]] // CHECK-NEXT: store <6 x float> [[INS]], <6 x float>* [[MAT_PTR]], align 4 @@ -1085,23 +1104,29 @@ void matrix_as_idx(ix9x3_t a, int i, int j, dx5x5_t b) { // CHECK-NEXT: [[I1_EXT:%.*]] = sext i32 [[I1]] to i64 // CHECK-NEXT: [[J1:%.*]] = load i32, i32* %j.addr, align 4 // CHECK-NEXT: [[J1_EXT:%.*]] = sext i32 [[J1]] to i64 - // CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4 // CHECK-NEXT: [[IDX1_1:%.*]] = mul i64 [[J1_EXT]], 9 // CHECK-NEXT: [[IDX1_2:%.*]] = add i64 [[IDX1_1]], [[I1_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX1_2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[A:%.*]] = load <27 x i32>, <27 x i32>* %0, align 4 // CHECK-NEXT: [[MI1:%.*]] = extractelement <27 x i32> [[A]], i64 [[IDX1_2]] // CHECK-NEXT: [[MI1_EXT:%.*]] = sext i32 [[MI1]] to i64 // CHECK-NEXT: [[J2:%.*]] = load i32, i32* %j.addr, align 4 // CHECK-NEXT: [[J2_EXT:%.*]] = sext i32 [[J2]] to i64 // CHECK-NEXT: [[I2:%.*]] = load i32, i32* %i.addr, align 4 // CHECK-NEXT: [[I2_EXT:%.*]] = sext i32 [[I2]] to i64 - // CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[IDX2_1:%.*]] = mul i64 [[I2_EXT]], 9 // CHECK-NEXT: [[IDX2_2:%.*]] = add i64 [[IDX2_1]], [[J2_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2_2]], 27 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) + // CHECK-NEXT: [[A2:%.*]] = load <27 x i32>, <27 x i32>* {{.*}}, align 4 // CHECK-NEXT: [[MI2:%.*]] = extractelement <27 x i32> [[A2]], i64 [[IDX2_2]] // CHECK-NEXT: [[MI3:%.*]] = add nsw i32 [[MI2]], 2 // CHECK-NEXT: [[MI3_EXT:%.*]] = sext i32 [[MI3]] to i64 // CHECK-NEXT: [[IDX3_1:%.*]] = mul i64 
[[MI3_EXT]], 5 // CHECK-NEXT: [[IDX3_2:%.*]] = add i64 [[IDX3_1]], [[MI1_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX3_2]], 25 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[B:%.*]] = load <25 x double>, <25 x double>* [[B_PTR:%.*]], align 8 // CHECK-NEXT: [[INS:%.*]] = insertelement <25 x double> [[B]], double 1.500000e+00, i64 [[IDX3_2]] // CHECK-NEXT: store <25 x double> [[INS]], <25 x double>* [[B_PTR]], align 8 diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp index bb1cb74..a082b53 100644 --- a/clang/test/CodeGenCXX/matrix-type-operators.cpp +++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s +// RUN: %clang_cc1 -O0 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s +// RUN: %clang_cc1 -O1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck --check-prefixes=CHECK,OPT %s typedef double dx5x5_t __attribute__((matrix_type(5, 5))); using fx2x3_t = float __attribute__((matrix_type(2, 3))); @@ -94,7 +95,7 @@ struct DoubleWrapper2 { void test_DoubleWrapper2_Add1(MyMatrix &m) { // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add1R8MyMatrixIdLj10ELj9EE( - // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8 + // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.+}}, align 8 // CHECK: [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2) // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0 // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer @@ -109,7 +110,7 @@ void test_DoubleWrapper2_Add1(MyMatrix &m) { void 
test_DoubleWrapper2_Add2(MyMatrix &m) { // CHECK-LABEL: define{{.*}} void @_Z24test_DoubleWrapper2_Add2R8MyMatrixIdLj10ELj9EE( // CHECK: [[SCALAR:%.*]] = call double @_ZN14DoubleWrapper2cvdEv(%struct.DoubleWrapper2* {{[^,]*}} %w2) - // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* %1, align 8 + // CHECK: [[MATRIX:%.*]] = load <90 x double>, <90 x double>* {{.*}}, align 8 // CHECK-NEXT: [[SCALAR_EMBED:%.*]] = insertelement <90 x double> poison, double [[SCALAR]], i32 0 // CHECK-NEXT: [[SCALAR_EMBED1:%.*]] = shufflevector <90 x double> [[SCALAR_EMBED]], <90 x double> poison, <90 x i32> zeroinitializer // CHECK-NEXT: [[RES:%.*]] = fadd <90 x double> [[SCALAR_EMBED1]], [[MATRIX]] @@ -219,6 +220,8 @@ void test_insert_template1(MyMatrix &Mat, unsigned e, unsigned i // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 2 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [4 x i32]* {{.*}} to <4 x i32>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 4 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <4 x i32>, <4 x i32>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <4 x i32> [[MAT]], i32 [[E]], i64 [[IDX2]] // CHECK-NEXT: store <4 x i32> [[MATINS]], <4 x i32>* [[MAT_ADDR]], align 4 @@ -243,6 +246,8 @@ void test_insert_template2(MyMatrix &Mat, float e) { // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_EXT]], 3 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [24 x float]* {{.*}} to <24 x float>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 24 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <24 x float>, <24 x float>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <24 x float> [[MAT]], float [[E]], i64 [[IDX2]] // CHECK-NEXT: store <24 x float> [[MATINS]], <24 x float>* [[MAT_ADDR]], align 4 @@ -292,10 +297,10 @@ const double 
&test_matrix_subscript_reference(const double4x4 m) { // CHECK-NEXT: [[REF_TMP:%.*]] = alloca double, align 8 // CHECK-NEXT: [[NAMELESS0:%.*]] = bitcast [16 x double]* [[M_ADDR]] to <16 x double>* // CHECK-NEXT: store <16 x double> [[M:%.*]], <16 x double>* [[NAMELESS0]], align 8 - // CHECK-NEXT: [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8 + // CHECK: [[NAMELESS1:%.*]] = load <16 x double>, <16 x double>* [[NAMELESS0]], align 8 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[NAMELESS1]], i64 4 // CHECK-NEXT: store double [[MATEXT]], double* [[REF_TMP]], align 8 - // CHECK-NEXT: ret double* [[REF_TMP]] + // CHECK: ret double* [[REF_TMP]] return m[0][1]; } @@ -315,11 +320,13 @@ double extract_IntWrapper_idx(double4x4 &m, IntWrapper i, UnsignedWrapper j) { // CHECK-NEXT: [[J:%.*]] = call i32 @_ZN15UnsignedWrappercvjEv(%struct.UnsignedWrapper* {{[^,]*}} %j) // CHECK-NEXT: [[J_SUB:%.*]] = sub i32 [[J]], 1 // CHECK-NEXT: [[J_SUB_EXT:%.*]] = zext i32 [[J_SUB]] to i64 + // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4 + // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]] + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT_ADDR:%.*]] = load [16 x double]*, [16 x double]** %m.addr, align 8 // CHECK-NEXT: [[MAT_ADDR2:%.*]] = bitcast [16 x double]* [[MAT_ADDR]] to <16 x double>* // CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* [[MAT_ADDR2]], align 8 - // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[J_SUB_EXT]], 4 - // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_ADD_EXT]] // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] return m[i + 1][j - 1]; @@ -358,6 +365,8 @@ void test_constexpr1(matrix_type &m) { // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 4 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [16 x 
float]* %result to <16 x float>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 16 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <16 x float>, <16 x float>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <16 x float> [[MAT]], float 1.000000e+00, i64 [[IDX2]] // CHECK-NEXT: store <16 x float> [[MATINS]], <16 x float>* [[MAT_ADDR]], align 4 @@ -386,6 +395,8 @@ void test_constexpr2(matrix_type &m) { // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[I2_EXT]], 5 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[I_EXT]] // CHECK-NEXT: [[MAT_ADDR:%.*]] = bitcast [25 x i32]* %result to <25 x i32>* + // OPT-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX2]], 25 + // OPT-NEXT: call void @llvm.assume(i1 [[CMP]]) // CHECK-NEXT: [[MAT:%.*]] = load <25 x i32>, <25 x i32>* [[MAT_ADDR]], align 4 // CHECK-NEXT: [[MATINS:%.*]] = insertelement <25 x i32> [[MAT]], i32 1, i64 [[IDX2]] // CHECK-NEXT: store <25 x i32> [[MATINS]], <25 x i32>* [[MAT_ADDR]], align 4 diff --git a/clang/test/CodeGenObjC/matrix-type-operators.m b/clang/test/CodeGenObjC/matrix-type-operators.m index 56ddee8..8e3986e 100644 --- a/clang/test/CodeGenObjC/matrix-type-operators.m +++ b/clang/test/CodeGenObjC/matrix-type-operators.m @@ -22,9 +22,9 @@ typedef double double4x4 __attribute__((matrix_type(4, 4))); // CHECK-NEXT: [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8* // CHECK-NEXT: [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]]) // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64 -// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8 // CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 // CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] +// CHECK-NEXT: [[MAT:%.*]] = load <16 x double>, <16 x double>* {{.*}} align 8 // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] // @@ -49,12 +49,12 @@ 
__attribute__((objc_root_class)) // CHECK-NEXT: [[IV2_PTR:%.*]] = bitcast %0* [[IV2]] to i8* // CHECK-NEXT: [[CALL1:%.*]] = call i32 bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i32 (i8*, i8*)*)(i8* [[IV2_PTR]], i8* [[SEL2]]) // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[CALL1]] to i64 +// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 +// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] // CHECK-NEXT: [[M:%.*]] = load %1*, %1** %m.addr, align 8 // CHECK-NEXT: [[SEL3:%.*]] = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !invariant.load !7 // CHECK-NEXT: [[M_PTR:%.*]] = bitcast %1* [[M]] to i8* // CHECK-NEXT: [[MAT:%.*]] = call <16 x double> bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to <16 x double> (i8*, i8*)*)(i8* [[M_PTR]], i8* [[SEL3]]) -// CHECK-NEXT: [[IDX1:%.*]] = mul i64 [[CONV2]], 4 -// CHECK-NEXT: [[IDX2:%.*]] = add i64 [[IDX1]], [[CONV]] // CHECK-NEXT: [[MATEXT:%.*]] = extractelement <16 x double> [[MAT]], i64 [[IDX2]] // CHECK-NEXT: ret double [[MATEXT]] // diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index d5fc2d1..6cc5797 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -231,9 +231,23 @@ public: : (IsUnsigned ? B.CreateUDiv(LHS, RHS) : B.CreateSDiv(LHS, RHS)); } - /// Extracts the element at (\p RowIdx, \p ColumnIdx) from \p Matrix. - Value *CreateExtractElement(Value *Matrix, Value *RowIdx, Value *ColumnIdx, - unsigned NumRows, Twine const &Name = "") { + /// Create an assumption that \p Idx is less than \p NumElements. 
+ void CreateIndexAssumption(Value *Idx, unsigned NumElements, + Twine const &Name = "") { + + Value *NumElts = + B.getIntN(Idx->getType()->getScalarSizeInBits(), NumElements); + auto *Cmp = B.CreateICmpULT(Idx, NumElts); + if (auto *ConstCond = dyn_cast<ConstantInt>(Cmp)) + assert(ConstCond->isOne() && "Index must be valid!"); + else + B.CreateAssumption(Cmp); + } + + /// Compute the index to access the element at (\p RowIdx, \p ColumnIdx) from + /// a matrix with \p NumRows embedded in a vector. + Value *CreateIndex(Value *RowIdx, Value *ColumnIdx, unsigned NumRows, + Twine const &Name = "") { unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(), ColumnIdx->getType()->getScalarSizeInBits()); @@ -241,9 +255,7 @@ public: RowIdx = B.CreateZExt(RowIdx, IntTy); ColumnIdx = B.CreateZExt(ColumnIdx, IntTy); Value *NumRowsV = B.getIntN(MaxWidth, NumRows); - return B.CreateExtractElement( - Matrix, B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx), - "matext"); + return B.CreateAdd(B.CreateMul(ColumnIdx, NumRowsV), RowIdx); } }; -- 2.7.4