From 9e7e297da33be70ec41335800c05b554f5de065b Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache
Date: Fri, 18 Oct 2019 14:09:42 -0700
Subject: [PATCH] Lower vector transfer ops to loop.for operations.

This allows mixing linalg operations with vector transfer operations (with
additional modifications to affine ops) and is a step towards solving
tensorflow/mlir#189.

PiperOrigin-RevId: 275543361
---
 .../Linalg/Transforms/LowerToLLVMDialect.cpp     |  2 +-
 mlir/lib/EDSC/Builders.cpp                       |  1 -
 mlir/lib/EDSC/Helpers.cpp                        |  5 +---
 mlir/lib/Transforms/LowerVectorTransfers.cpp     | 27 +++++++++++--------
 mlir/test/Dialect/Linalg/llvm.mlir               |  4 +--
 mlir/test/Dialect/Linalg/loops.mlir              |  2 +-
 mlir/test/Dialect/Linalg/roundtrip.mlir          |  2 +-
 .../Vectorize/lower_vector_transfers.mlir        | 30 ++++++++++++++--------
 .../mlir-cpu-runner/linalg_integration_test.mlir | 12 ++++-----
 mlir/test/mlir-cpu-runner/utils.mlir             |  8 +++---
 10 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/LowerToLLVMDialect.cpp b/mlir/lib/Dialect/Linalg/Transforms/LowerToLLVMDialect.cpp
index a0955d5..90a76de 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/LowerToLLVMDialect.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/LowerToLLVMDialect.cpp
@@ -770,5 +770,5 @@ mlir::linalg::createLowerLinalgToLLVMPass() {
 }
 
 static PassRegistration<LowerLinalgToLLVMPass>
-    pass("linalg-convert-to-llvm",
+    pass("convert-linalg-to-llvm",
         "Lower the operations from the linalg dialect into the LLVM dialect");
diff --git a/mlir/lib/EDSC/Builders.cpp b/mlir/lib/EDSC/Builders.cpp
index f1dceec..4046a7c 100644
--- a/mlir/lib/EDSC/Builders.cpp
+++ b/mlir/lib/EDSC/Builders.cpp
@@ -320,7 +320,6 @@ categorizeValueByAffineType(MLIRContext *context, Value *val, unsigned &numDims,
     d = getAffineSymbolExpr(numSymbols++, context);
     resultVal = val;
   } else {
-    assert(isValidDim(val) && "Must be a valid Dim");
     d = getAffineDimExpr(numDims++, context);
     resultVal = val;
   }
diff --git a/mlir/lib/EDSC/Helpers.cpp b/mlir/lib/EDSC/Helpers.cpp
index b4455c4..eeb2866 100644
--- a/mlir/lib/EDSC/Helpers.cpp
+++ b/mlir/lib/EDSC/Helpers.cpp
@@ -24,11 +24,8 @@ using namespace mlir::edsc;
 
 static SmallVector<ValueHandle, 8> getMemRefSizes(Value *memRef) {
   MemRefType memRefType = memRef->getType().cast<MemRefType>();
+  assert(isStrided(memRefType) && "Expected strided MemRef type");
 
-  auto maps = memRefType.getAffineMaps();
-  (void)maps;
-  assert((maps.empty() || (maps.size() == 1 && maps[0].isIdentity())) &&
-         "Layout maps not supported");
   SmallVector<ValueHandle, 8> res;
   res.reserve(memRefType.getShape().size());
   const auto &shape = memRefType.getShape();
diff --git a/mlir/lib/Transforms/LowerVectorTransfers.cpp b/mlir/lib/Transforms/LowerVectorTransfers.cpp
index c7d986c..c517d74 100644
--- a/mlir/lib/Transforms/LowerVectorTransfers.cpp
+++ b/mlir/lib/Transforms/LowerVectorTransfers.cpp
@@ -25,6 +25,7 @@
 #include "mlir/Analysis/NestedMatcher.h"
 #include "mlir/Analysis/Utils.h"
 #include "mlir/Analysis/VectorAnalysis.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
 #include "mlir/Dialect/StandardOps/Ops.h"
 #include "mlir/Dialect/VectorOps/VectorOps.h"
 #include "mlir/EDSC/Builders.h"
@@ -54,9 +55,9 @@
 ///    // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into
 ///    // vector<32x256xf32> and pad with %f0 to handle the boundary case:
 ///    %f0 = constant 0.0f : f32
-///    affine.for %i0 = 0 to %0 {
-///      affine.for %i1 = 0 to %1 step 256 {
-///        affine.for %i2 = 0 to %2 step 32 {
+///    loop.for %i0 = 0 to %0 {
+///      loop.for %i1 = 0 to %1 step %c256 {
+///        loop.for %i2 = 0 to %2 step %c32 {
 ///          %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0)
 ///               {permutation_map: (d0, d1, d2) -> (d2, d1)} :
 ///               memref<?x?x?xf32>, vector<32x256xf32>
@@ -68,8 +69,8 @@
 /// abstraction):
 ///
 /// ```mlir {.mlir}
-///    affine.for %d2 = 0 to 256 {
-///      affine.for %d1 = 0 to 32 {
+///    loop.for %d2 = 0 to %c256 {
+///      loop.for %d1 = 0 to %c32 {
 ///        %s = %A[%i0, %i1 + %d1, %i2 + %d2] : f32
 ///        %tmp[%d2, %d1] = %s
 ///      }
@@ -126,7 +127,7 @@ struct VectorTransferRewriter : public RewritePattern {
 /// Analyzes the `transfer` to find an access dimension along the fastest remote
 /// MemRef dimension. If such a dimension with coalescing properties is found,
 /// `pivs` and `vectorView` are swapped so that the invocation of
-/// AffineLoopNestBuilder captures it in the innermost loop.
+/// LoopNestBuilder captures it in the innermost loop.
 template <typename VectorTransferOpTy>
 void coalesceCopy(VectorTransferOpTy transfer,
                   SmallVectorImpl<edsc::ValueHandle> *pivs,
@@ -282,13 +283,16 @@ VectorTransferRewriter<VectorTransferReadOp>::matchAndRewrite(
   auto lbs = vectorView.getLbs();
   auto ubs = vectorView.getUbs();
-  auto steps = vectorView.getSteps();
+  SmallVector<ValueHandle, 8> steps;
+  steps.reserve(vectorView.getSteps().size());
+  for (auto step : vectorView.getSteps())
+    steps.push_back(constant_index(step));
 
   // 2. Emit alloc-copy-load-dealloc.
   ValueHandle tmp = alloc(tmpMemRefType(transfer));
   IndexedValue local(tmp);
   ValueHandle vec = vector_type_cast(tmp, vectorMemRefType(transfer));
-  AffineLoopNestBuilder(pivs, lbs, ubs, steps)([&] {
+  LoopNestBuilder(pivs, lbs, ubs, steps)([&] {
     // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
     local(ivs) = remote(clip(transfer, view, ivs));
   });
 
@@ -342,14 +346,17 @@ VectorTransferRewriter<VectorTransferWriteOp>::matchAndRewrite(
   auto lbs = vectorView.getLbs();
   auto ubs = vectorView.getUbs();
-  auto steps = vectorView.getSteps();
+  SmallVector<ValueHandle, 8> steps;
+  steps.reserve(vectorView.getSteps().size());
+  for (auto step : vectorView.getSteps())
+    steps.push_back(constant_index(step));
 
   // 2. Emit alloc-store-copy-dealloc.
   ValueHandle tmp = alloc(tmpMemRefType(transfer));
   IndexedValue local(tmp);
   ValueHandle vec = vector_type_cast(tmp, vectorMemRefType(transfer));
   std_store(vectorValue, vec, {constant_index(0)});
-  AffineLoopNestBuilder(pivs, lbs, ubs, steps)([&] {
+  LoopNestBuilder(pivs, lbs, ubs, steps)([&] {
     // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
     remote(clip(transfer, view, ivs)) = local(ivs);
   });
 
diff --git a/mlir/test/Dialect/Linalg/llvm.mlir b/mlir/test/Dialect/Linalg/llvm.mlir
index a2da8e04..6cf67c8 100644
--- a/mlir/test/Dialect/Linalg/llvm.mlir
+++ b/mlir/test/Dialect/Linalg/llvm.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -linalg-convert-to-llvm | FileCheck %s
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm | FileCheck %s --check-prefix=LLVM-LOOPS
+// RUN: mlir-opt %s -convert-linalg-to-llvm | FileCheck %s
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm | FileCheck %s --check-prefix=LLVM-LOOPS
 
 func @buffer_size(%arg0: !linalg.buffer) {
   %c1 = constant 1 : index
diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir
index a76aa96..2b293a9 100644
--- a/mlir/test/Dialect/Linalg/loops.mlir
+++ b/mlir/test/Dialect/Linalg/loops.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s -linalg-lower-to-loops | FileCheck %s
 
 // Test that we can lower all the way to LLVM without crashing, don't check results here.
-// RUN: mlir-opt %s --linalg-convert-to-llvm -o=/dev/null 2>&1
+// RUN: mlir-opt %s --convert-linalg-to-llvm -o=/dev/null 2>&1
 
 // CHECK-DAG: #[[strided1D:.*]] = (d0)[s0] -> (d0 + s0)
 // CHECK-DAG: #[[strided2D:.*]] = (d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)
diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
index 6c660a1..7ef0699 100644
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s | mlir-opt | FileCheck %s
 
 // Test that we can lower all the way to LLVM without crashing, don't check results here.
-// RUN: mlir-opt %s --linalg-convert-to-llvm -o=/dev/null 2>&1
+// RUN: mlir-opt %s --convert-linalg-to-llvm -o=/dev/null 2>&1
 
 // CHECK-DAG: #[[strided1D:.*]] = (d0)[s0] -> (d0 + s0)
 // CHECK-DAG: #[[strided2D:.*]] = (d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)
diff --git a/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir b/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir
index 5d8acea..31f8bf6 100644
--- a/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir
+++ b/mlir/test/Transforms/Vectorize/lower_vector_transfers.mlir
@@ -53,8 +53,12 @@ func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %d
 
 // CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
 func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
-  // CHECK-NEXT: %[[C0:.*]] = constant 0 : index
-  // CHECK-NEXT: %{{.*}} = alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
+  // CHECK-DAG: %[[C0:.*]] = constant 0 : index
+  // CHECK-DAG: %[[C1:.*]] = constant 1 : index
+  // CHECK-DAG: %[[C3:.*]] = constant 3 : index
+  // CHECK-DAG: %[[C4:.*]] = constant 4 : index
+  // CHECK-DAG: %[[C5:.*]] = constant 5 : index
+  // CHECK: %{{.*}} = alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
   // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
   // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} {
   // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
@@ -65,9 +69,9 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
   // CHECK-NEXT: %[[D3:.*]] = dim %{{.*}}, 3 : memref<?x?x?x?xf32>
   // CHECK: %[[ALLOC:.*]] = alloc() : memref<5x4x3xf32>
   // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast %[[ALLOC]] : memref<5x4x3xf32>, memref<1xvector<5x4x3xf32>>
-  // CHECK-NEXT: affine.for %[[I4:.*]] = 0 to 3 {
-  // CHECK-NEXT: affine.for %[[I5:.*]] = 0 to 4 {
-  // CHECK-NEXT: affine.for %[[I6:.*]] = 0 to 5 {
+  // CHECK-NEXT: loop.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
+  // CHECK-NEXT: loop.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
+  // CHECK-NEXT: loop.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
   // CHECK-NEXT: {{.*}} = affine.apply #[[ADD]](%[[I0]], %[[I4]])
   // CHECK-NEXT: {{.*}} = affine.apply #[[SUB]]()[%[[D0]]]
   // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index
@@ -126,9 +130,13 @@
 
 // CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
 func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
-  // CHECK-NEXT: %{{.*}} = constant dense<1.000000e+00> : vector<5x4x3xf32>
-  // CHECK-NEXT: %[[C0:.*]] = constant 0 : index
-  // CHECK-NEXT: %{{.*}} = alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
+  // CHECK-DAG: %{{.*}} = constant dense<1.000000e+00> : vector<5x4x3xf32>
+  // CHECK-DAG: %[[C0:.*]] = constant 0 : index
+  // CHECK-DAG: %[[C1:.*]] = constant 1 : index
+  // CHECK-DAG: %[[C3:.*]] = constant 3 : index
+  // CHECK-DAG: %[[C4:.*]] = constant 4 : index
+  // CHECK-DAG: %[[C5:.*]] = constant 5 : index
+  // CHECK: %{{.*}} = alloc(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : memref<?x?x?x?xf32>
   // CHECK-NEXT: affine.for %[[I0:.*]] = 0 to %{{.*}} step 3 {
   // CHECK-NEXT: affine.for %[[I1:.*]] = 0 to %{{.*}} step 4 {
   // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} {
@@ -140,9 +148,9 @@ func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
   // CHECK: %[[ALLOC:.*]] = alloc() : memref<5x4x3xf32>
   // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast {{.*}} : memref<5x4x3xf32>, memref<1xvector<5x4x3xf32>>
   // CHECK: store %{{.*}}, {{.*}} : memref<1xvector<5x4x3xf32>>
-  // CHECK-NEXT: affine.for %[[I4:.*]] = 0 to 3 {
-  // CHECK-NEXT: affine.for %[[I5:.*]] = 0 to 4 {
-  // CHECK-NEXT: affine.for %[[I6:.*]] = 0 to 5 {
+  // CHECK-NEXT: loop.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
+  // CHECK-NEXT: loop.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
+  // CHECK-NEXT: loop.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
   // CHECK-NEXT: {{.*}} = affine.apply #[[ADD]](%[[I0]], %[[I4]])
   // CHECK-NEXT: {{.*}} = affine.apply #[[SUB]]()[%[[D0]]]
   // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index
diff --git a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir
index e63cecb..c533ed1 100644
--- a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir
+++ b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir
@@ -1,9 +1,9 @@
-// RUN: mlir-opt %s -linalg-convert-to-llvm | mlir-cpu-runner -e dot -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm | mlir-cpu-runner -e dot -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
-// RUN: mlir-opt %s -linalg-convert-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
-// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -linalg-lower-to-loops -linalg-convert-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
-// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -linalg-convert-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
+// RUN: mlir-opt %s -convert-linalg-to-llvm | mlir-cpu-runner -e dot -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e dot -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
+// RUN: mlir-opt %s -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
+// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -linalg-lower-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
+// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s
 
 #strided1D = (d0)[s0] -> (d0 + s0)
 #strided2D = (d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)
diff --git a/mlir/test/mlir-cpu-runner/utils.mlir b/mlir/test/mlir-cpu-runner/utils.mlir
index 314916f..9e1498e 100644
--- a/mlir/test/mlir-cpu-runner/utils.mlir
+++ b/mlir/test/mlir-cpu-runner/utils.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm -lower-to-llvm | mlir-cpu-runner -e print_0d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-0D
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm -lower-to-llvm | mlir-cpu-runner -e print_1d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-1D
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm -lower-to-llvm | mlir-cpu-runner -e print_3d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-3D
-// RUN: mlir-opt %s -linalg-lower-to-loops -linalg-convert-to-llvm -lower-to-llvm | mlir-cpu-runner -e vector_splat_2d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-VECTOR-SPLAT-2D
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm -lower-to-llvm | mlir-cpu-runner -e print_0d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-0D
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm -lower-to-llvm | mlir-cpu-runner -e print_1d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-1D
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm -lower-to-llvm | mlir-cpu-runner -e print_3d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-3D
+// RUN: mlir-opt %s -linalg-lower-to-loops -convert-linalg-to-llvm -lower-to-llvm | mlir-cpu-runner -e vector_splat_2d -entry-point-result=void -shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext | FileCheck %s --check-prefix=PRINT-VECTOR-SPLAT-2D
 
 func @print_0d() {
   %f = constant 2.00000e+00 : f32
-- 
2.7.4
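
For orientation, a minimal before/after sketch of the rewrite this patch changes. The IR is hypothetical (the names %A, %i, %v, %tmp, %vec, and the constants %c0/%c1/%c256 are illustrative, not taken from the patch), written against the vector and loop dialect syntax of this MLIR revision:

```mlir
// Before the pass runs: a 1-D read of a vector<256xf32> slice of %A,
// padded with %f0 past the memref boundary.
%v = vector.transfer_read %A[%i], (%f0) {permutation_map: (d0) -> (d0)}
  : memref<?xf32>, vector<256xf32>

// After the pass: the alloc-copy-load-dealloc form, now emitted through
// LoopNestBuilder. Bounds and step are SSA index values materialized with
// constant_index, as loop.for requires.
%tmp = alloc() : memref<256xf32>
loop.for %iv = %c0 to %c256 step %c1 {
  // ... clipped scalar copy from %A into %tmp happens here ...
}
%vec = vector.type_cast %tmp : memref<256xf32>, memref<1xvector<256xf32>>
%v = load %vec[%c0] : memref<1xvector<256xf32>>
dealloc %tmp : memref<256xf32>
```

The visible difference from the old AffineLoopNestBuilder path is in the step handling: affine.for encodes its step as a static attribute, so `vectorView.getSteps()` could be passed through directly, whereas loop.for takes its lower bound, upper bound, and step as index-typed SSA operands. That is why both rewriters now materialize each step with `constant_index(step)`, and why the updated tests check for the %[[C1]]/%[[C3]]/%[[C4]]/%[[C5]] constants feeding the loop.for ops.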