From ba7f3e1d1e50212bdc8cc438185519fd7257aa44 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache
Date: Mon, 20 Mar 2023 05:17:39 -0700
Subject: [PATCH] [mlir][Transform] Fix support for mapping to GPU warps and
 to linear ids

c59465e1203dd78d06e15f7ddf62141807dbd5a7 introduced mapping to warps and
linear GPU ids.

In the implementation, the delinearization basis is reversed from [x, y, z] to
[z, y, x] order to properly compute the strides and allow delinearization.

Prior to this commit, we forgot to reverse it back to [x, y, z] order before
materializing the indices.

Fix this oversight.
---
 mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp | 10 ++++++++--
 mlir/test/Dialect/GPU/transform-gpu.mlir              | 18 ++++++++++--------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 0f566e4..550d8c1 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -140,12 +140,15 @@ struct GpuWarpIdBuilder : public GpuIdBuilder {
       OpFoldResult warpIdOfr = makeComposedFoldedAffineApply(
           rewriter, loc, d0.floorDiv(kWarpSize), {linearId});
       Value warpId = getValueOrCreateConstantIndexOp(rewriter, loc, warpIdOfr);
+      // Sizes in [x, y, z] -> [z, y, x] order to properly compute strides in
+      // "row-major" order.
       SmallVector<int64_t> reverseBasisSizes(
           llvm::reverse(this->availableMappingSizes));
       SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
       SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
       SmallVector<Value> ids;
-      for (AffineExpr e : delinearizingExprs)
+      // Reverse back to be in [x, y, z] order.
+      for (AffineExpr e : llvm::reverse(delinearizingExprs))
         ids.push_back(makeComposedAffineApply(rewriter, loc, e, warpId));
 
       // clang-format off
@@ -191,13 +194,16 @@ struct GpuLinearIdBuilder : public GpuIdBuilder {
       // Build the linear thread id and decompose it in the basis of
       // `forallMappingSizes`.
       Value linearId = buildLinearThreadId(rewriter, loc, this->blockDimsOfr);
+      // Sizes in [x, y, z] -> [z, y, x] order to properly compute strides in
+      // "row-major" order.
       SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
       SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
       AffineExpr d0;
       bindDims(rewriter.getContext(), d0);
       SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
       SmallVector<Value> ids;
-      for (AffineExpr e : delinearizingExprs)
+      // Reverse back to be in [x, y, z] order.
+      for (AffineExpr e : llvm::reverse(delinearizingExprs))
         ids.push_back(makeComposedAffineApply(rewriter, loc, e, linearId));
 
       // clang-format off
diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index e54af05..e485d41 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -241,12 +241,12 @@ transform.sequence failures(propagate) {
 !type = memref<2 x 32 x f32>
 !type1d = memref<32 x f32>
 
-// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) floordiv 4)>
-// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 4) floordiv 2)>
+// CHECK-DAG: #[[$MAPWX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) floordiv 32) mod 3)>
+// CHECK-DAG: #[[$MAPWY:.*]] = affine_map<(d0, d1) -> ((((d0 + d1 * 12) floordiv 32) mod 6) floordiv 3)>
 // CHECK-DAG: #[[$MAPLIN:.*]] = affine_map<(d0, d1) -> (d0 + d1 * 12)>
-// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) floordiv 20)>
-// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)>
+// CHECK-DAG: #[[$MAPLX:.*]] = affine_map<(d0, d1) -> ((d0 + d1 * 12) mod 10)>
+// CHECK-DAG: #[[$MAPLY:.*]] = affine_map<(d0, d1) -> (((d0 + d1 * 12) mod 20) floordiv 10)>
 
 // CHECK-LABEL: func.func @map_multi_level(
 func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
@@ -277,11 +277,11 @@ func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %str
       memref.store %6, %y[%i, %j] : !type
    }  { mapping = [#gpu.thread, #gpu.thread]}
 
-  // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
   // CHECK-DAG: %[[WIDX:.*]] = affine.apply #[[$MAPWX]](%[[TIDX]], %[[TIDY]])
+  // CHECK-DAG: %[[WIDY:.*]] = affine.apply #[[$MAPWY]](%[[TIDX]], %[[TIDY]])
   // CHECK-DAG: %[[CMPX:.*]] = arith.cmpi ult, %[[WIDX]], %[[C1]] : index
   // CHECK-DAG: %[[CMPY:.*]] = arith.cmpi ult, %[[WIDY]], %[[C1]] : index
-  // CHECK: %[[COND:.*]] = arith.andi %[[CMPY]], %[[CMPX]] : i1
+  // CHECK: %[[COND:.*]] = arith.andi %[[CMPX]], %[[CMPY]] : i1
   // CHECK: scf.if %[[COND]]
   scf.forall (%i) in (%c1) {
      %7 = memref.load %t[%i] : !type1d
@@ -290,10 +290,12 @@ func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %str
   }  {mapping = [#gpu.warp] }
 
   // CHECK-DAG: %[[LIN:.*]] = affine.apply #[[$MAPLIN]](%[[TIDX]], %[[TIDY]])
+  // CHECK-DAG: %[[LIDX:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
   // CHECK-DAG: %[[LIDY:.*]] = affine.apply #[[$MAPLY]](%[[TIDX]], %[[TIDY]])
-  // CHECK-DAG: %[[LIDZ:.*]] = affine.apply #[[$MAPLX]](%[[TIDX]], %[[TIDY]])
   // CHECK-DAG: %[[COND:.*]] = arith.cmpi ult, %[[LIN]], %[[C20]] : index
   // CHECK: scf.if %[[COND]]
+  // CHECK: memref.load %{{.*}}[%[[LIDX]]] : memref<32xf32>
+  // CHECK: memref.store %{{.*}}[%[[LIDY]]] : memref<32xf32>
   scf.forall (%i, %j) in (%c10, %c2) {
      %7 = memref.load %t[%i] : !type1d
      %8 = arith.addf %alpha, %7 : f32
@@ -308,6 +310,6 @@ transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
   transform.gpu.map_nested_forall_to_threads %funcop
-    block_dims = [12, 11, 1] warp_dims = [2, 2, 1]
+    block_dims = [12, 11, 1] warp_dims = [3, 2, 1]
     : (!pdl.operation) -> ()
 }
-- 
2.7.4
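
Note for readers of this patch: the reverse / delinearize / reverse-back arithmetic being fixed can be checked with a small standalone sketch. The helpers below only mimic the computeStrides and delinearize utilities with plain integers (they are not the MLIR APIs), and the basis is the warp_dims = [3, 2, 1] from the updated test; this is an illustration, not part of the change.

// Standalone illustration: delinearize a linear id over a reversed
// [z, y, x] basis, then reverse the coordinates back to [x, y, z].
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Row-major strides for sizes listed slowest-to-fastest varying.
static std::vector<int64_t> rowMajorStrides(const std::vector<int64_t> &sizes) {
  std::vector<int64_t> strides(sizes.size(), 1);
  for (int i = static_cast<int>(sizes.size()) - 2; i >= 0; --i)
    strides[i] = strides[i + 1] * sizes[i + 1];
  return strides;
}

// Successive floordiv/mod along the strides.
static std::vector<int64_t> delinearizeId(int64_t linearId,
                                          const std::vector<int64_t> &strides) {
  std::vector<int64_t> coords;
  for (int64_t stride : strides) {
    coords.push_back(linearId / stride);
    linearId %= stride;
  }
  return coords;
}

int main() {
  // warp_dims = [3, 2, 1] in [x, y, z] order, as in the updated test.
  std::vector<int64_t> mappingSizes = {3, 2, 1};

  // [x, y, z] -> [z, y, x] so the strides come out "row-major": {6, 3, 1}.
  std::vector<int64_t> reversedSizes(mappingSizes.rbegin(), mappingSizes.rend());
  std::vector<int64_t> strides = rowMajorStrides(reversedSizes);

  // Warp id 4 decomposes to [z, y, x] = [0, 1, 1] ...
  std::vector<int64_t> ids = delinearizeId(/*linearId=*/4, strides);
  // ... and must be reversed back to [x, y, z] = [1, 1, 0] before the ids are
  // materialized; this final reversal is exactly what the patch adds.
  std::reverse(ids.begin(), ids.end());
  assert(ids == (std::vector<int64_t>{1, 1, 0}));
  return 0;
}

The values agree with the new FileCheck maps: for warp id 4, x = 4 mod 3 = 1, y = (4 mod 6) floordiv 3 = 1, z = 4 floordiv 6 = 0.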