// Create procInfo so it dominates loops, if appropriate.
OpBuilder &builder = edsc::ScopedContext::getBuilderRef();
Location loc = edsc::ScopedContext::getLocation();
- SmallVector<ProcInfo, 2> procInfo;
- if (distributionOptions.hasValue())
- procInfo = distributionOptions->procInfo(builder, loc, loopRanges);
+
+ SmallVector<ProcInfo, 4> procInfo;
+ SmallVector<DistributionMethod, 0> distributionMethod;
+ if (distributionOptions.hasValue()) {
+ // Collect loop ranges for parallel dimensions.
+ SmallVector<Range, 2> parallelLoopRanges;
+ for (auto iteratorType : enumerate(iteratorTypes))
+ if (isParallelIteratorType(iteratorType.value()))
+ parallelLoopRanges.push_back(loopRanges[iteratorType.index()]);
+
+ // Get their distribution schemes.
+ distributionMethod = distributionOptions->distributionMethod;
+ if (distributionMethod.size() < parallelLoopRanges.size())
+ parallelLoopRanges.resize(distributionMethod.size());
+ procInfo = distributionOptions->procInfo(builder, loc, parallelLoopRanges);
+ }
SmallVector<Value, 4> lbs, ubs, steps;
unpackRanges(loopRanges, lbs, ubs, steps);
LoopNest loopNest =
edsc::loopNestBuilder(lbs, ubs, steps, iterArgInitValues, bodyBuilderFn);
- if (!distributionOptions.hasValue() || loopNest.loops.empty())
+ if (!distributionOptions || loopNest.loops.empty())
return;
- // Only supports cyclic distribution for now.
- for (auto it : llvm::zip(loopNest.loops, procInfo,
- distributionOptions->distributionMethod))
+ // Filter out scf.for loops that were created out of parallel dimensions.
+ SmallVector<scf::ForOp, 4> loops;
+ for (auto iteratorType : enumerate(iteratorTypes))
+ if (isParallelIteratorType(iteratorType.value()))
+ loops.push_back(loopNest.loops[iteratorType.index()]);
+
+ // Distribute - only supports cyclic distribution for now.
+ for (auto it : llvm::zip(loops, procInfo, distributionMethod))
if (std::get<2>(it) == DistributionMethod::Cyclic)
mapLoopToProcessorIds(std::get<0>(it), std::get<1>(it).procId,
std::get<1>(it).nprocs);
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
-// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
// CHECK: scf.for %[[ARG3:.*]] =
// CHECK: %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
// CHECK: %[[SV1:.*]] = memref.subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]]
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
-// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
-// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
-// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
// CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
// CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]]
// CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
-// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
// CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
// CHECK: %[[INBOUNDS:.*]] = cmpi slt, %[[LBX]], %{{.*}}
// CHECK: scf.if %[[INBOUNDS]]
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
-// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
-// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
// CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
// CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
// CHECK: %[[STEPX:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSX]]]
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<?x?xf32>
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<?x?xf32>
-// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
-// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
-// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
// CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
// CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]]
// CHECK: scf.parallel (%[[ARG3:.*]]) = (%[[LBY]]) to (%{{.*}}) step (%[[STEPY]])
-> tensor<?x?xf32> {
// CHECK-DAG: %[[C8:.*]] = constant 8 : index
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
-// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
-// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
-// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
-// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
+// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"}
+// CHECK-DAG: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"}
+// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"}
+// CHECK-DAG: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"}
// CHECK: %[[MUL:.+]] = affine.apply #[[MULMAP]]()[%[[BIDY]], %[[C8]]]
// CHECK: %[[LBY:.+]] = affine.apply #[[ADDMAP]]()[%[[MUL]], %[[C0]]]
// CHECK: %[[STEPY:.+]] = affine.apply #[[MULMAP]]()[%[[NBLOCKSY]], %[[C8]]]
// Builds one (processor-id, processor-count) pair per parallel loop range,
// for use when distributing those loops cyclically across GPU processors.
//
// IdOp / NProcsOp are the op types queried for the id and the count
// (e.g. gpu.block_id / gpu.grid_dim, judging by the accompanying CHECK
// lines — confirm against callers).
//
// At most three ranges are mapped, one per hardware dimension "x", "y",
// "z". The ops are created in x, y, z order but stored back-to-front, so
// the innermost parallel loop (last entry) is mapped to dimension "x".
template <typename IdOp, typename NProcsOp>
static SmallVector<ProcInfo, 2>
getGpuProcIds(OpBuilder &b, Location loc, ArrayRef<Range> parallelLoopRanges) {
  // Hardware exposes at most three distributed dimensions.
  size_t count = std::min<size_t>(3, parallelLoopRanges.size());
  SmallVector<ProcInfo, 2> procInfo(count);
  const char *xyz[] = {"x", "y", "z"};
  Type indexType = b.getIndexType();
  for (unsigned i = 0; i < count; ++i) {
    // Fill in reverse: "x" lands in the last slot (innermost loop).
    procInfo[count - 1 - i] = {
        b.create<IdOp>(loc, indexType, b.getStringAttr(xyz[i])),
        b.create<NProcsOp>(loc, indexType, b.getStringAttr(xyz[i]))};
  }
  return procInfo;
}