From: Roman Gareev Date: Wed, 21 Dec 2016 12:51:12 +0000 (+0000) Subject: Change the determination of parameters of macro-kernel X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=be5299af0b88b74bf986d223883c0992bd0d993f;p=platform%2Fupstream%2Fllvm.git Change the determination of parameters of macro-kernel Typically processor architectures do not include an L3 cache, which means that Nc, the parameter of the macro-kernel, is, for all practical purposes, redundant ([1]). However, small values of Nc can cause the redundant packing of the same elements of the matrix A, the first operand of the matrix multiplication. At the same time, big values of the parameter Nc can cause segmentation faults in case the available stack is exceeded. This patch adds an option to specify the parameter Nc as a multiple of the parameter of the micro-kernel Nr. In case of Intel Core i7-3820 SandyBridge and the following options, clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME -march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true -DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8 -mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm -polly-target-latency-vector-fma=8 it helps to improve the performance from 11.303 GFlops/sec (39.247% of theoretical peak) to 17.896 GFlops/sec (62.14% of theoretical peak). 
Refs.: [1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf Reviewed-by: Tobias Grosser Differential Revision: https://reviews.llvm.org/D28019 llvm-svn: 290256 --- diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index ae7d733..54e3d8d 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -185,6 +185,12 @@ static cl::opt RegisterDefaultTileSize( " --polly-register-tile-sizes)"), cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory)); +static cl::opt PollyPatternMatchingNcQuotient( + "polly-pattern-matching-nc-quotient", + cl::desc("Quotient that is obtained by dividing Nc, the parameter of the" + "macro-kernel, by Nr, the parameter of the micro-kernel"), + cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory)); + static cl::list RegisterTileSizes("polly-register-tile-sizes", cl::desc("A tile size for each loop dimension, filled " @@ -610,6 +616,9 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) { CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 && CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2)) return {1, 1, 1}; + // The quotient should be greater than zero. 
+ if (PollyPatternMatchingNcQuotient <= 0) + return {1, 1, 1}; int Car = floor( (CacheLevelAssociativity[0] - 1) / (1 + static_cast(MicroKernelParams.Nr) / MicroKernelParams.Mr)); @@ -618,7 +627,7 @@ getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) { double Cac = static_cast(Kc * 8 * CacheLevelAssociativity[1]) / CacheLevelSizes[1]; int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac); - int Nc = floor(1 / Cac); + int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr; return {Mc, Nc, Kc}; } diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll index e7a1dc8..4f723c5 100644 --- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll +++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll @@ -9,14 +9,14 @@ ; C[i][j] += alpha * A[i][k] * B[k][j]; ; } ; -; CHECK: double Packed_B[ { [] -> [(2)] } ][ { [] -> [(256)] } ][ { [] -> [(8)] } ]; // Element size 8 +; CHECK: double Packed_B[ { [] -> [(256)] } ][ { [] -> [(256)] } ][ { [] -> [(8)] } ]; ; CHECK-NEXT: double Packed_A[ { [] -> [(24)] } ][ { [] -> [(256)] } ][ { [] -> [(4)] } ]; // Element size 8 ; ; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg6[i0, i2] }; ; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_A[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 4*floor((-i0 + o2)/4) = -i0 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 3 and -3 + i0 - 4o0 <= 96*floor((i0)/96) <= i0 - 4o0 }; ; ; CHECK: { Stmt_Copy_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; -; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 16*floor((i1)/16) <= i1 - 8o0 }; +; CHECK-NEXT: new: { Stmt_Copy_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 
2048*floor((i1)/2048) <= i1 - 8o0 }; ; ; CHECK: CopyStmt_0 ; CHECK-NEXT: Domain := @@ -25,7 +25,7 @@ ; CHECK-NEXT: ; ; CHECK-NEXT: MustWriteAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: null; -; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 16*floor((i1)/16) <= i1 - 8o0 }; +; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> Packed_B[o0, o1, o2] : 256*floor((-i2 + o1)/256) = -i2 + o1 and 8*floor((-i1 + o2)/8) = -i1 + o2 and 0 <= o1 <= 255 and 0 <= o2 <= 7 and -7 + i1 - 8o0 <= 2048*floor((i1)/2048) <= i1 - 8o0 }; ; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0] ; CHECK-NEXT: null; ; CHECK-NEXT: new: { CopyStmt_0[i0, i1, i2] -> MemRef_arg7[i2, i1] }; diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll index 9ff8da1..73d48e3 100644 --- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll +++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll @@ -20,60 +20,59 @@ ; CHECK-NEXT: Stmt_bb9(32 * c0 + c2, 32 * c1 + c3); ; CHECK-NEXT: } ; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: for (int c0 = 0; c0 <= 65; c0 += 1) -; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { -; CHECK-NEXT: for (int c3 = 16 * c0; c3 <= 16 * c0 + 15; c3 += 1) -; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1) -; CHECK-NEXT: CopyStmt_0(0, c3, c4); -; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { -; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) -; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1) -; CHECK-NEXT: CopyStmt_1(c3, 0, c5); -; CHECK-NEXT: // 1st level tiling - Points -; CHECK-NEXT: // Register tiling - Tiles -; CHECK-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) -; CHECK-NEXT: 
for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) { -; CHECK-NEXT: // Register tiling - Points -; CHECK-NEXT: // 1st level tiling - Tiles -; CHECK-NEXT: // 1st level tiling - Points -; CHECK-NEXT: { -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; 
CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; CHECK-NEXT: } +; CHECK-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) { +; CHECK-NEXT: for (int c3 = 0; c3 <= 1055; c3 += 1) +; CHECK-NEXT: for (int c4 = 256 * c1; c4 <= min(1022, 256 * c1 + 255); c4 += 1) +; CHECK-NEXT: CopyStmt_0(0, c3, c4); +; CHECK-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; CHECK-NEXT: for (int c3 = 96 * c2; c3 <= 96 * c2 + 95; c3 += 1) +; CHECK-NEXT: for (int c5 = 256 * c1; c5 <= min(1022, 256 * c1 + 255); c5 += 1) +; CHECK-NEXT: CopyStmt_1(c3, 0, c5); +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: // Register tiling - Tiles +; CHECK-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1) +; CHECK-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; CHECK-NEXT: for (int c5 = 0; c5 <= min(255, -256 * c1 + 1022); c5 += 1) { +; CHECK-NEXT: // Register tiling - Points +; CHECK-NEXT: // 1st level tiling - Tiles +; CHECK-NEXT: // 1st level tiling - Points +; CHECK-NEXT: { +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: 
Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 
8 * c3 + 4, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); +; CHECK-NEXT: Stmt_Copy_0(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); ; CHECK-NEXT: } -; CHECK-NEXT: } +; CHECK-NEXT: } ; CHECK-NEXT: } +; CHECK-NEXT: } ; CHECK-NEXT: } ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll index f2117eb..f5b99ee 100644 --- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll +++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll @@ -73,53 +73,52 @@ ; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb14(32 * c0 + c2, 32 * c1 + c3); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 65; c0 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 1; c3 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) -; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles -; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points -; EXTRACTION-OF-MACRO-KERNEL-NEXT: { -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 2, 
256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 5, 256 * c1 + 
c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 1, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 2, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 3, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 4, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 5, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 6, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 16 * c0 + 8 * c3 + 7, 256 * c1 + c5); -; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c1 = 0; c1 <= 3; c1 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c2 = 0; c2 <= 10; c2 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c3 = 0; c3 <= 131; c3 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c4 = 0; c4 <= 23; c4 += 1) +; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c5 = 0; c5 <= 255; c5 += 1) { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // Register tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles +; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Points +; EXTRACTION-OF-MACRO-KERNEL-NEXT: { +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 1, 256 * c1 + 
c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 1, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 6, 256 * c1 + c5); +; 
EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 2, 8 * c3 + 7, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 1, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 2, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 3, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 4, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 5, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 6, 256 * c1 + c5); +; EXTRACTION-OF-MACRO-KERNEL-NEXT: Stmt_bb24(96 * c2 + 4 * c4 + 3, 8 * c3 + 7, 256 * c1 + c5); ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } -; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } +; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; EXTRACTION-OF-MACRO-KERNEL-NEXT: } ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"