bool preloadInvariantEquivClass(const InvariantEquivClassTy &IAClass);
void createForVector(__isl_take isl_ast_node *For, int VectorWidth);
- void createForSequential(__isl_take isl_ast_node *For);
+ void createForSequential(__isl_take isl_ast_node *For, bool KnownParallel);
/// Create LLVM-IR that executes a for node thread parallel.
///
// tested for parallelism. Test them here to ensure we check all innermost
// loops for parallelism.
if (Payload->IsInnermost && BuildInfo->InParallelFor) {
- if (Payload->IsOutermostParallel)
+ if (Payload->IsOutermostParallel) {
Payload->IsInnermostParallel = true;
- else
- Payload->IsInnermostParallel =
- astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload);
+ } else {
+ if (PollyVectorizerChoice == VECTORIZER_NONE)
+ Payload->IsInnermostParallel =
+ astScheduleDimIsParallel(Build, BuildInfo->Deps, Payload);
+ }
}
if (Payload->IsOutermostParallel)
BuildInfo->InParallelFor = false;
return Node;
}
+/// Callback invoked before a mark node is built (it is registered through
+/// isl_ast_build_set_before_each_mark).
+///
+/// When the mark is named "SIMD", the InParallelFor flag of the
+/// AstBuildUserInfo passed in @p User is set.
+///
+/// @param MarkId The identifier of the mark that is about to be built.
+/// @param Build  The current AST build; not used by this callback.
+/// @param User   Pointer to the AstBuildUserInfo of the current build.
+///
+/// @return isl_stat_error if @p MarkId is null, isl_stat_ok otherwise.
+static isl_stat astBuildBeforeMark(__isl_keep isl_id *MarkId,
+ __isl_keep isl_ast_build *Build,
+ void *User) {
+ if (!MarkId)
+ return isl_stat_error;
+
+ AstBuildUserInfo *BuildInfo = (AstBuildUserInfo *)User;
+ // NOTE(review): the result of isl_id_get_name() is handed to strcmp()
+ // unchecked -- confirm that every mark id reaching this point has a name.
+ if (!strcmp(isl_id_get_name(MarkId), "SIMD"))
+ BuildInfo->InParallelFor = true;
+
+ return isl_stat_ok;
+}
+
+/// Callback invoked after a mark node has been built (it is registered through
+/// isl_ast_build_set_after_each_mark).
+///
+/// When the mark is named "SIMD", the InParallelFor flag of the
+/// AstBuildUserInfo passed in @p User is cleared again.
+///
+/// @param Node  The mark node that was just built; asserted to be of type
+///              isl_ast_node_mark.
+/// @param Build The current AST build; not used by this callback.
+/// @param User  Pointer to the AstBuildUserInfo of the current build.
+///
+/// @return @p Node, passed through unmodified.
+static __isl_give isl_ast_node *
+astBuildAfterMark(__isl_take isl_ast_node *Node,
+ __isl_keep isl_ast_build *Build, void *User) {
+ assert(isl_ast_node_get_type(Node) == isl_ast_node_mark);
+ AstBuildUserInfo *BuildInfo = (AstBuildUserInfo *)User;
+ // The id obtained here is released below once its name has been inspected.
+ auto *Id = isl_ast_node_mark_get_id(Node);
+ // NOTE(review): the result of isl_id_get_name() is handed to strcmp()
+ // unchecked -- confirm that every mark id reaching this point has a name.
+ if (!strcmp(isl_id_get_name(Id), "SIMD"))
+ BuildInfo->InParallelFor = false;
+ isl_id_free(Id);
+ return Node;
+}
+
static __isl_give isl_ast_node *AtEachDomain(__isl_take isl_ast_node *Node,
__isl_keep isl_ast_build *Build,
void *User) {
&BuildInfo);
Build =
isl_ast_build_set_after_each_for(Build, &astBuildAfterFor, &BuildInfo);
+
+ Build = isl_ast_build_set_before_each_mark(Build, &astBuildBeforeMark,
+ &BuildInfo);
+
+ Build = isl_ast_build_set_after_each_mark(Build, &astBuildAfterMark,
+ &BuildInfo);
}
buildRunCondition(Build);
}
+/// Generate code for a mark node.
+///
+/// The mark node itself is released; its id and its child are extracted
+/// first. If the mark is named "SIMD" and the child is a for node, the loop
+/// is either emitted through createForVector (vectorizer choice
+/// VECTORIZER_POLLY and an iteration count in the range 2..16) or through
+/// createForSequential with KnownParallel set to true. Any other child is
+/// handed to create().
+///
+/// @param Node The mark node to generate code for.
void IslNodeBuilder::createMark(__isl_take isl_ast_node *Node) {
+ auto *Id = isl_ast_node_mark_get_id(Node);
auto Child = isl_ast_node_mark_get_node(Node);
- create(Child);
isl_ast_node_free(Node);
+ // Only a child of a 'SIMD mark' that is still a for node is handled here.
+ // A loop with a single iteration is optimized away, so such a child is not
+ // a for node and is passed to create() below like any other node.
+ if (!strcmp(isl_id_get_name(Id), "SIMD") &&
+ isl_ast_node_get_type(Child) == isl_ast_node_for) {
+ bool Vector = PollyVectorizerChoice == VECTORIZER_POLLY;
+ // The iteration count of the marked loop is used as the vector width.
+ int VectorWidth = getNumberOfIterations(Child);
+ if (Vector && 1 < VectorWidth && VectorWidth <= 16)
+ createForVector(Child, VectorWidth);
+ else
+ createForSequential(Child, true);
+ isl_id_free(Id);
+ return;
+ }
+ create(Child);
+ isl_id_free(Id);
}
void IslNodeBuilder::createForVector(__isl_take isl_ast_node *For,
isl_ast_expr_free(Iterator);
}
-void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For) {
+void IslNodeBuilder::createForSequential(__isl_take isl_ast_node *For,
+ bool KnownParallel) {
isl_ast_node *Body;
isl_ast_expr *Init, *Inc, *Iterator, *UB;
isl_id *IteratorID;
CmpInst::Predicate Predicate;
bool Parallel;
- Parallel =
- IslAstInfo::isParallel(For) && !IslAstInfo::isReductionParallel(For);
+ Parallel = KnownParallel || (IslAstInfo::isParallel(For) &&
+ !IslAstInfo::isReductionParallel(For));
Body = isl_ast_node_for_get_body(For);
createForParallel(For);
return;
}
- createForSequential(For);
+ createForSequential(For, false);
}
void IslNodeBuilder::createIf(__isl_take isl_ast_node *If) {
Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
Node = isl_schedule_node_band_sink(Node);
Node = isl_schedule_node_child(Node, 0);
+ if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
+ Node = isl_schedule_node_parent(Node);
+ isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
+ Node = isl_schedule_node_insert_mark(Node, LoopMarker);
return Node;
}
-; RUN: opt %loadPolly -polly-codegen -polly-vectorizer=polly -S -dce < %s | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-codegen -polly-vectorizer=polly -polly-prevect-width=8 -S -dce < %s | FileCheck %s
;
; void foo(long n, float A[restrict][n], float B[restrict][n],
; float C[restrict][n], float D[restrict][n]) {
; RUN: opt -S %loadPolly -polly-vectorizer=stripmine -polly-opt-isl -polly-ast -analyze < %s | FileCheck %s
; CHECK: // 1st level tiling - Tiles
-; CHECK-NEXT: #pragma known-parallel
-; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
-; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)
-; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {
-; CHECK-NEXT: // 1st level tiling - Points
-; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {
-; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)
-; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1)
-; CHECK-NEXT: #pragma simd
-; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)
-; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
-; CHECK-NEXT: if (32 * c1 + 31 >= nj)
-; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1)
-; CHECK-NEXT: #pragma simd
-; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1)
-; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);
-; CHECK-NEXT: }
-; CHECK-NEXT: }
+; CHECK-NEXT: #pragma known-parallel
+; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
+; CHECK-NEXT: for (int c1 = 0; c1 <= floord(nj - 1, 32); c1 += 1)
+; CHECK-NEXT: for (int c2 = 0; c2 <= floord(nk - 1, 32); c2 += 1) {
+; CHECK-NEXT: // 1st level tiling - Points
+; CHECK-NEXT: for (int c3 = 0; c3 <= min(31, ni - 32 * c0 - 1); c3 += 1) {
+; CHECK-NEXT: for (int c4 = 0; c4 <= min(7, -8 * c1 + nj / 4 - 1); c4 += 1)
+; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
+; CHECK-NEXT: // SIMD
+; CHECK-NEXT: for (int c6 = 0; c6 <= 3; c6 += 1)
+; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
+; CHECK-NEXT: }
+; CHECK-NEXT: if (32 * c1 + 31 >= nj)
+; CHECK-NEXT: #pragma minimal dependence distance: 1
+; CHECK-NEXT: for (int c5 = 0; c5 <= min(31, nk - 32 * c2 - 1); c5 += 1) {
+; CHECK-NEXT: // SIMD
+; CHECK-NEXT: for (int c6 = 0; c6 < nj % 4; c6 += 1)
+; CHECK-NEXT: Stmt_for_body_6(32 * c0 + c3, -(nj % 4) + nj + c6, 32 * c2 + c5);
+; CHECK-NEXT: }
+; CHECK-NEXT: }
+; CHECK-NEXT: }
; Function Attrs: nounwind uwtable
define void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 {
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1)
-; CHECK: #pragma simd
+; CHECK: // SIMD
; CHECK: for (int c2 = 0; c2 <= 3; c2 += 1)
; CHECK: Stmt_for_body3(c0, 4 * c1 + c2);
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1535; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 383; c1 += 1)
; CHECK: for (int c2 = 0; c2 <= 1535; c2 += 1)
-; CHECK: #pragma simd
+; CHECK: // SIMD
; CHECK: for (int c3 = 0; c3 <= 3; c3 += 1)
; CHECK: Stmt_for_body8(c0, 4 * c1 + c3, c2);
; CHECK: for (int c1 = 0; c1 <= 47; c1 += 1)
; CHECK: for (int c2 = 0; c2 <= 31; c2 += 1)
; CHECK: for (int c3 = 0; c3 <= 7; c3 += 1)
-; CHECK: #pragma simd
+; CHECK: // SIMD
; CHECK: for (int c4 = 0; c4 <= 3; c4 += 1)
; CHECK: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 4 * c3 + c4);
; CHECK: #pragma known-parallel
; CHECK: for (int c3 = 0; c3 <= 31; c3 += 1)
; CHECK: for (int c4 = 0; c4 <= 7; c4 += 1)
; CHECK: for (int c5 = 0; c5 <= 31; c5 += 1)
-; CHECK: #pragma simd
+; CHECK: // SIMD
; CHECK: for (int c6 = 0; c6 <= 3; c6 += 1)
; CHECK: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 4 * c4 + c6, 32 * c2 + c5);
; VEC16: for (int c1 = 0; c1 <= 47; c1 += 1)
; VEC16: for (int c2 = 0; c2 <= 31; c2 += 1)
; VEC16: for (int c3 = 0; c3 <= 1; c3 += 1)
-; VEC16: #pragma simd
+; VEC16: // SIMD
; VEC16: for (int c4 = 0; c4 <= 15; c4 += 1)
; VEC16: Stmt_for_body3(32 * c0 + c2, 32 * c1 + 16 * c3 + c4);
; VEC16: #pragma known-parallel
; VEC16: for (int c3 = 0; c3 <= 31; c3 += 1)
; VEC16: for (int c4 = 0; c4 <= 1; c4 += 1)
; VEC16: for (int c5 = 0; c5 <= 31; c5 += 1)
-; VEC16: #pragma simd
+; VEC16: // SIMD
; VEC16: for (int c6 = 0; c6 <= 15; c6 += 1)
; VEC16: Stmt_for_body8(32 * c0 + c3, 32 * c1 + 16 * c4 + c6, 32 * c2 + c5);
; VEC16: }
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c3 = 0; c3 <= 1; c3 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c4 = 0; c4 <= 7; c4 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c5 = 0; c5 <= 1; c5 += 1) {
-; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd
+; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4, 16 * c1 + 8 * c3 + 4 * c5 + c8);
-; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: #pragma simd
+; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: // SIMD
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: for (int c8 = 0; c8 <= 3; c8 += 1)
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: Stmt_for_body3(256 * c0 + 16 * c2 + 2 * c4 + 1, 16 * c1 + 8 * c3 + 4 * c5 + c8);
; TWO-PLUS-REGISTER-PLUS-VECTORIZATION: }
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-dir=%S -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-vectorizer=polly -polly-codegen < %s -S | FileCheck %s
; #pragma known-parallel
; for (int c0 = 0; c0 <= 31; c0 += 1)