__atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
MEMMODEL_RELAXED);
}
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
}
void
void
gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
{
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1) /* Guard the hardware barrier: with a team of one there is no partner wave to rendezvous with -- presumably s_barrier would stall or is pointless then; confirm against GCN ISA docs.  */
+ asm ("s_barrier" ::: "memory");
}
void
state &= ~BAR_CANCELLED;
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
return;
}
}
{
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
return false;
}
}
abort();
}
- asm ("s_barrier" ::: "memory");
+ if (bar->total > 1)
+ asm ("s_barrier" ::: "memory");
gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
if (__builtin_expect (gen & BAR_CANCELLED, 0))
return true;
__atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
MEMMODEL_RELEASE);
}
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}
void
void
gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
{
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1) /* Skip bar.sync for single-thread teams; the sync counts 32 * bar->total threads, so with total == 1 the barrier is presumably unnecessary (or would mis-count) -- confirm against PTX ISA docs.  */
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}
void
state &= ~BAR_CANCELLED;
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
return;
}
}
state &= ~BAR_CANCELLED;
do
{
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
{
{
state += BAR_INCR - BAR_WAS_LAST;
__atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
return false;
}
}
generation = state;
do
{
- asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+ if (bar->total > 1)
+ asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
if (__builtin_expect (gen & BAR_CANCELLED, 0))
return true;
--- /dev/null
+/* Ensure that nested parallel regions work even when the number of loop
+ iterations is not divisible by the number of threads. */
+
+#include <stdlib.h>
+
+int main() {
+ int A[30][40], B[30][40];
+ size_t n = 30; /* 30 iterations per parallel loop: deliberately not divisible by the 8-thread team below */
+
+ for (size_t i = 0; i < 30; ++i) /* fill the full 30x40 source array */
+ for (size_t j = 0; j < 40; ++j)
+ A[i][j] = 42;
+
+#pragma omp target map(A[0:30][0:40], B[0:30][0:40]) /* map both whole arrays to the device */
+ {
+#pragma omp parallel for num_threads(8) /* 30 iterations on 8 threads: unbalanced distribution */
+ for (size_t i = 0; i < n; ++i)
+ {
+#pragma omp parallel for /* nested parallel region inside each outer iteration */
+ for (size_t j = 0; j < n; ++j) /* NOTE(review): only the first n=30 of 40 columns are copied; the check below matches this */
+ {
+ B[i][j] = A[i][j];
+ }
+ }
+ }
+
+for (size_t i = 0; i < n; ++i) /* host-side verification of the copied 30x30 region */
+ for (size_t j = 0; j < n; ++j)
+ if (B[i][j] != 42)
+ abort (); /* any mismatch fails the test */
+}