Add build-time option for OMP scheduler; document MULTITHREAD_THRESHOLD range (#1620)

author Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>

Fri, 15 Jun 2018 09:25:05 +0000 (11:25 +0200)

committer GitHub <noreply@github.com>

Fri, 15 Jun 2018 09:25:05 +0000 (11:25 +0200)
author Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Fri, 15 Jun 2018 09:25:05 +0000 (11:25 +0200)
committer GitHub <noreply@github.com>
Fri, 15 Jun 2018 09:25:05 +0000 (11:25 +0200)
diff --git a/Makefile.rule b/Makefile.rule

index 5c03d01..649aabe 100644 (file)
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
  # This flag is always set for POWER8. Don't modify the flag 
  # USE_OPENMP = 1
  
+# The OpenMP scheduler to use - by default this is "static" and you
+# will normally not want to change this unless you know that your main
+# workload will involve tasks that have highly unbalanced running times
+# for individual threads. Changing away from "static" may also adversely
+# affect memory access locality in NUMA systems. Setting to "runtime" will
+# allow you to select the scheduler from the environment variable OMP_SCHEDULE.
+# CCOMMON_OPT += -DOMP_SCHED=dynamic
+
  # You can define maximum number of threads. Basically it should be
  # less than actual number of cores. If you don't specify one, it's
  # automatically detected by the script.
@@ -156,8 +164,11 @@ NO_AFFINITY = 1
  # CONSISTENT_FPCSR = 1
  
  # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
-# with single thread. You can use this flag to avoid the overhead of multi-threading
-# in small matrix sizes. The default value is 4.
+# with single thread. (Actually in recent versions this is a factor proportional to the
+# number of floating point operations necessary for the given problem size, no longer
+# an individual dimension). You can use this setting to avoid the overhead of multi-
+# threading in small matrix sizes. The default value is 4, but values as high as 50 have 
+# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
  # GEMM_MULTITHREAD_THRESHOLD = 4
  
  # If you need a sanity check by comparing against reference BLAS. It'll be very
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c

index fccdb43..4255852 100644 (file)
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -48,6 +48,10 @@
  
  #else
  
+#ifndef OMP_SCHED
+#define OMP_SCHED static
+#endif
+
  int blas_server_avail = 0;
  
  static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
@@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
        break;
    }
  
-#pragma omp parallel for schedule(static)
+#pragma omp parallel for schedule(OMP_SCHED)
    for (i = 0; i < num; i ++) {
  
  #ifndef USE_SIMPLE_THREADED_LEVEL3
author	Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
	Fri, 15 Jun 2018 09:25:05 +0000 (11:25 +0200)
committer	GitHub <noreply@github.com>
	Fri, 15 Jun 2018 09:25:05 +0000 (11:25 +0200)
Makefile.rule		patch \| blob \| history
driver/others/blas_server_omp.c		patch \| blob \| history