1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
18 #include "mkldnn_thread.hpp"
20 #include "gemm_utils_f32.hpp"
25 namespace gemm_utils {
26 #define BM_NOCOPY_AVX 64
27 #define BN_NOCOPY_AVX 48
28 #define BK_NOCOPY_AVX 384
29 #define BN_LARGE_NOCOPY_AVX 192
30 #define BM_SMALL_NOCOPY_AVX 16
31 #define BN_SMALL_NOCOPY_AVX 1
32 #define BK_SMALL_NOCOPY_AVX 4
33 // Determine number of threads for each dimension of a 3-D partitioning
34 // algorithm based on input parameters
35 // m/n/k - First/second/third parameter for GEMM
36 // nthrs - total available number of threads
37 // nthrs_m/nthrs_n/nthrs_k - number of threads to use in each dimension
38 // BM/BN/BK - blocking values
39 void calc_nthr_nocopy_avx(int m, int n, int k,
40 int nthrs, int *nthrs_m, int *nthrs_n, int *nthrs_k, int *BM, int *BN,
// NOTE(review): this excerpt elides several original lines (the trailing
// "int *BK" parameter, braces, locals such as MB/NB/KB, and parts of the
// partitioning loop bodies). Comments below describe only visible code.
43 int nthr, nthr_m, nthr_n, nthr_k;
// Initial guess: one thread per M-block / N-block (ceiling division by
// the AVX blocking constants).
47 nthr_m = (m + BM_NOCOPY_AVX - 1) / BM_NOCOPY_AVX;
48 nthr_n = (n + BN_NOCOPY_AVX - 1) / BN_NOCOPY_AVX;
51 // Partition along K dimension
52 // - if threading allows having barriers (e.g. OMP)
53 // - if there is not enough parallelism along M or N
54 if (mkldnn_thr_syncable()) {
// Grow the K split while spare threads remain and each K-chunk would
// still exceed the AVX K blocking (BK_NOCOPY_AVX).
55 int nthr_other = nthr_k = 1;
56 while ((nthr_m * nthr_n * nthr_other < nthr)
57 && (k / (nthr_other + 1) > BK_NOCOPY_AVX)) {
// Only accept a candidate split that keeps >90% of threads utilized
// (i.e. nthr_other divides nthr nearly evenly).
59 if ((nthr / nthr_other) * nthr_other > 0.9 * nthr)
70 // Simple partition reduction
71 while (nthr_m * nthr_n > nthr)
76 while (nthr_m * nthr_n < nthr)
// Fallback when still over-subscribed and both dims can shrink: split
// roughly sqrt(nthr) along the smaller dimension, capped by how many
// small blocks that dimension actually has.
82 if ((nthr_m * nthr_n > nthr) && (nthr_m > 1) && (nthr_n > 1)) {
84 if (nthr_m <= nthr_n) {
85 nthr_m = (int)sqrt((double)nthr);
86 if (nthr_m > (m + BM_SMALL_NOCOPY_AVX - 1) / BM_SMALL_NOCOPY_AVX)
87 nthr_m = (m + BM_SMALL_NOCOPY_AVX - 1) / BM_SMALL_NOCOPY_AVX;
88 nthr_n = nthr / nthr_m;
// Shrink nthr_m until nthr factors exactly as nthr_m * nthr_n
// (loop body elided in this excerpt; presumably decrements nthr_m).
90 while ((nthr_m > 1) && (nthr_m * nthr_n != nthr)) {
92 nthr_n = nthr / nthr_m;
// Symmetric case: N takes the sqrt split, M takes the remainder.
95 nthr_n = (int)sqrt((double)nthr);
96 if (nthr_n > (n + BN_SMALL_NOCOPY_AVX - 1) / BN_SMALL_NOCOPY_AVX)
97 nthr_n = (n + BN_SMALL_NOCOPY_AVX - 1) / BN_SMALL_NOCOPY_AVX;
98 nthr_m = nthr / nthr_n;
100 while ((nthr_n > 1) && (nthr_m * nthr_n != nthr)) {
102 nthr_m = nthr / nthr_n;
// Per-thread block sizes: ceil(dim / threads), rounded UP to a multiple
// of the small blocking constant for that dimension.
107 MB = (m + nthr_m - 1) / nthr_m + BM_SMALL_NOCOPY_AVX - 1;
108 MB -= MB % BM_SMALL_NOCOPY_AVX;
109 NB = (n + nthr_n - 1) / nthr_n + BN_SMALL_NOCOPY_AVX - 1;
110 NB -= NB % BN_SMALL_NOCOPY_AVX;
111 KB = (k + nthr_k - 1) / nthr_k + BK_SMALL_NOCOPY_AVX - 1;
112 KB -= KB % BK_SMALL_NOCOPY_AVX;
// Recompute effective thread counts implied by the rounded block sizes
// (rounding can reduce the number of blocks, hence threads, needed).
115 nthr_m = (m + MB - 1) / MB;
117 nthr_n = (n + NB - 1) / NB;
119 nthr_k = (k + KB - 1) / KB;
132 #undef BN_LARGE_NOCOPY_AVX
133 #undef BM_SMALL_NOCOPY_AVX
134 #undef BN_SMALL_NOCOPY_AVX
135 #undef BK_SMALL_NOCOPY_AVX
137 #define BM_NOCOPY_AVX512_COMMON 32
138 #define BN_NOCOPY_AVX512_COMMON 64
139 #define BK_NOCOPY_AVX512_COMMON 192
140 #define BN_LARGE_NOCOPY_AVX512_COMMON 192
141 #define BM_SMALL_NOCOPY_AVX512_COMMON 16
142 #define BN_SMALL_NOCOPY_AVX512_COMMON 1
143 #define BK_SMALL_NOCOPY_AVX512_COMMON 4
144 // Determine number of threads for each dimension of a 3-D partitioning
145 // algorithm based on input parameters
146 // m/n/k - First/second/third parameter for GEMM
147 // nthrs - total available number of threads
148 // nthrs_m/nthrs_n/nthrs_k - number of threads to use in each dimension
149 // BM/BN/BK - blocking values
150 void calc_nthr_nocopy_avx512_common(int m,
151 int n, int k, int nthrs, int *nthrs_m, int *nthrs_n, int *nthrs_k,
152 int *BM, int *BN, int *BK)
// NOTE(review): this excerpt elides many original lines (braces, locals
// such as ratio/nthr_m_gt_n/MB/NB/KB, and most loop/branch bodies).
// Comments below describe only the visible statements.
154 int nthr, nthr_m, nthr_n, nthr_k = 1;
159 float ratio_float = 1.;
164 // Partition along K dimension
165 // - if threading allows having barriers (e.g. OMP)
166 // - if there is not enough parallelism along M or N
167 if (mkldnn_thr_syncable()) {
// K-split heuristic: only when N is small (<= 2 N-blocks) and M fits in
// ~2 M-blocks per thread, i.e. M/N parallelism alone can't fill nthr.
168 if (n <= 2 * BN_NOCOPY_AVX512_COMMON &&
169 m <= 2 * BM_NOCOPY_AVX512_COMMON * nthr) {
170 nthr_k = k / BK_NOCOPY_AVX512_COMMON;
// Cap the K split at a quarter of the available threads.
171 if (nthr_k > nthr / 4)
// Reduce nthr_k until it divides nthr evenly (body elided).
176 while ((nthr_k > 1) && (nthr % nthr_k)) {
// Initial M/N split: one thread per block, ceiling division.
184 nthr_m = (m + BM_NOCOPY_AVX512_COMMON - 1) / BM_NOCOPY_AVX512_COMMON;
185 nthr_n = (n + BN_NOCOPY_AVX512_COMMON - 1) / BN_NOCOPY_AVX512_COMMON;
// Remember which dimension dominates and the (integer) M:N ratio, used
// by the elided scale-down loop to shrink both dims proportionally.
192 nthr_m_gt_n = nthr_m > nthr_n ? 1 : 0;
193 ratio_float = (float)nthr_m / nthr_n;
196 ratio = (int)ratio_float;
198 ratio = (int)(1. / ratio_float);
200 // scale down nthr_m and nthr_n if they are too large
201 while (nthr_m * nthr_n > 4 * nthr) {
211 // Simple partition reduction
// Shrink the larger dimension first (bodies elided in this excerpt).
213 while (nthr_m * nthr_n > nthr) {
214 if (nthr_m > nthr_n) {
232 // Simple partition increment
// Grow toward 95% utilization of the available threads.
234 while (nthr_m * nthr_n < 0.95 * nthr) {
235 if (nthr_m > nthr_n) {
253 // if nothing works out, then this should work
// Last-resort rebalance: sqrt(nthr) along the smaller dimension, capped
// by the small-block count in that dimension; the other dim gets the
// remainder, then shrink until nthr factors exactly.
254 if ((nthr_m * nthr_n > nthr)) {
256 if (nthr_m <= nthr_n) {
257 nthr_m = (int)sqrt((double)nthr);
258 if (nthr_m > (m + BM_SMALL_NOCOPY_AVX512_COMMON - 1)
259 / BM_SMALL_NOCOPY_AVX512_COMMON)
260 nthr_m = (m + BM_SMALL_NOCOPY_AVX512_COMMON - 1)
261 / BM_SMALL_NOCOPY_AVX512_COMMON;
262 nthr_n = nthr / nthr_m;
264 while ((nthr_m > 1) && (nthr_m * nthr_n != nthr)) {
266 nthr_n = nthr / nthr_m;
// Symmetric case: N gets the sqrt split, M takes the remainder.
269 nthr_n = (int)sqrt((double)nthr);
270 if (nthr_n > (n + BN_SMALL_NOCOPY_AVX512_COMMON - 1)
271 / BN_SMALL_NOCOPY_AVX512_COMMON)
272 nthr_n = (n + BN_SMALL_NOCOPY_AVX512_COMMON - 1)
273 / BN_SMALL_NOCOPY_AVX512_COMMON;
274 nthr_m = nthr / nthr_n;
276 while ((nthr_n > 1) && (nthr_m * nthr_n != nthr)) {
278 nthr_m = nthr / nthr_n;
// Per-thread block sizes: ceil(dim / threads), rounded UP to a multiple
// of the small blocking constant for that dimension.
283 MB = (m + nthr_m - 1) / nthr_m + BM_SMALL_NOCOPY_AVX512_COMMON - 1;
284 MB -= MB % BM_SMALL_NOCOPY_AVX512_COMMON;
285 NB = (n + nthr_n - 1) / nthr_n + BN_SMALL_NOCOPY_AVX512_COMMON - 1;
286 NB -= NB % BN_SMALL_NOCOPY_AVX512_COMMON;
287 KB = (k + nthr_k - 1) / nthr_k + BK_SMALL_NOCOPY_AVX512_COMMON - 1;
288 KB -= KB % BK_SMALL_NOCOPY_AVX512_COMMON;
// Recompute effective thread counts implied by the rounded block sizes.
291 nthr_m = (m + MB - 1) / MB;
293 nthr_n = (n + NB - 1) / NB;
295 nthr_k = (k + KB - 1) / KB;
305 #undef BM_NOCOPY_AVX512_COMMON
306 #undef BN_NOCOPY_AVX512_COMMON
307 #undef BK_NOCOPY_AVX512_COMMON
308 #undef BN_LARGE_NOCOPY_AVX512_COMMON
309 #undef BM_SMALL_NOCOPY_AVX512_COMMON
310 #undef BN_SMALL_NOCOPY_AVX512_COMMON
311 #undef BK_SMALL_NOCOPY_AVX512_COMMON
313 // Partition n values as equally as possible among nthr threads
314 // and set the offset (t_offset) and number of values (t_block) for ithr
315 // Assumption: 0 <= ithr < nthr
316 void partition_unit_diff(
317 int ithr, int nthr, int n, int *t_offset, int *t_block)
// NOTE(review): lines elided in this excerpt include the computation of
// `band` (presumably band = n / nthr) and the branch that assigns
// *t_block and distributes the tail to low-numbered threads — confirm
// against the full source.
// `tail` is what remains after giving every thread `band` values.
322 int tail = n - band * nthr;
// Threads that absorb part of the tail start at band * ithr ...
328 *t_offset = band * ithr;
// ... later threads are shifted past the whole tail.
331 *t_offset = band * ithr + tail;
// Clamp: a thread whose offset falls beyond n gets no work (body
// elided; presumably zeroes *t_block).
335 if (*t_offset >= n) {
// Clamp the last block so offset + block never exceeds n.
340 if (*t_offset + *t_block > n) {
341 *t_block = n - *t_offset;
345 // Sum the m*n values from p_src into p_dst, assuming the two-dimensional
346 // arrays have leading dimensions ld_src and ld_dst, respectively
347 template<typename data_t>
348 void sum_two_matrices(int m, int n,
349 data_t * __restrict p_src, dim_t ld_src,
350 data_t * __restrict p_dst, dim_t ld_dst)
353 for (j = 0; j < n; j++) {
354 for (i = 0; i < m; i++) {
355 p_dst[i + j * ld_dst] += p_src[i + j * ld_src];
// Explicit instantiations of sum_two_matrices for the two supported
// floating-point element types (the leading "template" keyword lines
// appear to be elided in this excerpt — confirm against full source).
361 void sum_two_matrices<float>(int m, int n,
362 float * __restrict p_src, dim_t ld_src,
363 float * __restrict p_dst, dim_t ld_dst);
366 void sum_two_matrices<double>(int m, int n,
367 double * __restrict p_src, dim_t ld_src,
368 double * __restrict p_dst, dim_t ld_dst);