1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #ifndef IGEMM_KERNEL_GENERATOR_HPP
18 #define IGEMM_KERNEL_GENERATOR_HPP
20 #include "jit_generator.hpp"
27 class jit_avx512_core_gemm_s8u8s32_kern : public jit_generator {
// Constructor. Only the declaration is in this header; the body is defined
// elsewhere. It takes three boolean switches:
//   beta_zero_       - NOTE(review): by name, presumably selects the
//                      beta == 0 variant of the kernel; confirm against
//                      the definition.
//   enable_offset_c_ - presumably stored in the enable_offset_c member.
//   enable_offset_r_ - presumably stored in the enable_offset_r member.
29 jit_avx512_core_gemm_s8u8s32_kern(bool beta_zero_, bool enable_offset_c_,
30 bool enable_offset_r_);
// Project macro invoked with this class's name. Its expansion is not visible
// in this file (NOTE(review): presumably provided via jit_generator.hpp).
31 DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemm_s8u8s32_kern);
// Two boolean switches whose names mirror the constructor's
// enable_offset_c_ / enable_offset_r_ arguments.
// NOTE(review): what "c" and "r" stand for is not shown in this header;
// confirm against the kernel implementation.
35 bool enable_offset_c, enable_offset_r;
38 void prefetch_a(const Xbyak::Address &src) {
41 void prefetch_b(const Xbyak::Address &src) {
44 void prefetch_c(const Xbyak::Address &src) {
47 void prefetch_x(const Xbyak::Address &src) {
// Declaration only; defined elsewhere.
//   dst    - destination vector register.
//   src    - source memory operand.
//   nelems - element count.
// NOTE(review): presumably emits a load of nelems elements of C from src
// into dst; how partial element counts are handled is not visible here.
51 void c_load(const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems);
// Declaration only; defined elsewhere. Same parameter shape as c_load with
// the operand roles swapped:
//   dst    - destination memory operand.
//   src    - source vector register.
//   nelems - element count.
// NOTE(review): presumably emits a store of nelems elements of C; confirm.
52 void c_store(const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems);
// Declaration only; defined elsewhere. Operates on three vector registers:
// one destination (dst) and two sources (src1, src2).
// NOTE(review): the dp_scratch and ones registers declared among the vector
// register members look related to this helper; confirm in the
// implementation before relying on that.
54 void dot_product(const Xbyak::Xmm &dst, const Xbyak::Xmm &src1,
55 const Xbyak::Xmm &src2);
// Declaration only; defined elsewhere.
//   unroll_m, unroll_n - unroll factors (presumably bounded by max_unroll_m
//                        and max_unroll_n; TODO confirm).
//   cfetch             - NOTE(review): by name, presumably toggles
//                        prefetching of C inside the loop; confirm.
56 void kernel_loop(int unroll_m, int unroll_n, bool cfetch);
// Declaration only; defined elsewhere. Takes unroll factors for all three
// loop dimensions plus bwidth.
// NOTE(review): presumably handles leftover iterations that do not fill a
// full unrolled step; the meaning and units of bwidth are not visible in
// this header.
57 void remainder_kernel(int unroll_m, int unroll_n, int unroll_k, int bwidth);
// Declaration only; defined elsewhere. Parameterized by the M and N unroll
// factors. NOTE(review): relationship to kernel_loop / remainder_kernel is
// not shown in this header.
58 void innerloop(int unroll_m, int unroll_n);
// Declaration only; defined elsewhere.
//   unroll_x, unroll_y - unroll factors (note the x/y naming, unlike the
//                        m/n naming used by the sibling helpers).
//   outerloop_label    - passed as a reference to a pointer, so the callee
//                        is able to reseat the caller's Label pointer.
//                        NOTE(review): whether it actually does so, and who
//                        owns the pointed-to Label, is not visible here.
59 void outerloop(int unroll_x, int unroll_y, Xbyak::Label *&outerloop_label);
// Unroll factors: 48 in M and 8 in N. The same two values are repeated
// further down as max_unroll_m / max_unroll_n.
65 static const int IGEMM_UNROLL_M = 48;
66 static const int IGEMM_UNROLL_N = 8;
// NOTE(review): size = 4 presumably is the byte size of one 32-bit C
// element; what isize = 2 measures is not evident from this header. Confirm
// both against their uses.
68 static const int isize = 2;
69 static const int size = 4;
71 // Prefetch configuration
// Evaluate to 160 (A) and 128 (B). NOTE(review): units (bytes vs. elements)
// are not shown here; confirm at the use sites.
72 static const int prefetch_size_a = 32 * 5;
73 static const int prefetch_size_b = 32 * 4;
// Both offsets are 256. NOTE(review): presumably a bias applied to the A and
// B base pointers; confirm in the implementation.
75 static const int offset_a = 256, offset_b = 256;
// Upper bounds used to size the register arrays declared in this class
// (a_regs and c_regs); numerically equal to IGEMM_UNROLL_M / IGEMM_UNROLL_N.
76 static const int max_unroll_m = 48, max_unroll_n = 8;
78 // Integer register assignments
// Fifteen 64-bit general-purpose register handles, declared here without
// initializers.
// NOTE(review): the names follow the usual GEMM vocabulary (M/N/K, A/B/C,
// LDC; I/J and LoopCount as counters; AO/BO/CO1/CO2/AA as working
// pointers), but their actual roles and the physical registers they map to
// are set outside this header; confirm there.
79 Xbyak::Reg64 M, N, K, A, B, C, LDC, I, J, LoopCount;
80 Xbyak::Reg64 AO, BO, CO1, CO2, AA;
82 // Vector register assignments
// With max_unroll_m = 48 and max_unroll_n = 8:
//   a_regs has 48 >> 4 = 3 entries, b_regs has 2,
//   c_regs is a 3 x 8 grid (24 entries),
// which together with dp_scratch and ones makes 31 Zmm handles in total.
// NOTE(review): the >> 4 presumably reflects 16 32-bit lanes per Zmm;
// the physical register numbers are assigned outside this header.
83 Xbyak::Zmm dp_scratch, ones, a_regs[max_unroll_m >> 4], b_regs[2];
84 Xbyak::Zmm c_regs[max_unroll_m >> 4][max_unroll_n];
86 // Stack variable assignments
// Memory operands for values kept on the stack: six arg_* entries and four
// coffset_* entries.
// NOTE(review): presumably arg_* address incoming call arguments and
// coffset_* address locally saved offset pointers; the actual stack layout
// is established outside this header.
88 Xbyak::Address arg_a, arg_b, arg_c, arg_ldc, arg_coffset_c, arg_coffset_r;
89 Xbyak::Address coffset_cx, coffset_cy, coffset_rx, coffset_ry;
91 void L_aligned(Xbyak::Label &label, int alignment = 16) {
101 #endif /* header guard */