1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
24 jit_avx512_core_u8_copy_bn_kern::jit_avx512_core_u8_copy_bn_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
// rsp-relative address expressions meant to be used as the complete operand
// inside ptr[...] (the ALPHA load reads ptr[ARG_ALPHA]). Each expands to a
// fixed byte offset plus `stacksize` plus rsp. `stacksize` is not a macro
// parameter, so a variable with that name must be in scope at every
// expansion site.
// NOTE(review): the bodies are not parenthesized, so they are only safe when
// they form the whole operand expression. ARG_B has no use in the code
// visible here, and the offsets 40/48 presumably follow the target ABI's
// stack-argument layout -- confirm against the calling convention.
54 #define ARG_ALPHA 40+stacksize+rsp
55 #define ARG_B 48+stacksize+rsp
97 auto stacksize = get_size_of_abi_save_regs();
98 mov(ALPHA, ptr[ARG_ALPHA]);
104 mov(LDA, qword[LDA]);
107 lea(LDA3, ptr[LDA+LDA*2]);
114 lea(A2, ptr[A1+LDA*4]);
115 lea(I, ptr[A1+LDA*8]);
123 movdqu(xmm0, xword[A1-0x80]);
124 movdqu(xmm1, xword[A1+LDA*1-0x80]);
125 movdqu(xmm2, xword[A1+LDA*2-0x80]);
126 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
129 punpckldq(xmm0, xmm1);
130 punpckhdq(xmm4, xmm1);
132 punpckldq(xmm2, xmm3);
133 punpckhdq(xmm5, xmm3);
135 punpcklqdq(xmm0, xmm2);
136 punpckhqdq(xmm1, xmm2);
138 punpcklqdq(xmm4, xmm5);
139 punpckhqdq(xmm3, xmm5);
140 movdqu(xword[B-0x80], xmm0);
141 movdqu(xword[B-0x60], xmm1);
142 movdqu(xword[B-0x40], xmm4);
143 movdqu(xword[B-0x20], xmm3);
144 movdqu(xmm0, xword[A2-0x80]);
145 movdqu(xmm1, xword[A2+LDA*1-0x80]);
146 movdqu(xmm2, xword[A2+LDA*2-0x80]);
147 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
150 punpckldq(xmm0, xmm1);
151 punpckhdq(xmm4, xmm1);
153 punpckldq(xmm2, xmm3);
154 punpckhdq(xmm5, xmm3);
156 punpcklqdq(xmm0, xmm2);
157 punpckhqdq(xmm1, xmm2);
159 punpcklqdq(xmm4, xmm5);
160 punpckhqdq(xmm3, xmm5);
161 movdqu(xword[B-0x70], xmm0);
162 movdqu(xword[B-0x50], xmm1);
163 movdqu(xword[B-0x30], xmm4);
164 movdqu(xword[B-0x10], xmm3);
173 movq(xmm0, qword[A1-0x80]);
174 movq(xmm1, qword[A1+LDA*1-0x80]);
175 movq(xmm2, qword[A1+LDA*2-0x80]);
176 movq(xmm3, qword[A1+LDA3*1-0x80]);
178 punpckldq(xmm0, xmm1);
179 punpckldq(xmm2, xmm3);
181 punpcklqdq(xmm0, xmm2);
182 punpckhqdq(xmm1, xmm2);
183 movdqu(xword[B-0x80], xmm0);
184 movdqu(xword[B-0x60], xmm1);
185 movq(xmm0, qword[A2-0x80]);
186 movq(xmm1, qword[A2+LDA*1-0x80]);
187 movq(xmm2, qword[A2+LDA*2-0x80]);
188 movq(xmm3, qword[A2+LDA3*1-0x80]);
190 punpckldq(xmm0, xmm1);
191 punpckldq(xmm2, xmm3);
193 punpcklqdq(xmm0, xmm2);
194 punpckhqdq(xmm1, xmm2);
195 movdqu(xword[B-0x70], xmm0);
196 movdqu(xword[B-0x50], xmm1);
203 movd(xmm0, dword[A1-0x80]);
204 movd(xmm1, dword[A1+LDA*1-0x80]);
205 movd(xmm2, dword[A1+LDA*2-0x80]);
206 movd(xmm3, dword[A1+LDA3*1-0x80]);
208 punpckldq(xmm0, xmm1);
209 punpckldq(xmm2, xmm3);
210 punpcklqdq(xmm0, xmm2);
211 movdqu(xword[B-0x80], xmm0);
212 movd(xmm0, dword[A2-0x80]);
213 movd(xmm1, dword[A2+LDA*1-0x80]);
214 movd(xmm2, dword[A2+LDA*2-0x80]);
215 movd(xmm3, dword[A2+LDA3*1-0x80]);
217 punpckldq(xmm0, xmm1);
218 punpckldq(xmm2, xmm3);
219 punpcklqdq(xmm0, xmm2);
220 movdqu(xword[B-0x70], xmm0);
227 mov(ax, word[A1-0x80]);
228 pinsrw(xmm0, eax, 0x0);
229 mov(ax, word[A1+LDA*1-0x80]);
230 pinsrw(xmm0, eax, 0x1);
231 mov(ax, word[A1+LDA*2-0x80]);
232 pinsrw(xmm0, eax, 0x2);
233 mov(ax, word[A1+LDA3*1-0x80]);
235 pinsrw(xmm0, eax, 0x3);
236 mov(ax, word[A2-0x80]);
237 pinsrw(xmm0, eax, 0x4);
238 mov(ax, word[A2+LDA*1-0x80]);
239 pinsrw(xmm0, eax, 0x5);
240 mov(ax, word[A2+LDA*2-0x80]);
241 pinsrw(xmm0, eax, 0x6);
242 mov(ax, word[A2+LDA3*1-0x80]);
244 pinsrw(xmm0, eax, 0x7);
245 movdqu(xword[B-0x80], xmm0);
252 mov(al, byte[A1-0x80]);
253 pinsrb(xmm0, eax, 0x0);
254 mov(al, byte[A1+LDA*1-0x80]);
255 pinsrb(xmm0, eax, 0x1);
256 mov(al, byte[A1+LDA*2-0x80]);
257 pinsrb(xmm0, eax, 0x2);
258 mov(al, byte[A1+LDA3*1-0x80]);
259 pinsrb(xmm0, eax, 0x3);
260 mov(al, byte[A2-0x80]);
261 pinsrb(xmm0, eax, 0x4);
262 mov(al, byte[A2+LDA*1-0x80]);
263 pinsrb(xmm0, eax, 0x5);
264 mov(al, byte[A2+LDA*2-0x80]);
265 pinsrb(xmm0, eax, 0x6);
266 mov(al, byte[A2+LDA3*1-0x80]);
267 pinsrb(xmm0, eax, 0x7);
268 movq(qword[B-0x80], xmm0);
285 lea(A2, ptr[A1+LDA*2]);
286 lea(I, ptr[A1+LDA*4]);
294 movdqu(xmm0, xword[A1-0x80]);
295 movdqu(xmm1, xword[A1+LDA*1-0x80]);
297 movdqu(xmm2, xword[A2-0x80]);
298 movdqu(xmm3, xword[A2+LDA*1-0x80]);
301 punpckldq(xmm0, xmm1);
302 punpckhdq(xmm4, xmm1);
304 punpckldq(xmm2, xmm3);
305 punpckhdq(xmm5, xmm3);
307 punpcklqdq(xmm0, xmm2);
308 punpckhqdq(xmm1, xmm2);
310 punpcklqdq(xmm4, xmm5);
311 punpckhqdq(xmm3, xmm5);
312 movdqu(xword[B-0x80], xmm0);
313 movdqu(xword[B-0x70], xmm1);
314 movdqu(xword[B-0x60], xmm4);
315 movdqu(xword[B-0x50], xmm3);
324 movq(xmm0, qword[A1-0x80]);
325 movq(xmm1, qword[A1+LDA*1-0x80]);
327 movq(xmm2, qword[A2-0x80]);
328 movq(xmm3, qword[A2+LDA*1-0x80]);
330 punpckldq(xmm0, xmm1);
331 punpckldq(xmm2, xmm3);
333 punpcklqdq(xmm0, xmm2);
334 punpckhqdq(xmm1, xmm2);
335 movdqu(xword[B-0x80], xmm0);
336 movdqu(xword[B-0x70], xmm1);
343 movd(xmm0, dword[A1-0x80]);
344 movd(xmm1, dword[A1+LDA*1-0x80]);
346 movd(xmm2, dword[A2-0x80]);
347 movd(xmm3, dword[A2+LDA*1-0x80]);
349 punpckldq(xmm0, xmm1);
350 punpckldq(xmm2, xmm3);
351 punpcklqdq(xmm0, xmm2);
352 movdqu(xword[B-0x80], xmm0);
359 mov(ax, word[A1-0x80]);
360 pinsrw(xmm0, eax, 0x0);
361 mov(ax, word[A1+LDA*1-0x80]);
363 pinsrw(xmm0, eax, 0x1);
364 mov(ax, word[A2-0x80]);
365 pinsrw(xmm0, eax, 0x2);
366 mov(ax, word[A2+LDA*1-0x80]);
368 pinsrw(xmm0, eax, 0x3);
369 movq(qword[B-0x80], xmm0);
376 mov(al, byte[A1-0x80]);
377 pinsrb(xmm0, eax, 0x0);
378 mov(al, byte[A1+LDA*1-0x80]);
379 pinsrb(xmm0, eax, 0x1);
380 mov(al, byte[A2-0x80]);
381 pinsrb(xmm0, eax, 0x2);
382 mov(al, byte[A2+LDA*1-0x80]);
383 pinsrb(xmm0, eax, 0x3);
384 movd(dword[B-0x80], xmm0);
401 lea(A2, ptr[A1+LDA*1]);
402 lea(I, ptr[A1+LDA*2]);
410 movdqu(xmm0, xword[A1-0x80]);
412 movdqu(xmm1, xword[A2-0x80]);
415 punpckldq(xmm0, xmm1);
416 punpckhdq(xmm2, xmm1);
417 movdqu(xword[B-0x80], xmm0);
418 movdqu(xword[B-0x70], xmm2);
427 movq(xmm0, qword[A1-0x80]);
429 movq(xmm1, qword[A2-0x80]);
431 punpckldq(xmm0, xmm1);
432 movdqu(xword[B-0x80], xmm0);
439 movd(xmm0, dword[A1-0x80]);
441 movd(xmm1, dword[A2-0x80]);
443 punpckldq(xmm0, xmm1);
444 movq(qword[B-0x80], xmm0);
451 mov(ax, word[A1-0x80]);
453 pinsrw(xmm0, eax, 0x0);
454 mov(ax, word[A2-0x80]);
456 pinsrw(xmm0, eax, 0x1);
457 movd(dword[B-0x80], xmm0);
464 mov(al, byte[A1-0x80]);
465 mov(byte[B-0x80], al);
466 mov(al, byte[A2-0x80]);
467 mov(byte[B-0x7f], al);
491 movdqu(xmm0, xword[A1-0x80]);
493 movdqu(xword[B-0x80], xmm0);
502 movq(xmm0, qword[A1-0x80]);
504 movq(qword[B-0x80], xmm0);
511 movd(xmm0, dword[A1-0x80]);
513 movd(dword[B-0x80], xmm0);
520 mov(ax, word[A1-0x80]);
521 mov(word[B-0x80], ax);
529 mov(al, byte[A1-0x80]);
530 mov(byte[B-0x80], al);