1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
24 jit_avx512_core_u8_copy_sum_bn_kern::jit_avx512_core_u8_copy_sum_bn_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
40 #define ARG_BIAS 24+stacksize+rsp
56 #define ARG_ALPHA 40+stacksize+rsp
57 #define ARG_B 48+stacksize+rsp
58 #define ARG_BIAS 72+stacksize+rsp
99 auto stacksize = get_size_of_abi_save_regs();
101 mov(ALPHA, ptr[ARG_ALPHA]);
107 mov(LDA, qword[LDA]);
110 lea(LDA3, ptr[LDA+LDA*2]);
117 lea(A2, ptr[A1+LDA*4]);
118 lea(I, ptr[A1+LDA*8]);
128 movdqu(xmm0, xword[A1-0x80]);
129 movdqu(xmm1, xword[A1+LDA*1-0x80]);
130 movdqu(xmm2, xword[A1+LDA*2-0x80]);
131 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
134 punpckldq(xmm0, xmm1);
135 punpckhdq(xmm4, xmm1);
137 punpckldq(xmm2, xmm3);
138 punpckhdq(xmm5, xmm3);
140 punpcklqdq(xmm0, xmm2);
141 punpckhqdq(xmm1, xmm2);
143 punpcklqdq(xmm4, xmm5);
144 punpckhqdq(xmm3, xmm5);
145 pmovsxbw(xmm5, xmm0);
147 pmovsxbw(xmm6, xmm6);
150 pmovsxwd(xmm5, xmm5);
152 movdqu(xword[B-0x80], xmm0);
153 pmovsxbw(xmm5, xmm1);
155 pmovsxbw(xmm6, xmm6);
158 pmovsxwd(xmm5, xmm5);
160 movdqu(xword[B-0x60], xmm1);
161 pmovsxbw(xmm5, xmm4);
163 pmovsxbw(xmm6, xmm6);
166 pmovsxwd(xmm5, xmm5);
168 movdqu(xword[B-0x40], xmm4);
169 pmovsxbw(xmm5, xmm3);
171 pmovsxbw(xmm6, xmm6);
174 pmovsxwd(xmm5, xmm5);
176 movdqu(xword[B-0x20], xmm3);
177 movdqu(xmm0, xword[A2-0x80]);
178 movdqu(xmm1, xword[A2+LDA*1-0x80]);
179 movdqu(xmm2, xword[A2+LDA*2-0x80]);
180 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
183 punpckldq(xmm0, xmm1);
184 punpckhdq(xmm4, xmm1);
186 punpckldq(xmm2, xmm3);
187 punpckhdq(xmm5, xmm3);
189 punpcklqdq(xmm0, xmm2);
190 punpckhqdq(xmm1, xmm2);
192 punpcklqdq(xmm4, xmm5);
193 punpckhqdq(xmm3, xmm5);
194 pmovsxbw(xmm5, xmm0);
196 pmovsxbw(xmm6, xmm6);
199 pmovsxwd(xmm5, xmm5);
201 movdqu(xword[B-0x70], xmm0);
202 pmovsxbw(xmm5, xmm1);
204 pmovsxbw(xmm6, xmm6);
207 pmovsxwd(xmm5, xmm5);
209 movdqu(xword[B-0x50], xmm1);
210 pmovsxbw(xmm5, xmm4);
212 pmovsxbw(xmm6, xmm6);
215 pmovsxwd(xmm5, xmm5);
217 movdqu(xword[B-0x30], xmm4);
218 pmovsxbw(xmm5, xmm3);
220 pmovsxbw(xmm6, xmm6);
223 pmovsxwd(xmm5, xmm5);
225 movdqu(xword[B-0x10], xmm3);
234 movq(xmm0, qword[A1-0x80]);
235 movq(xmm1, qword[A1+LDA*1-0x80]);
236 movq(xmm2, qword[A1+LDA*2-0x80]);
237 movq(xmm3, qword[A1+LDA3*1-0x80]);
239 punpckldq(xmm0, xmm1);
240 punpckldq(xmm2, xmm3);
242 punpcklqdq(xmm0, xmm2);
243 punpckhqdq(xmm1, xmm2);
244 pmovsxbw(xmm5, xmm0);
246 pmovsxbw(xmm6, xmm6);
249 pmovsxwd(xmm5, xmm5);
251 movdqu(xword[B-0x80], xmm0);
252 pmovsxbw(xmm5, xmm1);
254 pmovsxbw(xmm6, xmm6);
257 pmovsxwd(xmm5, xmm5);
259 movdqu(xword[B-0x60], xmm1);
260 movq(xmm0, qword[A2-0x80]);
261 movq(xmm1, qword[A2+LDA*1-0x80]);
262 movq(xmm2, qword[A2+LDA*2-0x80]);
263 movq(xmm3, qword[A2+LDA3*1-0x80]);
265 punpckldq(xmm0, xmm1);
266 punpckldq(xmm2, xmm3);
268 punpcklqdq(xmm0, xmm2);
269 punpckhqdq(xmm1, xmm2);
270 pmovsxbw(xmm5, xmm0);
272 pmovsxbw(xmm6, xmm6);
275 pmovsxwd(xmm5, xmm5);
277 movdqu(xword[B-0x70], xmm0);
278 pmovsxbw(xmm5, xmm1);
280 pmovsxbw(xmm6, xmm6);
283 pmovsxwd(xmm5, xmm5);
285 movdqu(xword[B-0x50], xmm1);
292 movd(xmm0, dword[A1-0x80]);
293 movd(xmm1, dword[A1+LDA*1-0x80]);
294 movd(xmm2, dword[A1+LDA*2-0x80]);
295 movd(xmm3, dword[A1+LDA3*1-0x80]);
297 punpckldq(xmm0, xmm1);
298 punpckldq(xmm2, xmm3);
299 punpcklqdq(xmm0, xmm2);
300 pmovsxbw(xmm5, xmm0);
302 pmovsxbw(xmm6, xmm6);
305 pmovsxwd(xmm5, xmm5);
307 movdqu(xword[B-0x80], xmm0);
308 movd(xmm0, dword[A2-0x80]);
309 movd(xmm1, dword[A2+LDA*1-0x80]);
310 movd(xmm2, dword[A2+LDA*2-0x80]);
311 movd(xmm3, dword[A2+LDA3*1-0x80]);
313 punpckldq(xmm0, xmm1);
314 punpckldq(xmm2, xmm3);
315 punpcklqdq(xmm0, xmm2);
316 pmovsxbw(xmm5, xmm0);
318 pmovsxbw(xmm6, xmm6);
321 pmovsxwd(xmm5, xmm5);
323 movdqu(xword[B-0x70], xmm0);
330 mov(ax, word[A1-0x80]);
331 pinsrw(xmm0, eax, 0x0);
332 mov(ax, word[A1+LDA*1-0x80]);
333 pinsrw(xmm0, eax, 0x1);
334 mov(ax, word[A1+LDA*2-0x80]);
335 pinsrw(xmm0, eax, 0x2);
336 mov(ax, word[A1+LDA3*1-0x80]);
338 pinsrw(xmm0, eax, 0x3);
339 mov(ax, word[A2-0x80]);
340 pinsrw(xmm0, eax, 0x4);
341 mov(ax, word[A2+LDA*1-0x80]);
342 pinsrw(xmm0, eax, 0x5);
343 mov(ax, word[A2+LDA*2-0x80]);
344 pinsrw(xmm0, eax, 0x6);
345 mov(ax, word[A2+LDA3*1-0x80]);
347 pinsrw(xmm0, eax, 0x7);
348 pmovsxbw(xmm5, xmm0);
350 pmovsxwd(xmm5, xmm5);
353 pmovsxbw(xmm6, xmm6);
355 pmovsxwd(xmm6, xmm6);
357 movdqu(xword[B-0x80], xmm0);
364 mov(al, byte[A1-0x80]);
365 pinsrb(xmm0, eax, 0x0);
366 mov(al, byte[A1+LDA*1-0x80]);
367 pinsrb(xmm0, eax, 0x1);
368 mov(al, byte[A1+LDA*2-0x80]);
369 pinsrb(xmm0, eax, 0x2);
370 mov(al, byte[A1+LDA3*1-0x80]);
371 pinsrb(xmm0, eax, 0x3);
372 mov(al, byte[A2-0x80]);
373 pinsrb(xmm0, eax, 0x4);
374 mov(al, byte[A2+LDA*1-0x80]);
375 pinsrb(xmm0, eax, 0x5);
376 mov(al, byte[A2+LDA*2-0x80]);
377 pinsrb(xmm0, eax, 0x6);
378 mov(al, byte[A2+LDA3*1-0x80]);
379 pinsrb(xmm0, eax, 0x7);
380 pmovsxbd(xmm5, xmm0);
381 pshufd(xmm6, xmm0, 0x55);
382 pmovsxbd(xmm6, xmm6);
385 movq(qword[B-0x80], xmm0);
390 mov(A1, qword[ARG_BIAS]);
391 movdqu(xword[A1], xmm8);
392 movdqu(xword[A1+0x10], xmm9);
393 add(qword[ARG_BIAS], 0x20);
406 lea(A2, ptr[A1+LDA*2]);
407 lea(I, ptr[A1+LDA*4]);
416 movdqu(xmm0, xword[A1-0x80]);
417 movdqu(xmm1, xword[A1+LDA*1-0x80]);
419 movdqu(xmm2, xword[A2-0x80]);
420 movdqu(xmm3, xword[A2+LDA*1-0x80]);
423 punpckldq(xmm0, xmm1);
424 punpckhdq(xmm4, xmm1);
426 punpckldq(xmm2, xmm3);
427 punpckhdq(xmm5, xmm3);
429 punpcklqdq(xmm0, xmm2);
430 punpckhqdq(xmm1, xmm2);
432 punpcklqdq(xmm4, xmm5);
433 punpckhqdq(xmm3, xmm5);
434 pmovsxbw(xmm5, xmm0);
436 pmovsxbw(xmm6, xmm6);
439 pmovsxwd(xmm5, xmm5);
441 movdqu(xword[B-0x80], xmm0);
442 pmovsxbw(xmm5, xmm1);
444 pmovsxbw(xmm6, xmm6);
447 pmovsxwd(xmm5, xmm5);
449 movdqu(xword[B-0x70], xmm1);
450 pmovsxbw(xmm5, xmm4);
452 pmovsxbw(xmm6, xmm6);
455 pmovsxwd(xmm5, xmm5);
457 movdqu(xword[B-0x60], xmm4);
458 pmovsxbw(xmm5, xmm3);
460 pmovsxbw(xmm6, xmm6);
463 pmovsxwd(xmm5, xmm5);
465 movdqu(xword[B-0x50], xmm3);
474 movq(xmm0, qword[A1-0x80]);
475 movq(xmm1, qword[A1+LDA*1-0x80]);
477 movq(xmm2, qword[A2-0x80]);
478 movq(xmm3, qword[A2+LDA*1-0x80]);
480 punpckldq(xmm0, xmm1);
481 punpckldq(xmm2, xmm3);
483 punpcklqdq(xmm0, xmm2);
484 punpckhqdq(xmm1, xmm2);
485 pmovsxbw(xmm5, xmm0);
487 pmovsxbw(xmm6, xmm6);
490 pmovsxwd(xmm5, xmm5);
492 movdqu(xword[B-0x80], xmm0);
493 pmovsxbw(xmm5, xmm1);
495 pmovsxbw(xmm6, xmm6);
498 pmovsxwd(xmm5, xmm5);
500 movdqu(xword[B-0x70], xmm1);
507 movd(xmm0, dword[A1-0x80]);
508 movd(xmm1, dword[A1+LDA*1-0x80]);
510 movd(xmm2, dword[A2-0x80]);
511 movd(xmm3, dword[A2+LDA*1-0x80]);
513 punpckldq(xmm0, xmm1);
514 punpckldq(xmm2, xmm3);
515 punpcklqdq(xmm0, xmm2);
516 pmovsxbw(xmm5, xmm0);
518 pmovsxbw(xmm6, xmm6);
521 pmovsxwd(xmm5, xmm5);
523 movdqu(xword[B-0x80], xmm0);
530 mov(ax, word[A1-0x80]);
531 pinsrw(xmm0, eax, 0x0);
532 mov(ax, word[A1+LDA*1-0x80]);
534 pinsrw(xmm0, eax, 0x1);
535 mov(ax, word[A2-0x80]);
536 pinsrw(xmm0, eax, 0x2);
537 mov(ax, word[A2+LDA*1-0x80]);
539 pinsrw(xmm0, eax, 0x3);
540 pmovsxbw(xmm5, xmm0);
542 pmovsxwd(xmm5, xmm5);
544 movq(qword[B-0x80], xmm0);
551 mov(al, byte[A1-0x80]);
552 pinsrb(xmm0, eax, 0x0);
553 mov(al, byte[A1+LDA*1-0x80]);
554 pinsrb(xmm0, eax, 0x1);
555 mov(al, byte[A2-0x80]);
556 pinsrb(xmm0, eax, 0x2);
557 mov(al, byte[A2+LDA*1-0x80]);
558 pinsrb(xmm0, eax, 0x3);
559 pmovsxbd(xmm5, xmm0);
561 movd(dword[B-0x80], xmm0);
566 mov(A1, qword[ARG_BIAS]);
567 movdqu(xword[A1], xmm7);
568 add(qword[ARG_BIAS], 0x10);
581 lea(A2, ptr[A1+LDA*1]);
582 lea(I, ptr[A1+LDA*2]);
591 movdqu(xmm0, xword[A1-0x80]);
593 movdqu(xmm1, xword[A2-0x80]);
596 punpckldq(xmm0, xmm1);
597 punpckhdq(xmm2, xmm1);
598 pshufd(xmm6, xmm0, 0xd8);
599 pmovsxbw(xmm5, xmm6);
601 pmovsxbw(xmm6, xmm6);
605 pmovsxwd(xmm5, xmm5);
607 movdqu(xword[B-0x80], xmm0);
608 pshufd(xmm6, xmm2, 0xd8);
609 pmovsxbw(xmm5, xmm6);
611 pmovsxbw(xmm6, xmm6);
615 pmovsxwd(xmm5, xmm5);
617 movdqu(xword[B-0x70], xmm2);
626 movq(xmm0, qword[A1-0x80]);
628 movq(xmm1, qword[A2-0x80]);
630 punpckldq(xmm0, xmm1);
631 pshufd(xmm6, xmm0, 0xd8);
632 pmovsxbw(xmm5, xmm6);
634 pmovsxbw(xmm6, xmm6);
638 pmovsxwd(xmm5, xmm5);
640 movdqu(xword[B-0x80], xmm0);
647 movd(xmm0, dword[A1-0x80]);
649 movd(xmm1, dword[A2-0x80]);
651 punpckldq(xmm0, xmm1);
652 pmovsxbw(xmm5, xmm0);
655 pmovsxwd(xmm5, xmm5);
657 movq(qword[B-0x80], xmm0);
664 mov(ax, word[A1-0x80]);
666 pinsrw(xmm0, eax, 0x0);
667 mov(ax, word[A2-0x80]);
669 pinsrw(xmm0, eax, 0x1);
670 pmovsxbw(xmm5, xmm0);
672 pmovsxwd(xmm5, xmm5);
674 movd(dword[B-0x80], xmm0);
681 mov(al, byte[A1-0x80]);
682 pinsrb(xmm0, eax, 0x0);
683 mov(byte[B-0x80], al);
684 mov(al, byte[A2-0x80]);
685 pinsrb(xmm0, eax, 0x1);
686 mov(byte[B-0x7f], al);
688 pmovsxbd(xmm5, xmm0);
693 mov(A1, qword[ARG_BIAS]);
694 movq(qword[A1], xmm7);
695 add(qword[ARG_BIAS], 0x8);
716 movdqu(xmm0, xword[A1-0x80]);
718 pmovsxbw(xmm5, xmm0);
720 pmovsxbw(xmm6, xmm6);
725 pmovsxwd(xmm5, xmm5);
727 movdqu(xword[B-0x80], xmm0);
736 movq(xmm0, qword[A1-0x80]);
738 pmovsxbw(xmm5, xmm0);
742 pmovsxwd(xmm5, xmm5);
744 movq(qword[B-0x80], xmm0);
751 movd(xmm0, dword[A1-0x80]);
753 pmovsxbw(xmm5, xmm0);
756 pmovsxwd(xmm5, xmm5);
758 movd(dword[B-0x80], xmm0);
765 mov(ax, word[A1-0x80]);
766 pinsrw(xmm0, eax, 0x0);
767 pmovsxbw(xmm5, xmm0);
769 pmovsxwd(xmm5, xmm5);
771 mov(word[B-0x80], ax);
779 mov(al, byte[A1-0x80]);
780 pinsrb(xmm0, eax, 0x0);
781 pmovsxbd(xmm5, xmm0);
783 mov(byte[B-0x80], al);
788 mov(A1, qword[ARG_BIAS]);
789 movd(dword[A1], xmm7);
790 add(qword[ARG_BIAS], 0x4);