1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
24 jit_avx512_core_u8_copy_an_kern::jit_avx512_core_u8_copy_an_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
// Address expressions for the stack-passed ALPHA and B arguments, intended
// for use inside ptr[...] (ALPHA is loaded with mov(ALPHA, ptr[ARG_ALPHA])).
// `stacksize` is the local initialised from get_size_of_abi_save_regs(); it
// must be in scope wherever these macros are expanded.
// NOTE(review): the 40/48 byte base offsets presumably select the 5th and 6th
// arguments of the calling convention in use -- confirm against the ABI.
// The replacement lists are parenthesized so that each macro always expands
// to a single operand, whatever operators surround it at the point of use
// (e.g. ARG_B - ARG_ALPHA, or a scaled offset).
#define ARG_ALPHA (40 + stacksize + rsp)
#define ARG_B     (48 + stacksize + rsp)
111 auto stacksize = get_size_of_abi_save_regs();
112 mov(ALPHA, ptr[ARG_ALPHA]);
118 mov(LDA, qword[LDA]);
119 lea(LDA3, ptr[LDA+LDA*2]);
135 movdqu(xmm0, xword[A1-0x80]);
136 movdqu(xmm1, xword[A1+LDA*1-0x80]);
137 movdqu(xmm2, xword[A1+LDA*2-0x80]);
138 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
140 punpcklbw(xmm0, xmm1);
141 punpckhbw(xmm4, xmm1);
143 punpcklbw(xmm2, xmm3);
144 punpckhbw(xmm5, xmm3);
146 punpcklwd(xmm0, xmm2);
147 punpckhwd(xmm1, xmm2);
149 punpcklwd(xmm4, xmm5);
150 punpckhwd(xmm2, xmm5);
151 movdqu(xword[B-0x80], xmm0);
152 movdqu(xword[B-0x70], xmm1);
153 movdqu(xword[B-0x60], xmm4);
154 movdqu(xword[B-0x50], xmm2);
155 movdqu(xmm0, xword[A1-0x70]);
156 movdqu(xmm1, xword[A1+LDA*1-0x70]);
157 movdqu(xmm2, xword[A1+LDA*2-0x70]);
158 movdqu(xmm3, xword[A1+LDA3*1-0x70]);
160 punpcklbw(xmm0, xmm1);
161 punpckhbw(xmm4, xmm1);
163 punpcklbw(xmm2, xmm3);
164 punpckhbw(xmm5, xmm3);
166 punpcklwd(xmm0, xmm2);
167 punpckhwd(xmm1, xmm2);
169 punpcklwd(xmm4, xmm5);
170 punpckhwd(xmm2, xmm5);
171 movdqu(xword[B-0x40], xmm0);
172 movdqu(xword[B-0x30], xmm1);
173 movdqu(xword[B-0x20], xmm4);
174 movdqu(xword[B-0x10], xmm2);
175 movdqu(xmm0, xword[A1-0x60]);
176 movdqu(xmm1, xword[A1+LDA*1-0x60]);
177 movdqu(xmm2, xword[A1+LDA*2-0x60]);
178 movdqu(xmm3, xword[A1+LDA3*1-0x60]);
179 lea(A1, ptr[A1+LDA*4]);
181 punpcklbw(xmm0, xmm1);
182 punpckhbw(xmm4, xmm1);
184 punpcklbw(xmm2, xmm3);
185 punpckhbw(xmm5, xmm3);
187 punpcklwd(xmm0, xmm2);
188 punpckhwd(xmm1, xmm2);
190 punpcklwd(xmm4, xmm5);
191 punpckhwd(xmm2, xmm5);
192 movdqu(xword[B], xmm0);
193 movdqu(xword[B+0x10], xmm1);
194 movdqu(xword[B+0x20], xmm4);
195 movdqu(xword[B+0x30], xmm2);
204 movdqu(xmm0, xword[A1-0x80]);
205 movdqu(xmm1, xword[A1-0x70]);
206 movdqu(xmm2, xword[A1-0x60]);
208 movdqu(xmm3, xword[A1-0x80]);
209 movdqu(xmm4, xword[A1-0x70]);
210 movdqu(xmm5, xword[A1-0x60]);
213 punpcklbw(xmm0, xmm3);
214 punpckhbw(xmm6, xmm3);
215 movdqu(xword[B-0x80], xmm0);
216 movdqu(xword[B-0x70], xmm6);
218 punpcklbw(xmm1, xmm4);
219 punpckhbw(xmm6, xmm4);
220 movdqu(xword[B-0x60], xmm1);
221 movdqu(xword[B-0x50], xmm6);
223 punpcklbw(xmm2, xmm5);
224 punpckhbw(xmm6, xmm5);
225 movdqu(xword[B-0x40], xmm2);
226 movdqu(xword[B-0x30], xmm6);
233 movdqu(xmm0, xword[A1-0x80]);
234 movdqu(xmm1, xword[A1-0x70]);
235 movdqu(xmm2, xword[A1-0x60]);
237 movdqu(xword[B-0x80], xmm0);
238 movdqu(xword[B-0x70], xmm1);
239 movdqu(xword[B-0x60], xmm2);
263 movdqu(xmm0, xword[A1-0x80]);
264 movdqu(xmm1, xword[A1+LDA*1-0x80]);
265 movdqu(xmm2, xword[A1+LDA*2-0x80]);
266 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
268 punpcklbw(xmm0, xmm1);
269 punpckhbw(xmm4, xmm1);
271 punpcklbw(xmm2, xmm3);
272 punpckhbw(xmm5, xmm3);
274 punpcklwd(xmm0, xmm2);
275 punpckhwd(xmm1, xmm2);
277 punpcklwd(xmm4, xmm5);
278 punpckhwd(xmm2, xmm5);
279 movdqu(xword[B-0x80], xmm0);
280 movdqu(xword[B-0x70], xmm1);
281 movdqu(xword[B-0x60], xmm4);
282 movdqu(xword[B-0x50], xmm2);
283 movdqu(xmm0, xword[A1-0x70]);
284 movdqu(xmm1, xword[A1+LDA*1-0x70]);
285 movdqu(xmm2, xword[A1+LDA*2-0x70]);
286 movdqu(xmm3, xword[A1+LDA3*1-0x70]);
287 lea(A1, ptr[A1+LDA*4]);
289 punpcklbw(xmm0, xmm1);
290 punpckhbw(xmm4, xmm1);
292 punpcklbw(xmm2, xmm3);
293 punpckhbw(xmm5, xmm3);
295 punpcklwd(xmm0, xmm2);
296 punpckhwd(xmm1, xmm2);
298 punpcklwd(xmm4, xmm5);
299 punpckhwd(xmm2, xmm5);
300 movdqu(xword[B-0x40], xmm0);
301 movdqu(xword[B-0x30], xmm1);
302 movdqu(xword[B-0x20], xmm4);
303 movdqu(xword[B-0x10], xmm2);
312 movdqu(xmm0, xword[A1-0x80]);
313 movdqu(xmm1, xword[A1-0x70]);
315 movdqu(xmm2, xword[A1-0x80]);
316 movdqu(xmm3, xword[A1-0x70]);
319 punpcklbw(xmm0, xmm2);
320 punpckhbw(xmm4, xmm2);
321 movdqu(xword[B-0x80], xmm0);
322 movdqu(xword[B-0x70], xmm4);
324 punpcklbw(xmm1, xmm3);
325 punpckhbw(xmm4, xmm3);
326 movdqu(xword[B-0x60], xmm1);
327 movdqu(xword[B-0x50], xmm4);
334 movdqu(xmm0, xword[A1-0x80]);
335 movdqu(xmm1, xword[A1-0x70]);
337 movdqu(xword[B-0x80], xmm0);
338 movdqu(xword[B-0x70], xmm1);
362 movdqu(xmm0, xword[A1-0x80]);
364 movdqu(xmm1, xword[A1-0x80]);
366 movdqu(xmm2, xword[A1-0x80]);
368 movdqu(xmm3, xword[A1-0x80]);
371 punpcklbw(xmm0, xmm1);
372 punpckhbw(xmm4, xmm1);
374 punpcklbw(xmm2, xmm3);
375 punpckhbw(xmm1, xmm3);
377 punpcklwd(xmm0, xmm2);
378 punpckhwd(xmm3, xmm2);
380 punpcklwd(xmm4, xmm1);
381 punpckhwd(xmm2, xmm1);
382 movdqu(xword[B-0x80], xmm0);
383 movdqu(xword[B-0x70], xmm3);
384 movdqu(xword[B-0x60], xmm4);
385 movdqu(xword[B-0x50], xmm2);
394 movdqu(xmm0, xword[A1-0x80]);
396 movdqu(xmm1, xword[A1-0x80]);
399 punpcklbw(xmm0, xmm1);
400 punpckhbw(xmm2, xmm1);
401 movdqu(xword[B-0x80], xmm0);
402 movdqu(xword[B-0x70], xmm2);
409 movdqu(xmm0, xword[A1-0x80]);
411 movdqu(xword[B-0x80], xmm0);
435 movq(xmm0, qword[A1-0x80]);
437 movq(xmm1, qword[A1-0x80]);
439 movq(xmm2, qword[A1-0x80]);
441 movq(xmm3, qword[A1-0x80]);
443 punpcklbw(xmm0, xmm1);
444 punpcklbw(xmm2, xmm3);
446 punpcklwd(xmm0, xmm2);
447 punpckhwd(xmm1, xmm2);
448 movdqu(xword[B-0x80], xmm0);
449 movdqu(xword[B-0x70], xmm1);
450 movq(xmm0, qword[A1-0x80]);
452 movq(xmm1, qword[A1-0x80]);
454 movq(xmm2, qword[A1-0x80]);
456 movq(xmm3, qword[A1-0x80]);
458 punpcklbw(xmm0, xmm1);
459 punpcklbw(xmm2, xmm3);
461 punpcklwd(xmm0, xmm2);
462 punpckhwd(xmm1, xmm2);
463 movdqu(xword[B-0x60], xmm0);
464 movdqu(xword[B-0x50], xmm1);
473 movq(xmm0, qword[A1-0x80]);
475 movq(xmm1, qword[A1-0x80]);
477 movq(xmm2, qword[A1-0x80]);
479 movq(xmm3, qword[A1-0x80]);
481 punpcklbw(xmm0, xmm1);
482 punpcklbw(xmm2, xmm3);
484 punpcklwd(xmm0, xmm2);
485 punpckhwd(xmm1, xmm2);
486 movdqu(xword[B-0x80], xmm0);
487 movdqu(xword[B-0x70], xmm1);
494 movq(xmm0, qword[A1-0x80]);
496 movq(xmm1, qword[A1-0x80]);
498 punpcklbw(xmm0, xmm1);
499 movdqu(xword[B-0x80], xmm0);
506 movq(xmm0, qword[A1-0x80]);
508 movq(qword[B-0x80], xmm0);
532 movd(xmm0, dword[A1-0x80]);
534 movd(xmm1, dword[A1-0x80]);
536 movd(xmm2, dword[A1-0x80]);
538 movd(xmm3, dword[A1-0x80]);
540 punpcklbw(xmm0, xmm1);
541 punpcklbw(xmm2, xmm3);
542 punpcklwd(xmm0, xmm2);
543 movdqu(xword[B-0x80], xmm0);
544 movd(xmm0, dword[A1-0x80]);
546 movd(xmm1, dword[A1-0x80]);
548 movd(xmm2, dword[A1-0x80]);
550 movd(xmm3, dword[A1-0x80]);
552 punpcklbw(xmm0, xmm1);
553 punpcklbw(xmm2, xmm3);
554 punpcklwd(xmm0, xmm2);
555 movdqu(xword[B-0x70], xmm0);
564 movd(xmm0, dword[A1-0x80]);
566 movd(xmm1, dword[A1-0x80]);
568 movd(xmm2, dword[A1-0x80]);
570 movd(xmm3, dword[A1-0x80]);
572 punpcklbw(xmm0, xmm1);
573 punpcklbw(xmm2, xmm3);
574 punpcklwd(xmm0, xmm2);
575 movdqu(xword[B-0x80], xmm0);
582 movd(xmm0, dword[A1-0x80]);
584 movd(xmm1, dword[A1-0x80]);
586 punpcklbw(xmm0, xmm1);
587 movq(qword[B-0x80], xmm0);
594 movd(xmm0, dword[A1-0x80]);
595 movd(dword[B-0x80], xmm0);
619 mov(ax, word[A1-0x80]);
621 pinsrw(xmm0, eax, 0x0);
622 mov(ax, word[A1-0x80]);
624 pinsrw(xmm1, eax, 0x0);
625 mov(ax, word[A1-0x80]);
627 pinsrw(xmm2, eax, 0x0);
628 mov(ax, word[A1-0x80]);
630 pinsrw(xmm3, eax, 0x0);
631 punpcklbw(xmm0, xmm1);
632 punpcklbw(xmm2, xmm3);
633 punpcklwd(xmm0, xmm2);
634 mov(ax, word[A1-0x80]);
636 pinsrw(xmm1, eax, 0x0);
637 mov(ax, word[A1-0x80]);
639 pinsrw(xmm2, eax, 0x0);
640 mov(ax, word[A1-0x80]);
642 pinsrw(xmm3, eax, 0x0);
643 mov(ax, word[A1-0x80]);
645 pinsrw(xmm4, eax, 0x0);
646 punpcklbw(xmm1, xmm2);
647 punpcklbw(xmm3, xmm4);
648 punpcklwd(xmm1, xmm3);
649 punpcklqdq(xmm0, xmm1);
650 movdqu(xword[B-0x80], xmm0);
659 mov(ax, word[A1-0x80]);
661 pinsrw(xmm0, eax, 0x0);
662 mov(ax, word[A1-0x80]);
664 pinsrw(xmm1, eax, 0x0);
665 mov(ax, word[A1-0x80]);
667 pinsrw(xmm2, eax, 0x0);
668 mov(ax, word[A1-0x80]);
670 pinsrw(xmm3, eax, 0x0);
671 punpcklbw(xmm0, xmm1);
672 punpcklbw(xmm2, xmm3);
673 punpcklwd(xmm0, xmm2);
674 movq(qword[B-0x80], xmm0);
681 mov(ax, word[A1-0x80]);
683 pinsrw(xmm0, eax, 0x0);
684 mov(ax, word[A1-0x80]);
686 pinsrw(xmm1, eax, 0x0);
687 punpcklbw(xmm0, xmm1);
688 movd(dword[B-0x80], xmm0);
695 mov(ax, word[A1-0x80]);
696 mov(word[B-0x80], ax);
720 mov(al, byte[A1-0x80]);
722 pinsrb(xmm0, eax, 0x0);
723 mov(al, byte[A1-0x80]);
725 pinsrb(xmm0, eax, 0x1);
726 mov(al, byte[A1-0x80]);
728 pinsrb(xmm0, eax, 0x2);
729 mov(al, byte[A1-0x80]);
731 pinsrb(xmm0, eax, 0x3);
732 mov(al, byte[A1-0x80]);
734 pinsrb(xmm0, eax, 0x4);
735 mov(al, byte[A1-0x80]);
737 pinsrb(xmm0, eax, 0x5);
738 mov(al, byte[A1-0x80]);
740 pinsrb(xmm0, eax, 0x6);
741 mov(al, byte[A1-0x80]);
743 pinsrb(xmm0, eax, 0x7);
744 movq(qword[B-0x80], xmm0);
753 mov(al, byte[A1-0x80]);
755 pinsrb(xmm0, eax, 0x0);
756 mov(al, byte[A1-0x80]);
758 pinsrb(xmm0, eax, 0x1);
759 mov(al, byte[A1-0x80]);
761 pinsrb(xmm0, eax, 0x2);
762 mov(al, byte[A1-0x80]);
764 pinsrb(xmm0, eax, 0x3);
765 movd(dword[B-0x80], xmm0);
772 mov(al, byte[A1-0x80]);
774 mov(byte[B-0x80], al);
775 mov(al, byte[A1-0x80]);
777 mov(byte[B-0x7f], al);
784 mov(al, byte[A1-0x80]);
785 mov(byte[B-0x80], al);