1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
24 jit_avx512_core_u8_copy_sum_bt_kern::jit_avx512_core_u8_copy_sum_bt_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
// Stack-relative address expressions used as memory operands in this file
// (e.g. ptr[ARG_ALPHA], qword[ARG_BIAS]). Each expands textually to
// <offset> + stacksize + rsp, so a `stacksize` value must be in scope wherever
// one of these macros is used.
// NOTE(review): ARG_BIAS is defined twice in this span (offset 24 here, 72 below).
// The lines between the definitions are not visible in this excerpt; presumably
// the two definitions sit in different conditional-compilation branches -- confirm,
// since an unconditional redefinition with a different body is ill-formed.
40 #define ARG_BIAS 24+stacksize+rsp
56 #define ARG_ALPHA 40+stacksize+rsp
57 #define ARG_B 48+stacksize+rsp
// NOTE(review): second ARG_BIAS definition (offset 72); see the note above on
// confirming which preprocessor branch each definition belongs to.
58 #define ARG_BIAS 72+stacksize+rsp
95 auto stacksize = get_size_of_abi_save_regs();
97 mov(ALPHA, ptr[ARG_ALPHA]);
103 mov(LDA, qword[LDA]);
104 lea(LDA3, ptr[LDA+LDA*2]);
122 movq(xmm0, qword[A1-0x80]);
124 movq(xmm1, qword[A1-0x80]);
126 movq(xmm2, qword[A1-0x80]);
128 movq(xmm3, qword[A1-0x80]);
130 punpcklbw(xmm0, xmm1);
131 punpcklbw(xmm2, xmm3);
133 punpcklwd(xmm0, xmm2);
134 punpckhwd(xmm1, xmm2);
135 pmovsxbw(xmm5, xmm0);
137 pmovsxbw(xmm6, xmm6);
140 pmovsxwd(xmm5, xmm5);
142 pmovsxbw(xmm5, xmm1);
144 pmovsxbw(xmm6, xmm6);
147 pmovsxwd(xmm5, xmm5);
149 movdqu(xword[B-0x80], xmm0);
150 movdqu(xword[B-0x70], xmm1);
151 movq(xmm0, qword[A1-0x80]);
153 movq(xmm1, qword[A1-0x80]);
155 movq(xmm2, qword[A1-0x80]);
157 movq(xmm3, qword[A1-0x80]);
159 punpcklbw(xmm0, xmm1);
160 punpcklbw(xmm2, xmm3);
162 punpcklwd(xmm0, xmm2);
163 punpckhwd(xmm1, xmm2);
164 pmovsxbw(xmm5, xmm0);
166 pmovsxbw(xmm6, xmm6);
169 pmovsxwd(xmm5, xmm5);
171 pmovsxbw(xmm5, xmm1);
173 pmovsxbw(xmm6, xmm6);
176 pmovsxwd(xmm5, xmm5);
178 movdqu(xword[B-0x60], xmm0);
179 movdqu(xword[B-0x50], xmm1);
188 movq(xmm0, qword[A1-0x80]);
190 movq(xmm1, qword[A1-0x80]);
192 movq(xmm2, qword[A1-0x80]);
194 movq(xmm3, qword[A1-0x80]);
196 punpcklbw(xmm0, xmm1);
197 punpcklbw(xmm2, xmm3);
199 punpcklwd(xmm0, xmm2);
200 punpckhwd(xmm1, xmm2);
201 pmovsxbw(xmm5, xmm0);
203 pmovsxbw(xmm6, xmm6);
206 pmovsxwd(xmm5, xmm5);
208 pmovsxbw(xmm5, xmm1);
210 pmovsxbw(xmm6, xmm6);
213 pmovsxwd(xmm5, xmm5);
215 movdqu(xword[B-0x80], xmm0);
216 movdqu(xword[B-0x70], xmm1);
223 movq(xmm0, qword[A1-0x80]);
225 movq(xmm1, qword[A1-0x80]);
227 punpcklbw(xmm0, xmm1);
228 pmovsxbw(xmm5, xmm0);
230 pmovsxwd(xmm5, xmm5);
233 pmovsxbw(xmm6, xmm6);
235 pmovsxwd(xmm6, xmm6);
237 movdqu(xword[B-0x80], xmm0);
244 movq(xmm0, qword[A1-0x80]);
246 pmovsxbd(xmm5, xmm0);
247 pshufd(xmm6, xmm0, 0x55);
248 pmovsxbd(xmm6, xmm6);
251 movq(qword[B-0x80], xmm0);
256 mov(A1, qword[ARG_BIAS]);
257 movdqu(xword[A1], xmm8);
258 movdqu(xword[A1+0x10], xmm9);
259 add(qword[ARG_BIAS], 0x20);
280 movd(xmm0, dword[A1-0x80]);
282 movd(xmm1, dword[A1-0x80]);
284 movd(xmm2, dword[A1-0x80]);
286 movd(xmm3, dword[A1-0x80]);
288 punpcklbw(xmm0, xmm1);
289 punpcklbw(xmm2, xmm3);
290 punpcklwd(xmm0, xmm2);
291 pmovsxbw(xmm5, xmm0);
293 pmovsxbw(xmm6, xmm6);
296 pmovsxwd(xmm5, xmm5);
298 movdqu(xword[B-0x80], xmm0);
299 movd(xmm0, dword[A1-0x80]);
301 movd(xmm1, dword[A1-0x80]);
303 movd(xmm2, dword[A1-0x80]);
305 movd(xmm3, dword[A1-0x80]);
307 punpcklbw(xmm0, xmm1);
308 punpcklbw(xmm2, xmm3);
309 punpcklwd(xmm0, xmm2);
310 pmovsxbw(xmm5, xmm0);
312 pmovsxbw(xmm6, xmm6);
315 pmovsxwd(xmm5, xmm5);
317 movdqu(xword[B-0x70], xmm0);
326 movd(xmm0, dword[A1-0x80]);
328 movd(xmm1, dword[A1-0x80]);
330 movd(xmm2, dword[A1-0x80]);
332 movd(xmm3, dword[A1-0x80]);
334 punpcklbw(xmm0, xmm1);
335 punpcklbw(xmm2, xmm3);
336 punpcklwd(xmm0, xmm2);
337 pmovsxbw(xmm5, xmm0);
339 pmovsxbw(xmm6, xmm6);
342 pmovsxwd(xmm5, xmm5);
344 movdqu(xword[B-0x80], xmm0);
351 movd(xmm0, dword[A1-0x80]);
353 movd(xmm1, dword[A1-0x80]);
355 punpcklbw(xmm0, xmm1);
356 pmovsxbw(xmm5, xmm0);
358 pmovsxwd(xmm5, xmm5);
360 movq(qword[B-0x80], xmm0);
367 movd(xmm0, dword[A1-0x80]);
368 pmovsxbd(xmm5, xmm0);
370 movd(dword[B-0x80], xmm0);
375 mov(A1, qword[ARG_BIAS]);
376 movdqu(xword[A1], xmm7);
377 add(qword[ARG_BIAS], 0x10);
398 mov(ax, word[A1-0x80]);
400 pinsrw(xmm0, eax, 0x0);
401 mov(ax, word[A1-0x80]);
403 pinsrw(xmm1, eax, 0x0);
404 mov(ax, word[A1-0x80]);
406 pinsrw(xmm2, eax, 0x0);
407 mov(ax, word[A1-0x80]);
409 pinsrw(xmm3, eax, 0x0);
410 punpcklbw(xmm0, xmm1);
411 punpcklbw(xmm2, xmm3);
412 punpcklwd(xmm0, xmm2);
413 mov(ax, word[A1-0x80]);
415 pinsrw(xmm1, eax, 0x0);
416 mov(ax, word[A1-0x80]);
418 pinsrw(xmm2, eax, 0x0);
419 mov(ax, word[A1-0x80]);
421 pinsrw(xmm3, eax, 0x0);
422 mov(ax, word[A1-0x80]);
424 pinsrw(xmm4, eax, 0x0);
425 punpcklbw(xmm1, xmm2);
426 punpcklbw(xmm3, xmm4);
427 punpcklwd(xmm1, xmm3);
428 punpcklqdq(xmm0, xmm1);
429 pshufd(xmm6, xmm0, 0xd8);
430 pmovsxbw(xmm5, xmm6);
432 pmovsxbw(xmm6, xmm6);
436 pmovsxwd(xmm5, xmm5);
438 movdqu(xword[B-0x80], xmm0);
447 mov(ax, word[A1-0x80]);
449 pinsrw(xmm0, eax, 0x0);
450 mov(ax, word[A1-0x80]);
452 pinsrw(xmm1, eax, 0x0);
453 mov(ax, word[A1-0x80]);
455 pinsrw(xmm2, eax, 0x0);
456 mov(ax, word[A1-0x80]);
458 pinsrw(xmm3, eax, 0x0);
459 punpcklbw(xmm0, xmm1);
460 punpcklbw(xmm2, xmm3);
461 punpcklwd(xmm0, xmm2);
462 pmovsxbw(xmm5, xmm0);
465 pmovsxwd(xmm5, xmm5);
467 movq(qword[B-0x80], xmm0);
474 mov(ax, word[A1-0x80]);
476 pinsrw(xmm0, eax, 0x0);
477 mov(ax, word[A1-0x80]);
479 pinsrw(xmm1, eax, 0x0);
480 punpcklbw(xmm0, xmm1);
481 pmovsxbw(xmm5, xmm0);
483 pmovsxwd(xmm5, xmm5);
485 movd(dword[B-0x80], xmm0);
492 mov(ax, word[A1-0x80]);
493 pinsrw(xmm0, eax, 0x0);
494 pmovsxbd(xmm5, xmm0);
496 mov(word[B-0x80], ax);
501 mov(A1, qword[ARG_BIAS]);
502 movq(qword[A1], xmm7);
503 add(qword[ARG_BIAS], 0x8);
524 mov(al, byte[A1-0x80]);
526 pinsrb(xmm0, eax, 0x0);
527 mov(al, byte[A1-0x80]);
529 pinsrb(xmm0, eax, 0x1);
530 mov(al, byte[A1-0x80]);
532 pinsrb(xmm0, eax, 0x2);
533 mov(al, byte[A1-0x80]);
535 pinsrb(xmm0, eax, 0x3);
536 mov(al, byte[A1-0x80]);
538 pinsrb(xmm0, eax, 0x4);
539 mov(al, byte[A1-0x80]);
541 pinsrb(xmm0, eax, 0x5);
542 mov(al, byte[A1-0x80]);
544 pinsrb(xmm0, eax, 0x6);
545 mov(al, byte[A1-0x80]);
547 pinsrb(xmm0, eax, 0x7);
548 pmovsxbw(xmm5, xmm0);
552 pmovsxwd(xmm5, xmm5);
554 movq(qword[B-0x80], xmm0);
563 mov(al, byte[A1-0x80]);
565 pinsrb(xmm0, eax, 0x0);
566 mov(al, byte[A1-0x80]);
568 pinsrb(xmm0, eax, 0x1);
569 mov(al, byte[A1-0x80]);
571 pinsrb(xmm0, eax, 0x2);
572 mov(al, byte[A1-0x80]);
574 pinsrb(xmm0, eax, 0x3);
575 pmovsxbw(xmm5, xmm0);
578 pmovsxwd(xmm5, xmm5);
580 movd(dword[B-0x80], xmm0);
587 mov(al, byte[A1-0x80]);
589 pinsrb(xmm0, eax, 0x0);
590 mov(byte[B-0x80], al);
591 mov(al, byte[A1-0x80]);
593 pinsrb(xmm0, eax, 0x1);
594 pmovsxbw(xmm5, xmm0);
596 pmovsxwd(xmm5, xmm5);
598 mov(byte[B-0x7f], al);
605 mov(al, byte[A1-0x80]);
606 pinsrw(xmm0, eax, 0x0);
607 pmovsxbd(xmm5, xmm0);
609 mov(byte[B-0x80], al);
614 mov(A1, qword[ARG_BIAS]);
615 movd(dword[A1], xmm7);
616 add(qword[ARG_BIAS], 0x4);