1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
// Constructor of jit_avx512_core_u8_copy_bt_kern: forwards nullptr and
// GEMM_CODE_SIZE to the jit_generator base-class constructor.
// NOTE(review): the opening brace and the end of the body are not part of
// the lines visible here.
24 jit_avx512_core_u8_copy_bt_kern::jit_avx512_core_u8_copy_bt_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
// rsp-relative address expressions used to read arguments from the stack.
// `stacksize` is resolved where the macro is expanded, not here. The
// expansions are not parenthesized, so use them only as a complete operand
// (as in ptr[ARG_ALPHA]). ARG_B is not used in the lines visible here.
// NOTE(review): offsets 40 and 48 look like stack-passed argument slots of
// the Windows x64 calling convention - confirm against the surrounding
// preprocessor guards.
54 #define ARG_ALPHA 40+stacksize+rsp
55 #define ARG_B 48+stacksize+rsp
// Value returned by get_size_of_abi_save_regs(); presumably the number of
// bytes occupied by saved registers between rsp and the caller's frame -
// TODO confirm.
93 auto stacksize = get_size_of_abi_save_regs();
// Load ALPHA from the memory operand ptr[ARG_ALPHA].
94 mov(ALPHA, ptr[ARG_ALPHA]);
// LDA is used as an address here and is then overwritten with the 64-bit
// value stored at that address.
100 mov(LDA, qword[LDA]);
// LDA3 = LDA + LDA*2, i.e. three times LDA.
101 lea(LDA3, ptr[LDA+LDA*2]);
// ---- qword-wide groups: every load reads 8 bytes at A1-0x80 ----
// NOTE(review): no update of A1 or B, and no labels or branches, appear
// between these statements in this view; they presumably sit on lines that
// are not shown. Confirm against the full file before changing anything.
//
// First group: two rounds of four loads; 64 bytes are stored to
// B-0x80, B-0x70, B-0x60 and B-0x50 (16 bytes each).
117 movq(xmm0, qword[A1-0x80]);
119 movq(xmm1, qword[A1-0x80]);
121 movq(xmm2, qword[A1-0x80]);
123 movq(xmm3, qword[A1-0x80]);
// Interleave the low 8 bytes pairwise: xmm0 <- (xmm0, xmm1),
// xmm2 <- (xmm2, xmm3).
125 punpcklbw(xmm0, xmm1);
126 punpcklbw(xmm2, xmm3);
// Interleave 16-bit pairs of the two intermediate results: low four words
// into xmm0, high four words into xmm1.
// NOTE(review): in the visible lines xmm1 still holds the raw second load
// when punpckhwd reads it; a copy of xmm0 into xmm1 may exist on a line
// not shown here - confirm.
128 punpcklwd(xmm0, xmm2);
129 punpckhwd(xmm1, xmm2);
130 movdqu(xword[B-0x80], xmm0);
131 movdqu(xword[B-0x70], xmm1);
// Second round: same sequence, stored to the next two 16-byte slots.
132 movq(xmm0, qword[A1-0x80]);
134 movq(xmm1, qword[A1-0x80]);
136 movq(xmm2, qword[A1-0x80]);
138 movq(xmm3, qword[A1-0x80]);
140 punpcklbw(xmm0, xmm1);
141 punpcklbw(xmm2, xmm3);
143 punpcklwd(xmm0, xmm2);
144 punpckhwd(xmm1, xmm2);
145 movdqu(xword[B-0x60], xmm0);
146 movdqu(xword[B-0x50], xmm1);
// The store offsets restart at B-0x80 here, which suggests a separate path:
// a single round of four loads producing 32 bytes.
155 movq(xmm0, qword[A1-0x80]);
157 movq(xmm1, qword[A1-0x80]);
159 movq(xmm2, qword[A1-0x80]);
161 movq(xmm3, qword[A1-0x80]);
163 punpcklbw(xmm0, xmm1);
164 punpcklbw(xmm2, xmm3);
166 punpcklwd(xmm0, xmm2);
167 punpckhwd(xmm1, xmm2);
168 movdqu(xword[B-0x80], xmm0);
169 movdqu(xword[B-0x70], xmm1);
// Two loads: byte-interleave them and store the resulting 16 bytes.
176 movq(xmm0, qword[A1-0x80]);
178 movq(xmm1, qword[A1-0x80]);
180 punpcklbw(xmm0, xmm1);
181 movdqu(xword[B-0x80], xmm0);
// One load: copy 8 bytes unchanged.
188 movq(xmm0, qword[A1-0x80]);
190 movq(qword[B-0x80], xmm0);
// ---- dword-wide groups: every load reads 4 bytes at A1-0x80 ----
// NOTE(review): no update of A1 or B, and no labels or branches, appear
// between these statements in this view; they presumably sit on lines that
// are not shown.
//
// First group: two rounds of four loads. Each round interleaves bytes
// pairwise (punpcklbw), then 16-bit pairs (punpcklwd), and stores 16 bytes.
214 movd(xmm0, dword[A1-0x80]);
216 movd(xmm1, dword[A1-0x80]);
218 movd(xmm2, dword[A1-0x80]);
220 movd(xmm3, dword[A1-0x80]);
222 punpcklbw(xmm0, xmm1);
223 punpcklbw(xmm2, xmm3);
224 punpcklwd(xmm0, xmm2);
225 movdqu(xword[B-0x80], xmm0);
// Second round goes to the next 16-byte slot.
226 movd(xmm0, dword[A1-0x80]);
228 movd(xmm1, dword[A1-0x80]);
230 movd(xmm2, dword[A1-0x80]);
232 movd(xmm3, dword[A1-0x80]);
234 punpcklbw(xmm0, xmm1);
235 punpcklbw(xmm2, xmm3);
236 punpcklwd(xmm0, xmm2);
237 movdqu(xword[B-0x70], xmm0);
// The store offset restarts at B-0x80, which suggests a separate path:
// one round of four loads, 16 bytes stored.
246 movd(xmm0, dword[A1-0x80]);
248 movd(xmm1, dword[A1-0x80]);
250 movd(xmm2, dword[A1-0x80]);
252 movd(xmm3, dword[A1-0x80]);
254 punpcklbw(xmm0, xmm1);
255 punpcklbw(xmm2, xmm3);
256 punpcklwd(xmm0, xmm2);
257 movdqu(xword[B-0x80], xmm0);
// Two loads: byte-interleave and store the low 8 bytes.
264 movd(xmm0, dword[A1-0x80]);
266 movd(xmm1, dword[A1-0x80]);
268 punpcklbw(xmm0, xmm1);
269 movq(qword[B-0x80], xmm0);
// One load: copy 4 bytes unchanged.
276 movd(xmm0, dword[A1-0x80]);
277 movd(dword[B-0x80], xmm0);
// ---- word-wide groups: every load reads 2 bytes at A1-0x80 into ax ----
// Each value is placed in 16-bit lane 0 of an xmm register with pinsrw,
// which leaves the other lanes of that register unchanged. Only the low
// lanes take part in the interleaves that follow.
// NOTE(review): no update of A1 or B, and no labels or branches, appear
// between these statements in this view; they presumably sit on lines that
// are not shown.
//
// First group: eight loads. The first four are combined into the low
// 8 bytes of xmm0.
301 mov(ax, word[A1-0x80]);
303 pinsrw(xmm0, eax, 0x0);
304 mov(ax, word[A1-0x80]);
306 pinsrw(xmm1, eax, 0x0);
307 mov(ax, word[A1-0x80]);
309 pinsrw(xmm2, eax, 0x0);
310 mov(ax, word[A1-0x80]);
312 pinsrw(xmm3, eax, 0x0);
313 punpcklbw(xmm0, xmm1);
314 punpcklbw(xmm2, xmm3);
315 punpcklwd(xmm0, xmm2);
// The next four loads use xmm1..xmm4 and are combined into the low 8 bytes
// of xmm1 by the same interleave steps.
316 mov(ax, word[A1-0x80]);
318 pinsrw(xmm1, eax, 0x0);
319 mov(ax, word[A1-0x80]);
321 pinsrw(xmm2, eax, 0x0);
322 mov(ax, word[A1-0x80]);
324 pinsrw(xmm3, eax, 0x0);
325 mov(ax, word[A1-0x80]);
327 pinsrw(xmm4, eax, 0x0);
328 punpcklbw(xmm1, xmm2);
329 punpcklbw(xmm3, xmm4);
330 punpcklwd(xmm1, xmm3);
// Join the two 8-byte halves (low qword of xmm0, then low qword of xmm1)
// and store 16 bytes.
331 punpcklqdq(xmm0, xmm1);
332 movdqu(xword[B-0x80], xmm0);
// Four loads: interleave as above and store the low 8 bytes.
341 mov(ax, word[A1-0x80]);
343 pinsrw(xmm0, eax, 0x0);
344 mov(ax, word[A1-0x80]);
346 pinsrw(xmm1, eax, 0x0);
347 mov(ax, word[A1-0x80]);
349 pinsrw(xmm2, eax, 0x0);
350 mov(ax, word[A1-0x80]);
352 pinsrw(xmm3, eax, 0x0);
353 punpcklbw(xmm0, xmm1);
354 punpcklbw(xmm2, xmm3);
355 punpcklwd(xmm0, xmm2);
356 movq(qword[B-0x80], xmm0);
// Two loads: byte-interleave and store the low 4 bytes.
363 mov(ax, word[A1-0x80]);
365 pinsrw(xmm0, eax, 0x0);
366 mov(ax, word[A1-0x80]);
368 pinsrw(xmm1, eax, 0x0);
369 punpcklbw(xmm0, xmm1);
370 movd(dword[B-0x80], xmm0);
// One load: copy 2 bytes unchanged through ax.
377 mov(ax, word[A1-0x80]);
378 mov(word[B-0x80], ax);
// ---- byte-wide groups: every load reads 1 byte at A1-0x80 into al ----
// No interleaving is needed here: pinsrb places each byte directly into
// consecutive byte lanes of xmm0.
// NOTE(review): no update of A1 or B, and no labels or branches, appear
// between these statements in this view; they presumably sit on lines that
// are not shown.
//
// Eight loads fill byte lanes 0..7 of xmm0; the low 8 bytes are stored.
402 mov(al, byte[A1-0x80]);
404 pinsrb(xmm0, eax, 0x0);
405 mov(al, byte[A1-0x80]);
407 pinsrb(xmm0, eax, 0x1);
408 mov(al, byte[A1-0x80]);
410 pinsrb(xmm0, eax, 0x2);
411 mov(al, byte[A1-0x80]);
413 pinsrb(xmm0, eax, 0x3);
414 mov(al, byte[A1-0x80]);
416 pinsrb(xmm0, eax, 0x4);
417 mov(al, byte[A1-0x80]);
419 pinsrb(xmm0, eax, 0x5);
420 mov(al, byte[A1-0x80]);
422 pinsrb(xmm0, eax, 0x6);
423 mov(al, byte[A1-0x80]);
425 pinsrb(xmm0, eax, 0x7);
426 movq(qword[B-0x80], xmm0);
// Four loads fill byte lanes 0..3 of xmm0; the low 4 bytes are stored.
435 mov(al, byte[A1-0x80]);
437 pinsrb(xmm0, eax, 0x0);
438 mov(al, byte[A1-0x80]);
440 pinsrb(xmm0, eax, 0x1);
441 mov(al, byte[A1-0x80]);
443 pinsrb(xmm0, eax, 0x2);
444 mov(al, byte[A1-0x80]);
446 pinsrb(xmm0, eax, 0x3);
447 movd(dword[B-0x80], xmm0);
// Two loads: each byte is copied through al to adjacent destination bytes
// (B-0x80, then B-0x7f).
454 mov(al, byte[A1-0x80]);
456 mov(byte[B-0x80], al);
457 mov(al, byte[A1-0x80]);
459 mov(byte[B-0x7f], al);
// One load: copy a single byte.
466 mov(al, byte[A1-0x80]);
467 mov(byte[B-0x80], al);