1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
24 jit_avx512_core_u8_copy_sum_an_kern::jit_avx512_core_u8_copy_sum_an_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
/* rsp-relative address expressions: a fixed displacement plus `stacksize`.
 * They are used inside ptr[...] / qword[...] operands further down
 * (presumably to reach stack-passed arguments -- confirm against the caller). */
/* NOTE(review): ARG_BIAS is defined twice in this excerpt with different
 * displacements (24 and 72). Presumably the two definitions sit in different
 * conditional-compilation branches that are not visible here -- confirm. */
40 #define ARG_BIAS 24+stacksize+rsp
56 #define ARG_ALPHA 40+stacksize+rsp
57 #define ARG_B 48+stacksize+rsp
58 #define ARG_BIAS 72+stacksize+rsp
/* `stacksize` is the term the ARG_* macros add to their rsp displacement. */
113 auto stacksize = get_size_of_abi_save_regs();
/* Load ALPHA from the rsp-relative slot named by ARG_ALPHA. */
115 mov(ALPHA, ptr[ARG_ALPHA]);
/* LDA holds an address on entry to this line; replace it with the 64-bit
 * value stored at that address. */
121 mov(LDA, qword[LDA]);
/* LDA3 = LDA + LDA*2 = 3*LDA, so [A1+LDA3] reaches the fourth of four
 * LDA-strided loads without an extra multiply. */
122 lea(LDA3, ptr[LDA+LDA*2]);
/* Clear ymm8..ymm15 by XOR-ing each register with itself. The vpaddd
 * instructions further down accumulate 32-bit sums into ymm8..ymm13, so they
 * must start from zero. */
132 vxorps(ymm8, ymm8, ymm8);
133 vxorps(ymm9, ymm9, ymm9);
134 vxorps(ymm10, ymm10, ymm10);
135 vxorps(ymm11, ymm11, ymm11);
136 vxorps(ymm12, ymm12, ymm12);
137 vxorps(ymm13, ymm13, ymm13);
138 vxorps(ymm14, ymm14, ymm14);
139 vxorps(ymm15, ymm15, ymm15);
/* Four-load step over 48 byte positions.
 *
 * For each of three 16-byte slices (displacements -0x80, -0x70, -0x60) the
 * code loads one vector from each of A1, A1+LDA, A1+2*LDA and A1+LDA3
 * (LDA3 is 3*LDA), interleaves them so that the four bytes sharing a byte
 * position become adjacent, stores the interleaved data to B, and adds the
 * per-position sum of those four bytes (treated as signed) into ymm8..ymm13.
 *
 * Accumulator layout, eight 32-bit lanes each:
 *   ymm8: positions 0-7     ymm9:  positions 8-15
 *   ymm10: positions 16-23  ymm11: positions 24-31
 *   ymm12: positions 32-39  ymm13: positions 40-47
 */
/* ---- slice 0: byte positions 0-15 ---- */
146 vmovdqu(xmm0, xword[A1-0x80]);
147 vmovdqu(xmm1, xword[A1+LDA*1-0x80]);
148 vmovdqu(xmm2, xword[A1+LDA*2-0x80]);
149 vmovdqu(xmm3, xword[A1+LDA3*1-0x80]);
/* Byte interleave of loads 0/1 and of loads 2/3 (low and high halves). */
150 vpunpcklbw(xmm4, xmm0, xmm1);
151 vpunpckhbw(xmm5, xmm0, xmm1);
152 vpunpcklbw(xmm6, xmm2, xmm3);
153 vpunpckhbw(xmm7, xmm2, xmm3);
/* Word interleave of the byte pairs: each result register now holds, for
 * four consecutive byte positions, the 4 bytes from the four loads side by
 * side (xmm0: positions 0-3, xmm1: 4-7, xmm2: 8-11, xmm3: 12-15). */
154 vpunpcklwd(xmm0, xmm4, xmm6);
155 vpunpckhwd(xmm1, xmm4, xmm6);
156 vpunpcklwd(xmm2, xmm5, xmm7);
157 vpunpckhwd(xmm3, xmm5, xmm7);
/* Sum each group of 4 bytes in xmm0: sign-extend bytes to words, then use
 * horizontal adds. vmovhlps copies the upper 8 bytes down so that, after the
 * first vphaddw, the low 128-bit lane of ymm5 holds all 8 pair sums of xmm0.
 * Only the low 128-bit lane is consumed afterwards. */
158 vpmovsxbw(ymm5, xmm0);
159 vmovhlps(xmm6, xmm0, xmm0);
160 vpmovsxbw(ymm6, xmm6);
161 vphaddw(ymm5, ymm5, ymm6);
/* Same pair sums for xmm1, collected in the low lane of ymm6. */
162 vpmovsxbw(ymm6, xmm1);
163 vmovhlps(xmm7, xmm1, xmm1);
164 vpmovsxbw(ymm7, xmm7);
165 vphaddw(ymm6, ymm6, ymm7);
/* Second horizontal add turns pair sums into 4-byte sums: low lane of ymm5
 * = eight 16-bit sums for positions 0-7. */
166 vphaddw(ymm5, ymm5, ymm6);
/* Widen the eight 16-bit sums to 32 bits and accumulate. */
167 vpmovsxwd(ymm5, xmm5);
168 vpaddd(ymm8, ymm8, ymm5);
/* Store the interleaved bytes for positions 0-7. */
169 vmovdqu(xword[B-0x80], xmm0);
170 vmovdqu(xword[B-0x70], xmm1);
/* Same reduction for xmm2/xmm3 (positions 8-15), accumulated into ymm9. */
171 vpmovsxbw(ymm5, xmm2);
172 vmovhlps(xmm6, xmm2, xmm2);
173 vpmovsxbw(ymm6, xmm6);
174 vphaddw(ymm5, ymm5, ymm6);
175 vpmovsxbw(ymm6, xmm3);
176 vmovhlps(xmm7, xmm3, xmm3);
177 vpmovsxbw(ymm7, xmm7);
178 vphaddw(ymm6, ymm6, ymm7);
179 vphaddw(ymm5, ymm5, ymm6);
180 vpmovsxwd(ymm5, xmm5);
181 vpaddd(ymm9, ymm9, ymm5);
182 vmovdqu(xword[B-0x60], xmm2);
183 vmovdqu(xword[B-0x50], xmm3);
/* ---- slice 1: byte positions 16-31 (displacement -0x70) ---- */
184 vmovdqu(xmm0, xword[A1-0x70]);
185 vmovdqu(xmm1, xword[A1+LDA*1-0x70]);
186 vmovdqu(xmm2, xword[A1+LDA*2-0x70]);
187 vmovdqu(xmm3, xword[A1+LDA3*1-0x70]);
188 vpunpcklbw(xmm4, xmm0, xmm1);
189 vpunpckhbw(xmm5, xmm0, xmm1);
190 vpunpcklbw(xmm6, xmm2, xmm3);
191 vpunpckhbw(xmm7, xmm2, xmm3);
192 vpunpcklwd(xmm0, xmm4, xmm6);
193 vpunpckhwd(xmm1, xmm4, xmm6);
194 vpunpcklwd(xmm2, xmm5, xmm7);
195 vpunpckhwd(xmm3, xmm5, xmm7);
/* Positions 16-23 -> ymm10. */
196 vpmovsxbw(ymm5, xmm0);
197 vmovhlps(xmm6, xmm0, xmm0);
198 vpmovsxbw(ymm6, xmm6);
199 vphaddw(ymm5, ymm5, ymm6);
200 vpmovsxbw(ymm6, xmm1);
201 vmovhlps(xmm7, xmm1, xmm1);
202 vpmovsxbw(ymm7, xmm7);
203 vphaddw(ymm6, ymm6, ymm7);
204 vphaddw(ymm5, ymm5, ymm6);
205 vpmovsxwd(ymm5, xmm5);
206 vpaddd(ymm10, ymm10, ymm5);
207 vmovdqu(xword[B-0x40], xmm0);
208 vmovdqu(xword[B-0x30], xmm1);
/* Positions 24-31 -> ymm11. */
209 vpmovsxbw(ymm5, xmm2);
210 vmovhlps(xmm6, xmm2, xmm2);
211 vpmovsxbw(ymm6, xmm6);
212 vphaddw(ymm5, ymm5, ymm6);
213 vpmovsxbw(ymm6, xmm3);
214 vmovhlps(xmm7, xmm3, xmm3);
215 vpmovsxbw(ymm7, xmm7);
216 vphaddw(ymm6, ymm6, ymm7);
217 vphaddw(ymm5, ymm5, ymm6);
218 vpmovsxwd(ymm5, xmm5);
219 vpaddd(ymm11, ymm11, ymm5);
220 vmovdqu(xword[B-0x20], xmm2);
221 vmovdqu(xword[B-0x10], xmm3);
/* ---- slice 2: byte positions 32-47 (displacement -0x60) ---- */
222 vmovdqu(xmm0, xword[A1-0x60]);
223 vmovdqu(xmm1, xword[A1+LDA*1-0x60]);
224 vmovdqu(xmm2, xword[A1+LDA*2-0x60]);
225 vmovdqu(xmm3, xword[A1+LDA3*1-0x60]);
/* All loads for this step are done: advance A1 by 4*LDA. */
226 lea(A1, ptr[A1+LDA*4]);
227 vpunpcklbw(xmm4, xmm0, xmm1);
228 vpunpckhbw(xmm5, xmm0, xmm1);
229 vpunpcklbw(xmm6, xmm2, xmm3);
230 vpunpckhbw(xmm7, xmm2, xmm3);
231 vpunpcklwd(xmm0, xmm4, xmm6);
232 vpunpckhwd(xmm1, xmm4, xmm6);
233 vpunpcklwd(xmm2, xmm5, xmm7);
234 vpunpckhwd(xmm3, xmm5, xmm7);
/* Positions 32-39 -> ymm12. */
235 vpmovsxbw(ymm5, xmm0);
236 vmovhlps(xmm6, xmm0, xmm0);
237 vpmovsxbw(ymm6, xmm6);
238 vphaddw(ymm5, ymm5, ymm6);
239 vpmovsxbw(ymm6, xmm1);
240 vmovhlps(xmm7, xmm1, xmm1);
241 vpmovsxbw(ymm7, xmm7);
242 vphaddw(ymm6, ymm6, ymm7);
243 vphaddw(ymm5, ymm5, ymm6);
244 vpmovsxwd(ymm5, xmm5);
245 vpaddd(ymm12, ymm12, ymm5);
246 vmovdqu(xword[B], xmm0);
247 vmovdqu(xword[B+0x10], xmm1);
/* Positions 40-47 -> ymm13. */
248 vpmovsxbw(ymm5, xmm2);
249 vmovhlps(xmm6, xmm2, xmm2);
250 vpmovsxbw(ymm6, xmm6);
251 vphaddw(ymm5, ymm5, ymm6);
252 vpmovsxbw(ymm6, xmm3);
253 vmovhlps(xmm7, xmm3, xmm3);
254 vpmovsxbw(ymm7, xmm7);
255 vphaddw(ymm6, ymm6, ymm7);
256 vphaddw(ymm5, ymm5, ymm6);
257 vpmovsxwd(ymm5, xmm5);
258 vpaddd(ymm13, ymm13, ymm5);
/* Last two of the twelve 16-byte stores (B-0x80 .. B+0x30, 192 bytes). */
259 vmovdqu(xword[B+0x20], xmm2);
260 vmovdqu(xword[B+0x30], xmm3);
/* Two-load step over 48 byte positions: byte-interleave two sets of three
 * 16-byte vectors, store the interleaved data to B, and add the per-position
 * sum of each byte pair (treated as signed) into ymm8..ymm13. */
/* First set of three vectors. */
269 vmovdqu(xmm0, xword[A1-0x80]);
270 vmovdqu(xmm1, xword[A1-0x70]);
271 vmovdqu(xmm2, xword[A1-0x60]);
/* Second set of three vectors, same displacements.
 * NOTE(review): the listing numbers skip a line before and after this load
 * group; presumably A1 is advanced there so the second set differs from the
 * first -- confirm against the full source. */
273 vmovdqu(xmm6, xword[A1-0x80]);
274 vmovdqu(xmm4, xword[A1-0x70]);
275 vmovdqu(xmm5, xword[A1-0x60]);
/* Slice 0: interleave bytes of xmm0 and xmm6 (xmm3 = positions 0-7,
 * xmm0 = positions 8-15). */
277 vpunpcklbw(xmm3, xmm0, xmm6);
278 vpunpckhbw(xmm0, xmm0, xmm6);
/* Pair sums for positions 0-7: sign-extend to words, horizontal add (the
 * low 128-bit lane then holds all 8 pair sums), widen to dwords, accumulate.
 * xmm6 is reused as scratch from here on. */
279 vpmovsxbw(ymm7, xmm3);
280 vmovhlps(xmm6, xmm3, xmm3);
281 vpmovsxbw(ymm6, xmm6);
282 vphaddw(ymm7, ymm7, ymm6);
283 vpmovsxwd(ymm7, xmm7);
284 vpaddd(ymm8, ymm8, ymm7);
285 vmovdqu(xword[B-0x80], xmm3);
/* Positions 8-15 -> ymm9. */
286 vpmovsxbw(ymm7, xmm0);
287 vmovhlps(xmm6, xmm0, xmm0);
288 vpmovsxbw(ymm6, xmm6);
289 vphaddw(ymm7, ymm7, ymm6);
290 vpmovsxwd(ymm7, xmm7);
291 vpaddd(ymm9, ymm9, ymm7);
292 vmovdqu(xword[B-0x70], xmm0);
/* Slice 1: interleave xmm1 and xmm4. Positions 16-23 -> ymm10. */
293 vpunpcklbw(xmm3, xmm1, xmm4);
294 vpunpckhbw(xmm0, xmm1, xmm4);
295 vpmovsxbw(ymm7, xmm3);
296 vmovhlps(xmm6, xmm3, xmm3);
297 vpmovsxbw(ymm6, xmm6);
298 vphaddw(ymm7, ymm7, ymm6);
299 vpmovsxwd(ymm7, xmm7);
300 vpaddd(ymm10, ymm10, ymm7);
301 vmovdqu(xword[B-0x60], xmm3);
/* Positions 24-31 -> ymm11. */
302 vpmovsxbw(ymm7, xmm0);
303 vmovhlps(xmm6, xmm0, xmm0);
304 vpmovsxbw(ymm6, xmm6);
305 vphaddw(ymm7, ymm7, ymm6);
306 vpmovsxwd(ymm7, xmm7);
307 vpaddd(ymm11, ymm11, ymm7);
308 vmovdqu(xword[B-0x50], xmm0);
/* Slice 2: interleave xmm2 and xmm5. Positions 32-39 -> ymm12. */
309 vpunpcklbw(xmm3, xmm2, xmm5);
310 vpunpckhbw(xmm0, xmm2, xmm5);
311 vpmovsxbw(ymm7, xmm3);
312 vmovhlps(xmm6, xmm3, xmm3);
313 vpmovsxbw(ymm6, xmm6);
314 vphaddw(ymm7, ymm7, ymm6);
315 vpmovsxwd(ymm7, xmm7);
316 vpaddd(ymm12, ymm12, ymm7);
317 vmovdqu(xword[B-0x40], xmm3);
/* Positions 40-47 -> ymm13. */
318 vpmovsxbw(ymm7, xmm0);
319 vmovhlps(xmm6, xmm0, xmm0);
320 vpmovsxbw(ymm6, xmm6);
321 vphaddw(ymm7, ymm7, ymm6);
322 vpmovsxwd(ymm7, xmm7);
323 vpaddd(ymm13, ymm13, ymm7);
/* Last of the six 16-byte stores (B-0x80 .. B-0x30, 96 bytes). */
324 vmovdqu(xword[B-0x30], xmm0);
/* Single-load step over 48 byte positions: no interleave is needed, so the
 * three vectors are copied to B unchanged while each byte (treated as
 * signed) is added to its 32-bit lane in ymm8..ymm13. */
331 vmovdqu(xmm0, xword[A1-0x80]);
332 vmovdqu(xmm1, xword[A1-0x70]);
333 vmovdqu(xmm2, xword[A1-0x60]);
/* Low 8 bytes of xmm0: sign-extend straight to dwords, add to ymm8. */
335 vpmovsxbd(ymm7, xmm0);
336 vpaddd(ymm8, ymm8, ymm7);
/* High 8 bytes of xmm0: move them down with vmovhlps, then add to ymm9. */
337 vmovhlps(xmm7, xmm0, xmm0);
338 vpmovsxbd(ymm7, xmm7);
339 vpaddd(ymm9, ymm9, ymm7);
340 vmovdqu(xword[B-0x80], xmm0);
/* xmm1 -> ymm10 (low half) and ymm11 (high half). */
341 vpmovsxbd(ymm7, xmm1);
342 vpaddd(ymm10, ymm10, ymm7);
343 vmovhlps(xmm7, xmm1, xmm1);
344 vpmovsxbd(ymm7, xmm7);
345 vpaddd(ymm11, ymm11, ymm7);
346 vmovdqu(xword[B-0x70], xmm1);
/* xmm2 -> ymm12 (low half) and ymm13 (high half). */
347 vpmovsxbd(ymm7, xmm2);
348 vpaddd(ymm12, ymm12, ymm7);
349 vmovhlps(xmm7, xmm2, xmm2);
350 vpmovsxbd(ymm7, xmm7);
351 vpaddd(ymm13, ymm13, ymm7);
352 vmovdqu(xword[B-0x60], xmm2);
/* Write out the accumulated sums. A1 is reused as the destination pointer,
 * loaded from the ARG_BIAS slot. */
357 mov(A1, qword[ARG_BIAS]);
/* Six 32-byte stores: ymm8..ymm13 = 48 32-bit sums (0xc0 bytes). */
358 vmovdqu(yword[A1], ymm8);
359 vmovdqu(yword[A1+0x20], ymm9);
360 vmovdqu(yword[A1+0x40], ymm10);
361 vmovdqu(yword[A1+0x60], ymm11);
362 vmovdqu(yword[A1+0x80], ymm12);
363 vmovdqu(yword[A1+0xa0], ymm13);
/* Advance the pointer kept in the ARG_BIAS slot past the 0xc0 bytes written. */
364 add(qword[ARG_BIAS], 0xc0);
393 movdqu(xmm0, xword[A1-0x80]);
394 movdqu(xmm1, xword[A1+LDA*1-0x80]);
395 movdqu(xmm2, xword[A1+LDA*2-0x80]);
396 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
398 punpcklbw(xmm0, xmm1);
399 punpckhbw(xmm4, xmm1);
401 punpcklbw(xmm2, xmm3);
402 punpckhbw(xmm5, xmm3);
404 punpcklwd(xmm0, xmm2);
405 punpckhwd(xmm1, xmm2);
407 punpcklwd(xmm4, xmm5);
408 punpckhwd(xmm2, xmm5);
409 pmovsxbw(xmm5, xmm0);
411 pmovsxbw(xmm6, xmm6);
414 pmovsxwd(xmm5, xmm5);
416 movdqu(xword[B-0x80], xmm0);
417 pmovsxbw(xmm5, xmm1);
419 pmovsxbw(xmm6, xmm6);
422 pmovsxwd(xmm5, xmm5);
424 movdqu(xword[B-0x70], xmm1);
425 pmovsxbw(xmm5, xmm4);
427 pmovsxbw(xmm6, xmm6);
430 pmovsxwd(xmm5, xmm5);
432 movdqu(xword[B-0x60], xmm4);
433 pmovsxbw(xmm5, xmm2);
435 pmovsxbw(xmm6, xmm6);
438 pmovsxwd(xmm5, xmm5);
440 movdqu(xword[B-0x50], xmm2);
441 movdqu(xmm0, xword[A1-0x70]);
442 movdqu(xmm1, xword[A1+LDA*1-0x70]);
443 movdqu(xmm2, xword[A1+LDA*2-0x70]);
444 movdqu(xmm3, xword[A1+LDA3*1-0x70]);
445 lea(A1, ptr[A1+LDA*4]);
447 punpcklbw(xmm0, xmm1);
448 punpckhbw(xmm4, xmm1);
450 punpcklbw(xmm2, xmm3);
451 punpckhbw(xmm5, xmm3);
453 punpcklwd(xmm0, xmm2);
454 punpckhwd(xmm1, xmm2);
456 punpcklwd(xmm4, xmm5);
457 punpckhwd(xmm2, xmm5);
458 pmovsxbw(xmm5, xmm0);
460 pmovsxbw(xmm6, xmm6);
463 pmovsxwd(xmm5, xmm5);
465 movdqu(xword[B-0x40], xmm0);
466 pmovsxbw(xmm5, xmm1);
468 pmovsxbw(xmm6, xmm6);
471 pmovsxwd(xmm5, xmm5);
473 movdqu(xword[B-0x30], xmm1);
474 pmovsxbw(xmm5, xmm4);
476 pmovsxbw(xmm6, xmm6);
479 pmovsxwd(xmm5, xmm5);
481 movdqu(xword[B-0x20], xmm4);
482 pmovsxbw(xmm5, xmm2);
484 pmovsxbw(xmm6, xmm6);
487 pmovsxwd(xmm5, xmm5);
489 movdqu(xword[B-0x10], xmm2);
498 movdqu(xmm0, xword[A1-0x80]);
499 movdqu(xmm1, xword[A1-0x70]);
501 movdqu(xmm2, xword[A1-0x80]);
502 movdqu(xmm3, xword[A1-0x70]);
505 punpcklbw(xmm0, xmm2);
506 punpckhbw(xmm4, xmm2);
507 pmovsxbw(xmm5, xmm0);
509 pmovsxwd(xmm5, xmm5);
512 pmovsxbw(xmm6, xmm6);
514 pmovsxwd(xmm6, xmm6);
516 movdqu(xword[B-0x80], xmm0);
517 pmovsxbw(xmm5, xmm4);
519 pmovsxwd(xmm5, xmm5);
522 pmovsxbw(xmm6, xmm6);
524 pmovsxwd(xmm6, xmm6);
526 movdqu(xword[B-0x70], xmm4);
528 punpcklbw(xmm1, xmm3);
529 punpckhbw(xmm4, xmm3);
530 pmovsxbw(xmm5, xmm1);
532 pmovsxwd(xmm5, xmm5);
535 pmovsxbw(xmm6, xmm6);
537 pmovsxwd(xmm6, xmm6);
539 movdqu(xword[B-0x60], xmm1);
540 pmovsxbw(xmm5, xmm4);
542 pmovsxwd(xmm5, xmm5);
545 pmovsxbw(xmm6, xmm6);
547 pmovsxwd(xmm6, xmm6);
549 movdqu(xword[B-0x50], xmm4);
556 movdqu(xmm0, xword[A1-0x80]);
557 movdqu(xmm1, xword[A1-0x70]);
559 pmovsxbd(xmm5, xmm0);
561 pshufd(xmm6, xmm0, 0x55);
562 pmovsxbd(xmm6, xmm6);
564 pshufd(xmm5, xmm0, 0xaa);
565 pmovsxbd(xmm5, xmm5);
567 pshufd(xmm6, xmm0, 0xff);
568 pmovsxbd(xmm6, xmm6);
570 movdqu(xword[B-0x80], xmm0);
571 pmovsxbd(xmm5, xmm1);
573 pshufd(xmm6, xmm1, 0x55);
574 pmovsxbd(xmm6, xmm6);
576 pshufd(xmm5, xmm1, 0xaa);
577 pmovsxbd(xmm5, xmm5);
579 pshufd(xmm6, xmm1, 0xff);
580 pmovsxbd(xmm6, xmm6);
582 movdqu(xword[B-0x70], xmm1);
/* Write out the sums for this width. A1 is reused as the destination
 * pointer, loaded from the ARG_BIAS slot. */
587 mov(A1, qword[ARG_BIAS]);
/* Eight 16-byte stores: xmm8..xmm15 = 32 32-bit values (0x80 bytes). */
588 movdqu(xword[A1], xmm8);
589 movdqu(xword[A1+0x10], xmm9);
590 movdqu(xword[A1+0x20], xmm10);
591 movdqu(xword[A1+0x30], xmm11);
592 movdqu(xword[A1+0x40], xmm12);
593 movdqu(xword[A1+0x50], xmm13);
594 movdqu(xword[A1+0x60], xmm14);
595 movdqu(xword[A1+0x70], xmm15);
/* Advance the pointer kept in the ARG_BIAS slot past the 0x80 bytes written. */
596 add(qword[ARG_BIAS], 0x80);
620 movdqu(xmm0, xword[A1-0x80]);
622 movdqu(xmm1, xword[A1-0x80]);
624 movdqu(xmm2, xword[A1-0x80]);
626 movdqu(xmm3, xword[A1-0x80]);
629 punpcklbw(xmm0, xmm1);
630 punpckhbw(xmm4, xmm1);
632 punpcklbw(xmm2, xmm3);
633 punpckhbw(xmm1, xmm3);
635 punpcklwd(xmm0, xmm2);
636 punpckhwd(xmm3, xmm2);
638 punpcklwd(xmm4, xmm1);
639 punpckhwd(xmm2, xmm1);
640 pmovsxbw(xmm5, xmm0);
642 pmovsxbw(xmm6, xmm6);
645 pmovsxwd(xmm5, xmm5);
647 pmovsxbw(xmm5, xmm3);
649 pmovsxbw(xmm6, xmm6);
652 pmovsxwd(xmm5, xmm5);
654 movdqu(xword[B-0x80], xmm0);
655 movdqu(xword[B-0x70], xmm3);
656 pmovsxbw(xmm5, xmm4);
658 pmovsxbw(xmm6, xmm6);
661 pmovsxwd(xmm5, xmm5);
663 pmovsxbw(xmm5, xmm2);
665 pmovsxbw(xmm6, xmm6);
668 pmovsxwd(xmm5, xmm5);
670 movdqu(xword[B-0x60], xmm4);
671 movdqu(xword[B-0x50], xmm2);
680 movdqu(xmm0, xword[A1-0x80]);
682 movdqu(xmm1, xword[A1-0x80]);
685 punpcklbw(xmm0, xmm1);
686 punpckhbw(xmm2, xmm1);
687 pmovsxbw(xmm5, xmm0);
689 pmovsxwd(xmm5, xmm5);
692 pmovsxbw(xmm6, xmm6);
694 pmovsxwd(xmm6, xmm6);
696 pmovsxbw(xmm5, xmm2);
698 pmovsxwd(xmm5, xmm5);
701 pmovsxbw(xmm6, xmm6);
703 pmovsxwd(xmm6, xmm6);
705 movdqu(xword[B-0x80], xmm0);
706 movdqu(xword[B-0x70], xmm2);
713 movdqu(xmm0, xword[A1-0x80]);
715 pmovsxbd(xmm5, xmm0);
717 pshufd(xmm6, xmm0, 0x55);
718 pmovsxbd(xmm6, xmm6);
720 pshufd(xmm5, xmm0, 0xaa);
721 pmovsxbd(xmm5, xmm5);
723 pshufd(xmm6, xmm0, 0xff);
724 pmovsxbd(xmm6, xmm6);
726 movdqu(xword[B-0x80], xmm0);
/* Write out the sums for this width. A1 is reused as the destination
 * pointer, loaded from the ARG_BIAS slot. */
731 mov(A1, qword[ARG_BIAS]);
/* Four 16-byte stores: xmm8..xmm11 = 16 32-bit values (0x40 bytes). */
732 movdqu(xword[A1], xmm8);
733 movdqu(xword[A1+0x10], xmm9);
734 movdqu(xword[A1+0x20], xmm10);
735 movdqu(xword[A1+0x30], xmm11);
/* Advance the pointer kept in the ARG_BIAS slot past the 0x40 bytes written. */
736 add(qword[ARG_BIAS], 0x40);
758 movq(xmm0, qword[A1-0x80]);
760 movq(xmm1, qword[A1-0x80]);
762 movq(xmm2, qword[A1-0x80]);
764 movq(xmm3, qword[A1-0x80]);
766 punpcklbw(xmm0, xmm1);
767 punpcklbw(xmm2, xmm3);
769 punpcklwd(xmm0, xmm2);
770 punpckhwd(xmm1, xmm2);
771 pmovsxbw(xmm5, xmm0);
773 pmovsxbw(xmm6, xmm6);
776 pmovsxwd(xmm5, xmm5);
778 pmovsxbw(xmm5, xmm1);
780 pmovsxbw(xmm6, xmm6);
783 pmovsxwd(xmm5, xmm5);
785 movdqu(xword[B-0x80], xmm0);
786 movdqu(xword[B-0x70], xmm1);
787 movq(xmm0, qword[A1-0x80]);
789 movq(xmm1, qword[A1-0x80]);
791 movq(xmm2, qword[A1-0x80]);
793 movq(xmm3, qword[A1-0x80]);
795 punpcklbw(xmm0, xmm1);
796 punpcklbw(xmm2, xmm3);
798 punpcklwd(xmm0, xmm2);
799 punpckhwd(xmm1, xmm2);
800 pmovsxbw(xmm5, xmm0);
802 pmovsxbw(xmm6, xmm6);
805 pmovsxwd(xmm5, xmm5);
807 pmovsxbw(xmm5, xmm1);
809 pmovsxbw(xmm6, xmm6);
812 pmovsxwd(xmm5, xmm5);
814 movdqu(xword[B-0x60], xmm0);
815 movdqu(xword[B-0x50], xmm1);
824 movq(xmm0, qword[A1-0x80]);
826 movq(xmm1, qword[A1-0x80]);
828 movq(xmm2, qword[A1-0x80]);
830 movq(xmm3, qword[A1-0x80]);
832 punpcklbw(xmm0, xmm1);
833 punpcklbw(xmm2, xmm3);
835 punpcklwd(xmm0, xmm2);
836 punpckhwd(xmm1, xmm2);
837 pmovsxbw(xmm5, xmm0);
839 pmovsxbw(xmm6, xmm6);
842 pmovsxwd(xmm5, xmm5);
844 pmovsxbw(xmm5, xmm1);
846 pmovsxbw(xmm6, xmm6);
849 pmovsxwd(xmm5, xmm5);
851 movdqu(xword[B-0x80], xmm0);
852 movdqu(xword[B-0x70], xmm1);
859 movq(xmm0, qword[A1-0x80]);
861 movq(xmm1, qword[A1-0x80]);
863 punpcklbw(xmm0, xmm1);
864 pmovsxbw(xmm5, xmm0);
866 pmovsxwd(xmm5, xmm5);
869 pmovsxbw(xmm6, xmm6);
871 pmovsxwd(xmm6, xmm6);
873 movdqu(xword[B-0x80], xmm0);
880 movq(xmm0, qword[A1-0x80]);
882 pmovsxbd(xmm5, xmm0);
883 pshufd(xmm6, xmm0, 0x55);
884 pmovsxbd(xmm6, xmm6);
887 movq(qword[B-0x80], xmm0);
/* Write out the sums for this width. A1 is reused as the destination
 * pointer, loaded from the ARG_BIAS slot. */
892 mov(A1, qword[ARG_BIAS]);
/* Two 16-byte stores: xmm8 and xmm9 = 8 32-bit values (0x20 bytes). */
893 movdqu(xword[A1], xmm8);
894 movdqu(xword[A1+0x10], xmm9);
/* Advance the pointer kept in the ARG_BIAS slot past the 0x20 bytes written. */
895 add(qword[ARG_BIAS], 0x20);
916 movd(xmm0, dword[A1-0x80]);
918 movd(xmm1, dword[A1-0x80]);
920 movd(xmm2, dword[A1-0x80]);
922 movd(xmm3, dword[A1-0x80]);
924 punpcklbw(xmm0, xmm1);
925 punpcklbw(xmm2, xmm3);
926 punpcklwd(xmm0, xmm2);
927 pmovsxbw(xmm5, xmm0);
929 pmovsxbw(xmm6, xmm6);
932 pmovsxwd(xmm5, xmm5);
934 movdqu(xword[B-0x80], xmm0);
935 movd(xmm0, dword[A1-0x80]);
937 movd(xmm1, dword[A1-0x80]);
939 movd(xmm2, dword[A1-0x80]);
941 movd(xmm3, dword[A1-0x80]);
943 punpcklbw(xmm0, xmm1);
944 punpcklbw(xmm2, xmm3);
945 punpcklwd(xmm0, xmm2);
946 pmovsxbw(xmm5, xmm0);
948 pmovsxbw(xmm6, xmm6);
951 pmovsxwd(xmm5, xmm5);
953 movdqu(xword[B-0x70], xmm0);
962 movd(xmm0, dword[A1-0x80]);
964 movd(xmm1, dword[A1-0x80]);
966 movd(xmm2, dword[A1-0x80]);
968 movd(xmm3, dword[A1-0x80]);
970 punpcklbw(xmm0, xmm1);
971 punpcklbw(xmm2, xmm3);
972 punpcklwd(xmm0, xmm2);
973 pmovsxbw(xmm5, xmm0);
975 pmovsxbw(xmm6, xmm6);
978 pmovsxwd(xmm5, xmm5);
980 movdqu(xword[B-0x80], xmm0);
987 movd(xmm0, dword[A1-0x80]);
989 movd(xmm1, dword[A1-0x80]);
991 punpcklbw(xmm0, xmm1);
992 pmovsxbw(xmm5, xmm0);
994 pmovsxwd(xmm5, xmm5);
996 movq(qword[B-0x80], xmm0);
1003 movd(xmm0, dword[A1-0x80]);
1004 pmovsxbd(xmm5, xmm0);
1006 movd(dword[B-0x80], xmm0);
/* Write out the sums for this width. A1 is reused as the destination
 * pointer, loaded from the ARG_BIAS slot. */
1011 mov(A1, qword[ARG_BIAS]);
/* One 16-byte store of xmm7 = 4 32-bit values (this width uses xmm7, not
 * xmm8, as the stored register). */
1012 movdqu(xword[A1], xmm7);
/* Advance the pointer kept in the ARG_BIAS slot past the 0x10 bytes written. */
1013 add(qword[ARG_BIAS], 0x10);
1034 mov(ax, word[A1-0x80]);
1036 pinsrw(xmm0, eax, 0x0);
1037 mov(ax, word[A1-0x80]);
1039 pinsrw(xmm1, eax, 0x0);
1040 mov(ax, word[A1-0x80]);
1042 pinsrw(xmm2, eax, 0x0);
1043 mov(ax, word[A1-0x80]);
1045 pinsrw(xmm3, eax, 0x0);
1046 punpcklbw(xmm0, xmm1);
1047 punpcklbw(xmm2, xmm3);
1048 punpcklwd(xmm0, xmm2);
1049 mov(ax, word[A1-0x80]);
1051 pinsrw(xmm1, eax, 0x0);
1052 mov(ax, word[A1-0x80]);
1054 pinsrw(xmm2, eax, 0x0);
1055 mov(ax, word[A1-0x80]);
1057 pinsrw(xmm3, eax, 0x0);
1058 mov(ax, word[A1-0x80]);
1060 pinsrw(xmm4, eax, 0x0);
1061 punpcklbw(xmm1, xmm2);
1062 punpcklbw(xmm3, xmm4);
1063 punpcklwd(xmm1, xmm3);
1064 punpcklqdq(xmm0, xmm1);
1065 pshufd(xmm6, xmm0, 0xd8);
1066 pmovsxbw(xmm5, xmm6);
1067 movhlps(xmm6, xmm6);
1068 pmovsxbw(xmm6, xmm6);
1072 pmovsxwd(xmm5, xmm5);
1074 movdqu(xword[B-0x80], xmm0);
1083 mov(ax, word[A1-0x80]);
1085 pinsrw(xmm0, eax, 0x0);
1086 mov(ax, word[A1-0x80]);
1088 pinsrw(xmm1, eax, 0x0);
1089 mov(ax, word[A1-0x80]);
1091 pinsrw(xmm2, eax, 0x0);
1092 mov(ax, word[A1-0x80]);
1094 pinsrw(xmm3, eax, 0x0);
1095 punpcklbw(xmm0, xmm1);
1096 punpcklbw(xmm2, xmm3);
1097 punpcklwd(xmm0, xmm2);
1098 pmovsxbw(xmm5, xmm0);
1101 pmovsxwd(xmm5, xmm5);
1103 movq(qword[B-0x80], xmm0);
1110 mov(ax, word[A1-0x80]);
1112 pinsrw(xmm0, eax, 0x0);
1113 mov(ax, word[A1-0x80]);
1115 pinsrw(xmm1, eax, 0x0);
1116 punpcklbw(xmm0, xmm1);
1117 pmovsxbw(xmm5, xmm0);
1119 pmovsxwd(xmm5, xmm5);
1121 movd(dword[B-0x80], xmm0);
1128 mov(ax, word[A1-0x80]);
1129 pinsrw(xmm0, eax, 0x0);
1130 pmovsxbd(xmm5, xmm0);
1132 mov(word[B-0x80], ax);
/* Write out the sums for this width. A1 is reused as the destination
 * pointer, loaded from the ARG_BIAS slot. */
1137 mov(A1, qword[ARG_BIAS]);
/* Store only the low 8 bytes of xmm7 = 2 32-bit values. */
1138 movq(qword[A1], xmm7);
/* Advance the pointer kept in the ARG_BIAS slot past the 8 bytes written. */
1139 add(qword[ARG_BIAS], 0x8);
1160 mov(al, byte[A1-0x80]);
1162 pinsrb(xmm0, eax, 0x0);
1163 mov(al, byte[A1-0x80]);
1165 pinsrb(xmm0, eax, 0x1);
1166 mov(al, byte[A1-0x80]);
1168 pinsrb(xmm0, eax, 0x2);
1169 mov(al, byte[A1-0x80]);
1171 pinsrb(xmm0, eax, 0x3);
1172 mov(al, byte[A1-0x80]);
1174 pinsrb(xmm0, eax, 0x4);
1175 mov(al, byte[A1-0x80]);
1177 pinsrb(xmm0, eax, 0x5);
1178 mov(al, byte[A1-0x80]);
1180 pinsrb(xmm0, eax, 0x6);
1181 mov(al, byte[A1-0x80]);
1183 pinsrb(xmm0, eax, 0x7);
1184 pmovsxbw(xmm5, xmm0);
1188 pmovsxwd(xmm5, xmm5);
1190 movq(qword[B-0x80], xmm0);
1199 mov(al, byte[A1-0x80]);
1201 pinsrb(xmm0, eax, 0x0);
1202 mov(al, byte[A1-0x80]);
1204 pinsrb(xmm0, eax, 0x1);
1205 mov(al, byte[A1-0x80]);
1207 pinsrb(xmm0, eax, 0x2);
1208 mov(al, byte[A1-0x80]);
1210 pinsrb(xmm0, eax, 0x3);
1211 pmovsxbw(xmm5, xmm0);
1214 pmovsxwd(xmm5, xmm5);
1216 movd(dword[B-0x80], xmm0);
1223 mov(al, byte[A1-0x80]);
1225 pinsrb(xmm0, eax, 0x0);
1226 mov(byte[B-0x80], al);
1227 mov(al, byte[A1-0x80]);
1229 pinsrb(xmm0, eax, 0x1);
1230 pmovsxbw(xmm5, xmm0);
1232 pmovsxwd(xmm5, xmm5);
1234 mov(byte[B-0x7f], al);
1241 mov(al, byte[A1-0x80]);
1242 pinsrw(xmm0, eax, 0x0);
1243 pmovsxbd(xmm5, xmm0);
1245 mov(byte[B-0x80], al);
/* Write out the sum for this width. A1 is reused as the destination
 * pointer, loaded from the ARG_BIAS slot. */
1250 mov(A1, qword[ARG_BIAS]);
/* Store only the low 4 bytes of xmm7 = a single 32-bit value. */
1251 movd(dword[A1], xmm7);
/* Advance the pointer kept in the ARG_BIAS slot past the 4 bytes written. */
1252 add(qword[ARG_BIAS], 0x4);