1 /*******************************************************************************
2 * Copyright 2018 Intel Corporation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
17 #include "jit_generator.hpp"
24 jit_avx512_core_u8_copy_at_kern::jit_avx512_core_u8_copy_at_kern(): jit_generator(nullptr, GEMM_CODE_SIZE)
54 #define ARG_ALPHA 40+stacksize+rsp
55 #define ARG_B 48+stacksize+rsp
121 auto stacksize = get_size_of_abi_save_regs();
122 mov(ALPHA, ptr[ARG_ALPHA]);
128 mov(LDA, qword[LDA]);
131 lea(LDA3, ptr[LDA+LDA*2]);
140 lea(I, ptr[I+LDA*8]);
141 lea(I, ptr[I+LDA*8]);
149 movdqu(xmm0, xword[A1-0x80]);
150 movdqu(xmm1, xword[A1+LDA*1-0x80]);
151 movdqu(xmm2, xword[A1+LDA*2-0x80]);
152 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
153 lea(A2, ptr[A1+LDA*4]);
155 punpckldq(xmm0, xmm1);
156 punpckhdq(xmm4, xmm1);
158 punpckldq(xmm2, xmm3);
159 punpckhdq(xmm5, xmm3);
161 punpcklqdq(xmm0, xmm2);
162 punpckhqdq(xmm1, xmm2);
164 punpcklqdq(xmm4, xmm5);
165 punpckhqdq(xmm3, xmm5);
166 movdqu(xword[B-0x80], xmm0);
167 movdqu(xword[B+0x40], xmm1);
168 movdqu(xword[B+0x100], xmm4);
169 movdqu(xword[B+0x1c0], xmm3);
170 movdqu(xmm0, xword[A2-0x80]);
171 movdqu(xmm1, xword[A2+LDA*1-0x80]);
172 movdqu(xmm2, xword[A2+LDA*2-0x80]);
173 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
174 lea(A2, ptr[A2+LDA*4]);
176 punpckldq(xmm0, xmm1);
177 punpckhdq(xmm4, xmm1);
179 punpckldq(xmm2, xmm3);
180 punpckhdq(xmm5, xmm3);
182 punpcklqdq(xmm0, xmm2);
183 punpckhqdq(xmm1, xmm2);
185 punpcklqdq(xmm4, xmm5);
186 punpckhqdq(xmm3, xmm5);
187 movdqu(xword[B-0x70], xmm0);
188 movdqu(xword[B+0x50], xmm1);
189 movdqu(xword[B+0x110], xmm4);
190 movdqu(xword[B+0x1d0], xmm3);
191 movdqu(xmm0, xword[A2-0x80]);
192 movdqu(xmm1, xword[A2+LDA*1-0x80]);
193 movdqu(xmm2, xword[A2+LDA*2-0x80]);
194 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
195 lea(A2, ptr[A2+LDA*4]);
197 punpckldq(xmm0, xmm1);
198 punpckhdq(xmm4, xmm1);
200 punpckldq(xmm2, xmm3);
201 punpckhdq(xmm5, xmm3);
203 punpcklqdq(xmm0, xmm2);
204 punpckhqdq(xmm1, xmm2);
206 punpcklqdq(xmm4, xmm5);
207 punpckhqdq(xmm3, xmm5);
208 movdqu(xword[B-0x60], xmm0);
209 movdqu(xword[B+0x60], xmm1);
210 movdqu(xword[B+0x120], xmm4);
211 movdqu(xword[B+0x1e0], xmm3);
212 movdqu(xmm0, xword[A2-0x80]);
213 movdqu(xmm1, xword[A2+LDA*1-0x80]);
214 movdqu(xmm2, xword[A2+LDA*2-0x80]);
215 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
216 lea(A2, ptr[A2+LDA*4]);
218 punpckldq(xmm0, xmm1);
219 punpckhdq(xmm4, xmm1);
221 punpckldq(xmm2, xmm3);
222 punpckhdq(xmm5, xmm3);
224 punpcklqdq(xmm0, xmm2);
225 punpckhqdq(xmm1, xmm2);
227 punpcklqdq(xmm4, xmm5);
228 punpckhqdq(xmm3, xmm5);
229 movdqu(xword[B-0x50], xmm0);
230 movdqu(xword[B+0x70], xmm1);
231 movdqu(xword[B+0x130], xmm4);
232 movdqu(xword[B+0x1f0], xmm3);
233 movdqu(xmm0, xword[A2-0x80]);
234 movdqu(xmm1, xword[A2+LDA*1-0x80]);
235 movdqu(xmm2, xword[A2+LDA*2-0x80]);
236 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
237 lea(A2, ptr[A2+LDA*4]);
239 punpckldq(xmm0, xmm1);
240 punpckhdq(xmm4, xmm1);
242 punpckldq(xmm2, xmm3);
243 punpckhdq(xmm5, xmm3);
245 punpcklqdq(xmm0, xmm2);
246 punpckhqdq(xmm1, xmm2);
248 punpcklqdq(xmm4, xmm5);
249 punpckhqdq(xmm3, xmm5);
250 movdqu(xword[B-0x40], xmm0);
251 movdqu(xword[B+0x80], xmm1);
252 movdqu(xword[B+0x140], xmm4);
253 movdqu(xword[B+0x200], xmm3);
254 movdqu(xmm0, xword[A2-0x80]);
255 movdqu(xmm1, xword[A2+LDA*1-0x80]);
256 movdqu(xmm2, xword[A2+LDA*2-0x80]);
257 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
258 lea(A2, ptr[A2+LDA*4]);
260 punpckldq(xmm0, xmm1);
261 punpckhdq(xmm4, xmm1);
263 punpckldq(xmm2, xmm3);
264 punpckhdq(xmm5, xmm3);
266 punpcklqdq(xmm0, xmm2);
267 punpckhqdq(xmm1, xmm2);
269 punpcklqdq(xmm4, xmm5);
270 punpckhqdq(xmm3, xmm5);
271 movdqu(xword[B-0x30], xmm0);
272 movdqu(xword[B+0x90], xmm1);
273 movdqu(xword[B+0x150], xmm4);
274 movdqu(xword[B+0x210], xmm3);
275 movdqu(xmm0, xword[A2-0x80]);
276 movdqu(xmm1, xword[A2+LDA*1-0x80]);
277 movdqu(xmm2, xword[A2+LDA*2-0x80]);
278 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
279 lea(A2, ptr[A2+LDA*4]);
281 punpckldq(xmm0, xmm1);
282 punpckhdq(xmm4, xmm1);
284 punpckldq(xmm2, xmm3);
285 punpckhdq(xmm5, xmm3);
287 punpcklqdq(xmm0, xmm2);
288 punpckhqdq(xmm1, xmm2);
290 punpcklqdq(xmm4, xmm5);
291 punpckhqdq(xmm3, xmm5);
292 movdqu(xword[B-0x20], xmm0);
293 movdqu(xword[B+0xa0], xmm1);
294 movdqu(xword[B+0x160], xmm4);
295 movdqu(xword[B+0x220], xmm3);
296 movdqu(xmm0, xword[A2-0x80]);
297 movdqu(xmm1, xword[A2+LDA*1-0x80]);
298 movdqu(xmm2, xword[A2+LDA*2-0x80]);
299 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
300 lea(A2, ptr[A2+LDA*4]);
302 punpckldq(xmm0, xmm1);
303 punpckhdq(xmm4, xmm1);
305 punpckldq(xmm2, xmm3);
306 punpckhdq(xmm5, xmm3);
308 punpcklqdq(xmm0, xmm2);
309 punpckhqdq(xmm1, xmm2);
311 punpcklqdq(xmm4, xmm5);
312 punpckhqdq(xmm3, xmm5);
313 movdqu(xword[B-0x10], xmm0);
314 movdqu(xword[B+0xb0], xmm1);
315 movdqu(xword[B+0x170], xmm4);
316 movdqu(xword[B+0x230], xmm3);
317 movdqu(xmm0, xword[A2-0x80]);
318 movdqu(xmm1, xword[A2+LDA*1-0x80]);
319 movdqu(xmm2, xword[A2+LDA*2-0x80]);
320 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
321 lea(A2, ptr[A2+LDA*4]);
323 punpckldq(xmm0, xmm1);
324 punpckhdq(xmm4, xmm1);
326 punpckldq(xmm2, xmm3);
327 punpckhdq(xmm5, xmm3);
329 punpcklqdq(xmm0, xmm2);
330 punpckhqdq(xmm1, xmm2);
332 punpcklqdq(xmm4, xmm5);
333 punpckhqdq(xmm3, xmm5);
334 movdqu(xword[B], xmm0);
335 movdqu(xword[B+0xc0], xmm1);
336 movdqu(xword[B+0x180], xmm4);
337 movdqu(xword[B+0x240], xmm3);
338 movdqu(xmm0, xword[A2-0x80]);
339 movdqu(xmm1, xword[A2+LDA*1-0x80]);
340 movdqu(xmm2, xword[A2+LDA*2-0x80]);
341 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
342 lea(A2, ptr[A2+LDA*4]);
344 punpckldq(xmm0, xmm1);
345 punpckhdq(xmm4, xmm1);
347 punpckldq(xmm2, xmm3);
348 punpckhdq(xmm5, xmm3);
350 punpcklqdq(xmm0, xmm2);
351 punpckhqdq(xmm1, xmm2);
353 punpcklqdq(xmm4, xmm5);
354 punpckhqdq(xmm3, xmm5);
355 movdqu(xword[B+0x10], xmm0);
356 movdqu(xword[B+0xd0], xmm1);
357 movdqu(xword[B+0x190], xmm4);
358 movdqu(xword[B+0x250], xmm3);
359 movdqu(xmm0, xword[A2-0x80]);
360 movdqu(xmm1, xword[A2+LDA*1-0x80]);
361 movdqu(xmm2, xword[A2+LDA*2-0x80]);
362 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
363 lea(A2, ptr[A2+LDA*4]);
365 punpckldq(xmm0, xmm1);
366 punpckhdq(xmm4, xmm1);
368 punpckldq(xmm2, xmm3);
369 punpckhdq(xmm5, xmm3);
371 punpcklqdq(xmm0, xmm2);
372 punpckhqdq(xmm1, xmm2);
374 punpcklqdq(xmm4, xmm5);
375 punpckhqdq(xmm3, xmm5);
376 movdqu(xword[B+0x20], xmm0);
377 movdqu(xword[B+0xe0], xmm1);
378 movdqu(xword[B+0x1a0], xmm4);
379 movdqu(xword[B+0x260], xmm3);
380 movdqu(xmm0, xword[A2-0x80]);
381 movdqu(xmm1, xword[A2+LDA*1-0x80]);
382 movdqu(xmm2, xword[A2+LDA*2-0x80]);
383 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
384 lea(A2, ptr[A2+LDA*4]);
386 punpckldq(xmm0, xmm1);
387 punpckhdq(xmm4, xmm1);
389 punpckldq(xmm2, xmm3);
390 punpckhdq(xmm5, xmm3);
392 punpcklqdq(xmm0, xmm2);
393 punpckhqdq(xmm1, xmm2);
395 punpcklqdq(xmm4, xmm5);
396 punpckhqdq(xmm3, xmm5);
397 movdqu(xword[B+0x30], xmm0);
398 movdqu(xword[B+0xf0], xmm1);
399 movdqu(xword[B+0x1b0], xmm4);
400 movdqu(xword[B+0x270], xmm3);
410 movq(xmm0, qword[A1-0x80]);
411 movq(xmm1, qword[A1+LDA*1-0x80]);
412 movq(xmm2, qword[A1+LDA*2-0x80]);
413 movq(xmm3, qword[A1+LDA3*1-0x80]);
414 lea(A2, ptr[A1+LDA*4]);
415 punpckldq(xmm0, xmm1);
416 punpckldq(xmm2, xmm3);
418 punpcklqdq(xmm0, xmm2);
419 punpckhqdq(xmm1, xmm2);
420 movdqu(xword[B-0x80], xmm0);
421 movdqu(xword[B+0x40], xmm1);
422 movq(xmm0, qword[A2-0x80]);
423 movq(xmm1, qword[A2+LDA*1-0x80]);
424 movq(xmm2, qword[A2+LDA*2-0x80]);
425 movq(xmm3, qword[A2+LDA3*1-0x80]);
426 lea(A2, ptr[A2+LDA*4]);
427 punpckldq(xmm0, xmm1);
428 punpckldq(xmm2, xmm3);
430 punpcklqdq(xmm0, xmm2);
431 punpckhqdq(xmm1, xmm2);
432 movdqu(xword[B-0x70], xmm0);
433 movdqu(xword[B+0x50], xmm1);
434 movq(xmm0, qword[A2-0x80]);
435 movq(xmm1, qword[A2+LDA*1-0x80]);
436 movq(xmm2, qword[A2+LDA*2-0x80]);
437 movq(xmm3, qword[A2+LDA3*1-0x80]);
438 lea(A2, ptr[A2+LDA*4]);
439 punpckldq(xmm0, xmm1);
440 punpckldq(xmm2, xmm3);
442 punpcklqdq(xmm0, xmm2);
443 punpckhqdq(xmm1, xmm2);
444 movdqu(xword[B-0x60], xmm0);
445 movdqu(xword[B+0x60], xmm1);
446 movq(xmm0, qword[A2-0x80]);
447 movq(xmm1, qword[A2+LDA*1-0x80]);
448 movq(xmm2, qword[A2+LDA*2-0x80]);
449 movq(xmm3, qword[A2+LDA3*1-0x80]);
450 lea(A2, ptr[A2+LDA*4]);
451 punpckldq(xmm0, xmm1);
452 punpckldq(xmm2, xmm3);
454 punpcklqdq(xmm0, xmm2);
455 punpckhqdq(xmm1, xmm2);
456 movdqu(xword[B-0x50], xmm0);
457 movdqu(xword[B+0x70], xmm1);
458 movq(xmm0, qword[A2-0x80]);
459 movq(xmm1, qword[A2+LDA*1-0x80]);
460 movq(xmm2, qword[A2+LDA*2-0x80]);
461 movq(xmm3, qword[A2+LDA3*1-0x80]);
462 lea(A2, ptr[A2+LDA*4]);
463 punpckldq(xmm0, xmm1);
464 punpckldq(xmm2, xmm3);
466 punpcklqdq(xmm0, xmm2);
467 punpckhqdq(xmm1, xmm2);
468 movdqu(xword[B-0x40], xmm0);
469 movdqu(xword[B+0x80], xmm1);
470 movq(xmm0, qword[A2-0x80]);
471 movq(xmm1, qword[A2+LDA*1-0x80]);
472 movq(xmm2, qword[A2+LDA*2-0x80]);
473 movq(xmm3, qword[A2+LDA3*1-0x80]);
474 lea(A2, ptr[A2+LDA*4]);
475 punpckldq(xmm0, xmm1);
476 punpckldq(xmm2, xmm3);
478 punpcklqdq(xmm0, xmm2);
479 punpckhqdq(xmm1, xmm2);
480 movdqu(xword[B-0x30], xmm0);
481 movdqu(xword[B+0x90], xmm1);
482 movq(xmm0, qword[A2-0x80]);
483 movq(xmm1, qword[A2+LDA*1-0x80]);
484 movq(xmm2, qword[A2+LDA*2-0x80]);
485 movq(xmm3, qword[A2+LDA3*1-0x80]);
486 lea(A2, ptr[A2+LDA*4]);
487 punpckldq(xmm0, xmm1);
488 punpckldq(xmm2, xmm3);
490 punpcklqdq(xmm0, xmm2);
491 punpckhqdq(xmm1, xmm2);
492 movdqu(xword[B-0x20], xmm0);
493 movdqu(xword[B+0xa0], xmm1);
494 movq(xmm0, qword[A2-0x80]);
495 movq(xmm1, qword[A2+LDA*1-0x80]);
496 movq(xmm2, qword[A2+LDA*2-0x80]);
497 movq(xmm3, qword[A2+LDA3*1-0x80]);
498 lea(A2, ptr[A2+LDA*4]);
499 punpckldq(xmm0, xmm1);
500 punpckldq(xmm2, xmm3);
502 punpcklqdq(xmm0, xmm2);
503 punpckhqdq(xmm1, xmm2);
504 movdqu(xword[B-0x10], xmm0);
505 movdqu(xword[B+0xb0], xmm1);
506 movq(xmm0, qword[A2-0x80]);
507 movq(xmm1, qword[A2+LDA*1-0x80]);
508 movq(xmm2, qword[A2+LDA*2-0x80]);
509 movq(xmm3, qword[A2+LDA3*1-0x80]);
510 lea(A2, ptr[A2+LDA*4]);
511 punpckldq(xmm0, xmm1);
512 punpckldq(xmm2, xmm3);
514 punpcklqdq(xmm0, xmm2);
515 punpckhqdq(xmm1, xmm2);
516 movdqu(xword[B], xmm0);
517 movdqu(xword[B+0xc0], xmm1);
518 movq(xmm0, qword[A2-0x80]);
519 movq(xmm1, qword[A2+LDA*1-0x80]);
520 movq(xmm2, qword[A2+LDA*2-0x80]);
521 movq(xmm3, qword[A2+LDA3*1-0x80]);
522 lea(A2, ptr[A2+LDA*4]);
523 punpckldq(xmm0, xmm1);
524 punpckldq(xmm2, xmm3);
526 punpcklqdq(xmm0, xmm2);
527 punpckhqdq(xmm1, xmm2);
528 movdqu(xword[B+0x10], xmm0);
529 movdqu(xword[B+0xd0], xmm1);
530 movq(xmm0, qword[A2-0x80]);
531 movq(xmm1, qword[A2+LDA*1-0x80]);
532 movq(xmm2, qword[A2+LDA*2-0x80]);
533 movq(xmm3, qword[A2+LDA3*1-0x80]);
534 lea(A2, ptr[A2+LDA*4]);
535 punpckldq(xmm0, xmm1);
536 punpckldq(xmm2, xmm3);
538 punpcklqdq(xmm0, xmm2);
539 punpckhqdq(xmm1, xmm2);
540 movdqu(xword[B+0x20], xmm0);
541 movdqu(xword[B+0xe0], xmm1);
542 movq(xmm0, qword[A2-0x80]);
543 movq(xmm1, qword[A2+LDA*1-0x80]);
544 movq(xmm2, qword[A2+LDA*2-0x80]);
545 movq(xmm3, qword[A2+LDA3*1-0x80]);
546 lea(A2, ptr[A2+LDA*4]);
547 punpckldq(xmm0, xmm1);
548 punpckldq(xmm2, xmm3);
550 punpcklqdq(xmm0, xmm2);
551 punpckhqdq(xmm1, xmm2);
552 movdqu(xword[B+0x30], xmm0);
553 movdqu(xword[B+0xf0], xmm1);
561 movd(xmm0, dword[A1-0x80]);
562 movd(xmm1, dword[A1+LDA*1-0x80]);
563 movd(xmm2, dword[A1+LDA*2-0x80]);
564 movd(xmm3, dword[A1+LDA3*1-0x80]);
565 lea(A2, ptr[A1+LDA*4]);
566 punpckldq(xmm0, xmm1);
567 punpckldq(xmm2, xmm3);
568 punpcklqdq(xmm0, xmm2);
569 movdqu(xword[B-0x80], xmm0);
570 movd(xmm0, dword[A2-0x80]);
571 movd(xmm1, dword[A2+LDA*1-0x80]);
572 movd(xmm2, dword[A2+LDA*2-0x80]);
573 movd(xmm3, dword[A2+LDA3*1-0x80]);
574 lea(A2, ptr[A2+LDA*4]);
575 punpckldq(xmm0, xmm1);
576 punpckldq(xmm2, xmm3);
577 punpcklqdq(xmm0, xmm2);
578 movdqu(xword[B-0x70], xmm0);
579 movd(xmm0, dword[A2-0x80]);
580 movd(xmm1, dword[A2+LDA*1-0x80]);
581 movd(xmm2, dword[A2+LDA*2-0x80]);
582 movd(xmm3, dword[A2+LDA3*1-0x80]);
583 lea(A2, ptr[A2+LDA*4]);
584 punpckldq(xmm0, xmm1);
585 punpckldq(xmm2, xmm3);
586 punpcklqdq(xmm0, xmm2);
587 movdqu(xword[B-0x60], xmm0);
588 movd(xmm0, dword[A2-0x80]);
589 movd(xmm1, dword[A2+LDA*1-0x80]);
590 movd(xmm2, dword[A2+LDA*2-0x80]);
591 movd(xmm3, dword[A2+LDA3*1-0x80]);
592 lea(A2, ptr[A2+LDA*4]);
593 punpckldq(xmm0, xmm1);
594 punpckldq(xmm2, xmm3);
595 punpcklqdq(xmm0, xmm2);
596 movdqu(xword[B-0x50], xmm0);
597 movd(xmm0, dword[A2-0x80]);
598 movd(xmm1, dword[A2+LDA*1-0x80]);
599 movd(xmm2, dword[A2+LDA*2-0x80]);
600 movd(xmm3, dword[A2+LDA3*1-0x80]);
601 lea(A2, ptr[A2+LDA*4]);
602 punpckldq(xmm0, xmm1);
603 punpckldq(xmm2, xmm3);
604 punpcklqdq(xmm0, xmm2);
605 movdqu(xword[B-0x40], xmm0);
606 movd(xmm0, dword[A2-0x80]);
607 movd(xmm1, dword[A2+LDA*1-0x80]);
608 movd(xmm2, dword[A2+LDA*2-0x80]);
609 movd(xmm3, dword[A2+LDA3*1-0x80]);
610 lea(A2, ptr[A2+LDA*4]);
611 punpckldq(xmm0, xmm1);
612 punpckldq(xmm2, xmm3);
613 punpcklqdq(xmm0, xmm2);
614 movdqu(xword[B-0x30], xmm0);
615 movd(xmm0, dword[A2-0x80]);
616 movd(xmm1, dword[A2+LDA*1-0x80]);
617 movd(xmm2, dword[A2+LDA*2-0x80]);
618 movd(xmm3, dword[A2+LDA3*1-0x80]);
619 lea(A2, ptr[A2+LDA*4]);
620 punpckldq(xmm0, xmm1);
621 punpckldq(xmm2, xmm3);
622 punpcklqdq(xmm0, xmm2);
623 movdqu(xword[B-0x20], xmm0);
624 movd(xmm0, dword[A2-0x80]);
625 movd(xmm1, dword[A2+LDA*1-0x80]);
626 movd(xmm2, dword[A2+LDA*2-0x80]);
627 movd(xmm3, dword[A2+LDA3*1-0x80]);
628 lea(A2, ptr[A2+LDA*4]);
629 punpckldq(xmm0, xmm1);
630 punpckldq(xmm2, xmm3);
631 punpcklqdq(xmm0, xmm2);
632 movdqu(xword[B-0x10], xmm0);
633 movd(xmm0, dword[A2-0x80]);
634 movd(xmm1, dword[A2+LDA*1-0x80]);
635 movd(xmm2, dword[A2+LDA*2-0x80]);
636 movd(xmm3, dword[A2+LDA3*1-0x80]);
637 lea(A2, ptr[A2+LDA*4]);
638 punpckldq(xmm0, xmm1);
639 punpckldq(xmm2, xmm3);
640 punpcklqdq(xmm0, xmm2);
641 movdqu(xword[B], xmm0);
642 movd(xmm0, dword[A2-0x80]);
643 movd(xmm1, dword[A2+LDA*1-0x80]);
644 movd(xmm2, dword[A2+LDA*2-0x80]);
645 movd(xmm3, dword[A2+LDA3*1-0x80]);
646 lea(A2, ptr[A2+LDA*4]);
647 punpckldq(xmm0, xmm1);
648 punpckldq(xmm2, xmm3);
649 punpcklqdq(xmm0, xmm2);
650 movdqu(xword[B+0x10], xmm0);
651 movd(xmm0, dword[A2-0x80]);
652 movd(xmm1, dword[A2+LDA*1-0x80]);
653 movd(xmm2, dword[A2+LDA*2-0x80]);
654 movd(xmm3, dword[A2+LDA3*1-0x80]);
655 lea(A2, ptr[A2+LDA*4]);
656 punpckldq(xmm0, xmm1);
657 punpckldq(xmm2, xmm3);
658 punpcklqdq(xmm0, xmm2);
659 movdqu(xword[B+0x20], xmm0);
660 movd(xmm0, dword[A2-0x80]);
661 movd(xmm1, dword[A2+LDA*1-0x80]);
662 movd(xmm2, dword[A2+LDA*2-0x80]);
663 movd(xmm3, dword[A2+LDA3*1-0x80]);
664 lea(A2, ptr[A2+LDA*4]);
665 punpckldq(xmm0, xmm1);
666 punpckldq(xmm2, xmm3);
667 punpcklqdq(xmm0, xmm2);
668 movdqu(xword[B+0x30], xmm0);
676 mov(ax, word[A1-0x80]);
677 pinsrw(xmm0, eax, 0x0);
678 mov(ax, word[A1+LDA*1-0x80]);
679 pinsrw(xmm0, eax, 0x1);
680 mov(ax, word[A1+LDA*2-0x80]);
681 pinsrw(xmm0, eax, 0x2);
682 mov(ax, word[A1+LDA3*1-0x80]);
683 lea(A2, ptr[A1+LDA*4]);
684 pinsrw(xmm0, eax, 0x3);
685 mov(ax, word[A2-0x80]);
686 pinsrw(xmm0, eax, 0x4);
687 mov(ax, word[A2+LDA*1-0x80]);
688 pinsrw(xmm0, eax, 0x5);
689 mov(ax, word[A2+LDA*2-0x80]);
690 pinsrw(xmm0, eax, 0x6);
691 mov(ax, word[A2+LDA3*1-0x80]);
692 lea(A2, ptr[A2+LDA*4]);
693 pinsrw(xmm0, eax, 0x7);
694 movdqu(xword[B-0x80], xmm0);
695 mov(ax, word[A2-0x80]);
696 pinsrw(xmm0, eax, 0x0);
697 mov(ax, word[A2+LDA*1-0x80]);
698 pinsrw(xmm0, eax, 0x1);
699 mov(ax, word[A2+LDA*2-0x80]);
700 pinsrw(xmm0, eax, 0x2);
701 mov(ax, word[A2+LDA3*1-0x80]);
702 lea(A2, ptr[A2+LDA*4]);
703 pinsrw(xmm0, eax, 0x3);
704 mov(ax, word[A2-0x80]);
705 pinsrw(xmm0, eax, 0x4);
706 mov(ax, word[A2+LDA*1-0x80]);
707 pinsrw(xmm0, eax, 0x5);
708 mov(ax, word[A2+LDA*2-0x80]);
709 pinsrw(xmm0, eax, 0x6);
710 mov(ax, word[A2+LDA3*1-0x80]);
711 pinsrw(xmm0, eax, 0x7);
712 lea(A2, ptr[A2+LDA*4]);
713 movdqu(xword[B-0x70], xmm0);
714 mov(ax, word[A2-0x80]);
715 pinsrw(xmm0, eax, 0x0);
716 mov(ax, word[A2+LDA*1-0x80]);
717 pinsrw(xmm0, eax, 0x1);
718 mov(ax, word[A2+LDA*2-0x80]);
719 pinsrw(xmm0, eax, 0x2);
720 mov(ax, word[A2+LDA3*1-0x80]);
721 lea(A2, ptr[A2+LDA*4]);
722 pinsrw(xmm0, eax, 0x3);
723 mov(ax, word[A2-0x80]);
724 pinsrw(xmm0, eax, 0x4);
725 mov(ax, word[A2+LDA*1-0x80]);
726 pinsrw(xmm0, eax, 0x5);
727 mov(ax, word[A2+LDA*2-0x80]);
728 pinsrw(xmm0, eax, 0x6);
729 mov(ax, word[A2+LDA3*1-0x80]);
730 pinsrw(xmm0, eax, 0x7);
731 lea(A2, ptr[A2+LDA*4]);
732 movdqu(xword[B-0x60], xmm0);
733 mov(ax, word[A2-0x80]);
734 pinsrw(xmm0, eax, 0x0);
735 mov(ax, word[A2+LDA*1-0x80]);
736 pinsrw(xmm0, eax, 0x1);
737 mov(ax, word[A2+LDA*2-0x80]);
738 pinsrw(xmm0, eax, 0x2);
739 mov(ax, word[A2+LDA3*1-0x80]);
740 lea(A2, ptr[A2+LDA*4]);
741 pinsrw(xmm0, eax, 0x3);
742 mov(ax, word[A2-0x80]);
743 pinsrw(xmm0, eax, 0x4);
744 mov(ax, word[A2+LDA*1-0x80]);
745 pinsrw(xmm0, eax, 0x5);
746 mov(ax, word[A2+LDA*2-0x80]);
747 pinsrw(xmm0, eax, 0x6);
748 mov(ax, word[A2+LDA3*1-0x80]);
749 pinsrw(xmm0, eax, 0x7);
750 lea(A2, ptr[A2+LDA*4]);
751 movdqu(xword[B-0x50], xmm0);
752 mov(ax, word[A2-0x80]);
753 pinsrw(xmm0, eax, 0x0);
754 mov(ax, word[A2+LDA*1-0x80]);
755 pinsrw(xmm0, eax, 0x1);
756 mov(ax, word[A2+LDA*2-0x80]);
757 pinsrw(xmm0, eax, 0x2);
758 mov(ax, word[A2+LDA3*1-0x80]);
759 lea(A2, ptr[A2+LDA*4]);
760 pinsrw(xmm0, eax, 0x3);
761 mov(ax, word[A2-0x80]);
762 pinsrw(xmm0, eax, 0x4);
763 mov(ax, word[A2+LDA*1-0x80]);
764 pinsrw(xmm0, eax, 0x5);
765 mov(ax, word[A2+LDA*2-0x80]);
766 pinsrw(xmm0, eax, 0x6);
767 mov(ax, word[A2+LDA3*1-0x80]);
768 pinsrw(xmm0, eax, 0x7);
769 lea(A2, ptr[A2+LDA*4]);
770 movdqu(xword[B-0x40], xmm0);
771 mov(ax, word[A2-0x80]);
772 pinsrw(xmm0, eax, 0x0);
773 mov(ax, word[A2+LDA*1-0x80]);
774 pinsrw(xmm0, eax, 0x1);
775 mov(ax, word[A2+LDA*2-0x80]);
776 pinsrw(xmm0, eax, 0x2);
777 mov(ax, word[A2+LDA3*1-0x80]);
778 lea(A2, ptr[A2+LDA*4]);
779 pinsrw(xmm0, eax, 0x3);
780 mov(ax, word[A2-0x80]);
781 pinsrw(xmm0, eax, 0x4);
782 mov(ax, word[A2+LDA*1-0x80]);
783 pinsrw(xmm0, eax, 0x5);
784 mov(ax, word[A2+LDA*2-0x80]);
785 pinsrw(xmm0, eax, 0x6);
786 mov(ax, word[A2+LDA3*1-0x80]);
787 pinsrw(xmm0, eax, 0x7);
788 lea(A2, ptr[A2+LDA*4]);
789 movdqu(xword[B-0x30], xmm0);
797 mov(al, byte[A1-0x80]);
798 pinsrb(xmm0, eax, 0x0);
799 mov(al, byte[A1+LDA*1-0x80]);
800 pinsrb(xmm0, eax, 0x1);
801 mov(al, byte[A1+LDA*2-0x80]);
802 pinsrb(xmm0, eax, 0x2);
803 mov(al, byte[A1+LDA3*1-0x80]);
804 lea(A2, ptr[A1+LDA*4]);
805 pinsrb(xmm0, eax, 0x3);
806 mov(al, byte[A2-0x80]);
807 pinsrb(xmm0, eax, 0x4);
808 mov(al, byte[A2+LDA*1-0x80]);
809 pinsrb(xmm0, eax, 0x5);
810 mov(al, byte[A2+LDA*2-0x80]);
811 pinsrb(xmm0, eax, 0x6);
812 mov(al, byte[A2+LDA3*1-0x80]);
813 lea(A2, ptr[A2+LDA*4]);
814 pinsrb(xmm0, eax, 0x7);
815 mov(al, byte[A2-0x80]);
816 pinsrb(xmm0, eax, 0x8);
817 mov(al, byte[A2+LDA*1-0x80]);
818 pinsrb(xmm0, eax, 0x9);
819 mov(al, byte[A2+LDA*2-0x80]);
820 pinsrb(xmm0, eax, 0xa);
821 mov(al, byte[A2+LDA3*1-0x80]);
822 lea(A2, ptr[A2+LDA*4]);
823 pinsrb(xmm0, eax, 0xb);
824 mov(al, byte[A2-0x80]);
825 pinsrb(xmm0, eax, 0xc);
826 mov(al, byte[A2+LDA*1-0x80]);
827 pinsrb(xmm0, eax, 0xd);
828 mov(al, byte[A2+LDA*2-0x80]);
829 pinsrb(xmm0, eax, 0xe);
830 mov(al, byte[A2+LDA3*1-0x80]);
831 lea(A2, ptr[A2+LDA*4]);
832 pinsrb(xmm0, eax, 0xf);
833 movdqu(xword[B-0x80], xmm0);
834 mov(al, byte[A2-0x80]);
835 pinsrb(xmm0, eax, 0x0);
836 mov(al, byte[A2+LDA*1-0x80]);
837 pinsrb(xmm0, eax, 0x1);
838 mov(al, byte[A2+LDA*2-0x80]);
839 pinsrb(xmm0, eax, 0x2);
840 mov(al, byte[A2+LDA3*1-0x80]);
841 lea(A2, ptr[A2+LDA*4]);
842 pinsrb(xmm0, eax, 0x3);
843 mov(al, byte[A2-0x80]);
844 pinsrb(xmm0, eax, 0x4);
845 mov(al, byte[A2+LDA*1-0x80]);
846 pinsrb(xmm0, eax, 0x5);
847 mov(al, byte[A2+LDA*2-0x80]);
848 pinsrb(xmm0, eax, 0x6);
849 mov(al, byte[A2+LDA3*1-0x80]);
850 lea(A2, ptr[A2+LDA*4]);
851 pinsrb(xmm0, eax, 0x7);
852 mov(al, byte[A2-0x80]);
853 pinsrb(xmm0, eax, 0x8);
854 mov(al, byte[A2+LDA*1-0x80]);
855 pinsrb(xmm0, eax, 0x9);
856 mov(al, byte[A2+LDA*2-0x80]);
857 pinsrb(xmm0, eax, 0xa);
858 mov(al, byte[A2+LDA3*1-0x80]);
859 lea(A2, ptr[A2+LDA*4]);
860 pinsrb(xmm0, eax, 0xb);
861 mov(al, byte[A2-0x80]);
862 pinsrb(xmm0, eax, 0xc);
863 mov(al, byte[A2+LDA*1-0x80]);
864 pinsrb(xmm0, eax, 0xd);
865 mov(al, byte[A2+LDA*2-0x80]);
866 pinsrb(xmm0, eax, 0xe);
867 mov(al, byte[A2+LDA3*1-0x80]);
868 lea(A2, ptr[A2+LDA*4]);
869 pinsrb(xmm0, eax, 0xf);
870 movdqu(xword[B-0x70], xmm0);
871 mov(al, byte[A2-0x80]);
872 pinsrb(xmm0, eax, 0x0);
873 mov(al, byte[A2+LDA*1-0x80]);
874 pinsrb(xmm0, eax, 0x1);
875 mov(al, byte[A2+LDA*2-0x80]);
876 pinsrb(xmm0, eax, 0x2);
877 mov(al, byte[A2+LDA3*1-0x80]);
878 lea(A2, ptr[A2+LDA*4]);
879 pinsrb(xmm0, eax, 0x3);
880 mov(al, byte[A2-0x80]);
881 pinsrb(xmm0, eax, 0x4);
882 mov(al, byte[A2+LDA*1-0x80]);
883 pinsrb(xmm0, eax, 0x5);
884 mov(al, byte[A2+LDA*2-0x80]);
885 pinsrb(xmm0, eax, 0x6);
886 mov(al, byte[A2+LDA3*1-0x80]);
887 lea(A2, ptr[A2+LDA*4]);
888 pinsrb(xmm0, eax, 0x7);
889 mov(al, byte[A2-0x80]);
890 pinsrb(xmm0, eax, 0x8);
891 mov(al, byte[A2+LDA*1-0x80]);
892 pinsrb(xmm0, eax, 0x9);
893 mov(al, byte[A2+LDA*2-0x80]);
894 pinsrb(xmm0, eax, 0xa);
895 mov(al, byte[A2+LDA3*1-0x80]);
896 lea(A2, ptr[A2+LDA*4]);
897 pinsrb(xmm0, eax, 0xb);
898 mov(al, byte[A2-0x80]);
899 pinsrb(xmm0, eax, 0xc);
900 mov(al, byte[A2+LDA*1-0x80]);
901 pinsrb(xmm0, eax, 0xd);
902 mov(al, byte[A2+LDA*2-0x80]);
903 pinsrb(xmm0, eax, 0xe);
904 mov(al, byte[A2+LDA3*1-0x80]);
905 lea(A2, ptr[A2+LDA*4]);
906 pinsrb(xmm0, eax, 0xf);
907 movdqu(xword[B-0x60], xmm0);
933 movdqu(xmm0, xword[A1-0x80]);
934 movdqu(xmm1, xword[A1+LDA*1-0x80]);
935 movdqu(xmm2, xword[A1+LDA*2-0x80]);
936 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
937 lea(A2, ptr[A1+LDA*4]);
939 punpckldq(xmm0, xmm1);
940 punpckhdq(xmm4, xmm1);
942 punpckldq(xmm2, xmm3);
943 punpckhdq(xmm5, xmm3);
945 punpcklqdq(xmm0, xmm2);
946 punpckhqdq(xmm1, xmm2);
948 punpcklqdq(xmm4, xmm5);
949 punpckhqdq(xmm3, xmm5);
950 movdqu(xword[B-0x80], xmm0);
951 movdqu(xword[B], xmm1);
952 movdqu(xword[B+0x80], xmm4);
953 movdqu(xword[B+0x100], xmm3);
954 movdqu(xmm0, xword[A2-0x80]);
955 movdqu(xmm1, xword[A2+LDA*1-0x80]);
956 movdqu(xmm2, xword[A2+LDA*2-0x80]);
957 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
958 lea(A2, ptr[A2+LDA*4]);
960 punpckldq(xmm0, xmm1);
961 punpckhdq(xmm4, xmm1);
963 punpckldq(xmm2, xmm3);
964 punpckhdq(xmm5, xmm3);
966 punpcklqdq(xmm0, xmm2);
967 punpckhqdq(xmm1, xmm2);
969 punpcklqdq(xmm4, xmm5);
970 punpckhqdq(xmm3, xmm5);
971 movdqu(xword[B-0x70], xmm0);
972 movdqu(xword[B+0x10], xmm1);
973 movdqu(xword[B+0x90], xmm4);
974 movdqu(xword[B+0x110], xmm3);
975 movdqu(xmm0, xword[A2-0x80]);
976 movdqu(xmm1, xword[A2+LDA*1-0x80]);
977 movdqu(xmm2, xword[A2+LDA*2-0x80]);
978 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
979 lea(A2, ptr[A2+LDA*4]);
981 punpckldq(xmm0, xmm1);
982 punpckhdq(xmm4, xmm1);
984 punpckldq(xmm2, xmm3);
985 punpckhdq(xmm5, xmm3);
987 punpcklqdq(xmm0, xmm2);
988 punpckhqdq(xmm1, xmm2);
990 punpcklqdq(xmm4, xmm5);
991 punpckhqdq(xmm3, xmm5);
992 movdqu(xword[B-0x60], xmm0);
993 movdqu(xword[B+0x20], xmm1);
994 movdqu(xword[B+0xa0], xmm4);
995 movdqu(xword[B+0x120], xmm3);
996 movdqu(xmm0, xword[A2-0x80]);
997 movdqu(xmm1, xword[A2+LDA*1-0x80]);
998 movdqu(xmm2, xword[A2+LDA*2-0x80]);
999 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1000 lea(A2, ptr[A2+LDA*4]);
1002 punpckldq(xmm0, xmm1);
1003 punpckhdq(xmm4, xmm1);
1005 punpckldq(xmm2, xmm3);
1006 punpckhdq(xmm5, xmm3);
1008 punpcklqdq(xmm0, xmm2);
1009 punpckhqdq(xmm1, xmm2);
1011 punpcklqdq(xmm4, xmm5);
1012 punpckhqdq(xmm3, xmm5);
1013 movdqu(xword[B-0x50], xmm0);
1014 movdqu(xword[B+0x30], xmm1);
1015 movdqu(xword[B+0xb0], xmm4);
1016 movdqu(xword[B+0x130], xmm3);
1017 movdqu(xmm0, xword[A2-0x80]);
1018 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1019 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1020 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1021 lea(A2, ptr[A2+LDA*4]);
1023 punpckldq(xmm0, xmm1);
1024 punpckhdq(xmm4, xmm1);
1026 punpckldq(xmm2, xmm3);
1027 punpckhdq(xmm5, xmm3);
1029 punpcklqdq(xmm0, xmm2);
1030 punpckhqdq(xmm1, xmm2);
1032 punpcklqdq(xmm4, xmm5);
1033 punpckhqdq(xmm3, xmm5);
1034 movdqu(xword[B-0x40], xmm0);
1035 movdqu(xword[B+0x40], xmm1);
1036 movdqu(xword[B+0xc0], xmm4);
1037 movdqu(xword[B+0x140], xmm3);
1038 movdqu(xmm0, xword[A2-0x80]);
1039 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1040 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1041 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1042 lea(A2, ptr[A2+LDA*4]);
1044 punpckldq(xmm0, xmm1);
1045 punpckhdq(xmm4, xmm1);
1047 punpckldq(xmm2, xmm3);
1048 punpckhdq(xmm5, xmm3);
1050 punpcklqdq(xmm0, xmm2);
1051 punpckhqdq(xmm1, xmm2);
1053 punpcklqdq(xmm4, xmm5);
1054 punpckhqdq(xmm3, xmm5);
1055 movdqu(xword[B-0x30], xmm0);
1056 movdqu(xword[B+0x50], xmm1);
1057 movdqu(xword[B+0xd0], xmm4);
1058 movdqu(xword[B+0x150], xmm3);
1059 movdqu(xmm0, xword[A2-0x80]);
1060 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1061 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1062 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1063 lea(A2, ptr[A2+LDA*4]);
1065 punpckldq(xmm0, xmm1);
1066 punpckhdq(xmm4, xmm1);
1068 punpckldq(xmm2, xmm3);
1069 punpckhdq(xmm5, xmm3);
1071 punpcklqdq(xmm0, xmm2);
1072 punpckhqdq(xmm1, xmm2);
1074 punpcklqdq(xmm4, xmm5);
1075 punpckhqdq(xmm3, xmm5);
1076 movdqu(xword[B-0x20], xmm0);
1077 movdqu(xword[B+0x60], xmm1);
1078 movdqu(xword[B+0xe0], xmm4);
1079 movdqu(xword[B+0x160], xmm3);
1080 movdqu(xmm0, xword[A2-0x80]);
1081 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1082 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1083 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1084 lea(A2, ptr[A2+LDA*4]);
1086 punpckldq(xmm0, xmm1);
1087 punpckhdq(xmm4, xmm1);
1089 punpckldq(xmm2, xmm3);
1090 punpckhdq(xmm5, xmm3);
1092 punpcklqdq(xmm0, xmm2);
1093 punpckhqdq(xmm1, xmm2);
1095 punpcklqdq(xmm4, xmm5);
1096 punpckhqdq(xmm3, xmm5);
1097 movdqu(xword[B-0x10], xmm0);
1098 movdqu(xword[B+0x70], xmm1);
1099 movdqu(xword[B+0xf0], xmm4);
1100 movdqu(xword[B+0x170], xmm3);
1110 movq(xmm0, qword[A1-0x80]);
1111 movq(xmm1, qword[A1+LDA*1-0x80]);
1112 movq(xmm2, qword[A1+LDA*2-0x80]);
1113 movq(xmm3, qword[A1+LDA3*1-0x80]);
1114 lea(A2, ptr[A1+LDA*4]);
1115 punpckldq(xmm0, xmm1);
1116 punpckldq(xmm2, xmm3);
1118 punpcklqdq(xmm0, xmm2);
1119 punpckhqdq(xmm1, xmm2);
1120 movdqu(xword[B-0x80], xmm0);
1121 movdqu(xword[B], xmm1);
1122 movq(xmm0, qword[A2-0x80]);
1123 movq(xmm1, qword[A2+LDA*1-0x80]);
1124 movq(xmm2, qword[A2+LDA*2-0x80]);
1125 movq(xmm3, qword[A2+LDA3*1-0x80]);
1126 lea(A2, ptr[A2+LDA*4]);
1127 punpckldq(xmm0, xmm1);
1128 punpckldq(xmm2, xmm3);
1130 punpcklqdq(xmm0, xmm2);
1131 punpckhqdq(xmm1, xmm2);
1132 movdqu(xword[B-0x70], xmm0);
1133 movdqu(xword[B+0x10], xmm1);
1134 movq(xmm0, qword[A2-0x80]);
1135 movq(xmm1, qword[A2+LDA*1-0x80]);
1136 movq(xmm2, qword[A2+LDA*2-0x80]);
1137 movq(xmm3, qword[A2+LDA3*1-0x80]);
1138 lea(A2, ptr[A2+LDA*4]);
1139 punpckldq(xmm0, xmm1);
1140 punpckldq(xmm2, xmm3);
1142 punpcklqdq(xmm0, xmm2);
1143 punpckhqdq(xmm1, xmm2);
1144 movdqu(xword[B-0x60], xmm0);
1145 movdqu(xword[B+0x20], xmm1);
1146 movq(xmm0, qword[A2-0x80]);
1147 movq(xmm1, qword[A2+LDA*1-0x80]);
1148 movq(xmm2, qword[A2+LDA*2-0x80]);
1149 movq(xmm3, qword[A2+LDA3*1-0x80]);
1150 lea(A2, ptr[A2+LDA*4]);
1151 punpckldq(xmm0, xmm1);
1152 punpckldq(xmm2, xmm3);
1154 punpcklqdq(xmm0, xmm2);
1155 punpckhqdq(xmm1, xmm2);
1156 movdqu(xword[B-0x50], xmm0);
1157 movdqu(xword[B+0x30], xmm1);
1158 movq(xmm0, qword[A2-0x80]);
1159 movq(xmm1, qword[A2+LDA*1-0x80]);
1160 movq(xmm2, qword[A2+LDA*2-0x80]);
1161 movq(xmm3, qword[A2+LDA3*1-0x80]);
1162 lea(A2, ptr[A2+LDA*4]);
1163 punpckldq(xmm0, xmm1);
1164 punpckldq(xmm2, xmm3);
1166 punpcklqdq(xmm0, xmm2);
1167 punpckhqdq(xmm1, xmm2);
1168 movdqu(xword[B-0x40], xmm0);
1169 movdqu(xword[B+0x40], xmm1);
1170 movq(xmm0, qword[A2-0x80]);
1171 movq(xmm1, qword[A2+LDA*1-0x80]);
1172 movq(xmm2, qword[A2+LDA*2-0x80]);
1173 movq(xmm3, qword[A2+LDA3*1-0x80]);
1174 lea(A2, ptr[A2+LDA*4]);
1175 punpckldq(xmm0, xmm1);
1176 punpckldq(xmm2, xmm3);
1178 punpcklqdq(xmm0, xmm2);
1179 punpckhqdq(xmm1, xmm2);
1180 movdqu(xword[B-0x30], xmm0);
1181 movdqu(xword[B+0x50], xmm1);
1182 movq(xmm0, qword[A2-0x80]);
1183 movq(xmm1, qword[A2+LDA*1-0x80]);
1184 movq(xmm2, qword[A2+LDA*2-0x80]);
1185 movq(xmm3, qword[A2+LDA3*1-0x80]);
1186 lea(A2, ptr[A2+LDA*4]);
1187 punpckldq(xmm0, xmm1);
1188 punpckldq(xmm2, xmm3);
1190 punpcklqdq(xmm0, xmm2);
1191 punpckhqdq(xmm1, xmm2);
1192 movdqu(xword[B-0x20], xmm0);
1193 movdqu(xword[B+0x60], xmm1);
1194 movq(xmm0, qword[A2-0x80]);
1195 movq(xmm1, qword[A2+LDA*1-0x80]);
1196 movq(xmm2, qword[A2+LDA*2-0x80]);
1197 movq(xmm3, qword[A2+LDA3*1-0x80]);
1198 punpckldq(xmm0, xmm1);
1199 punpckldq(xmm2, xmm3);
1201 punpcklqdq(xmm0, xmm2);
1202 punpckhqdq(xmm1, xmm2);
1203 movdqu(xword[B-0x10], xmm0);
1204 movdqu(xword[B+0x70], xmm1);
1212 movd(xmm0, dword[A1-0x80]);
1213 movd(xmm1, dword[A1+LDA*1-0x80]);
1214 movd(xmm2, dword[A1+LDA*2-0x80]);
1215 movd(xmm3, dword[A1+LDA3*1-0x80]);
1216 lea(A2, ptr[A1+LDA*4]);
1217 punpckldq(xmm0, xmm1);
1218 punpckldq(xmm2, xmm3);
1219 punpcklqdq(xmm0, xmm2);
1220 movdqu(xword[B-0x80], xmm0);
1221 movd(xmm0, dword[A2-0x80]);
1222 movd(xmm1, dword[A2+LDA*1-0x80]);
1223 movd(xmm2, dword[A2+LDA*2-0x80]);
1224 movd(xmm3, dword[A2+LDA3*1-0x80]);
1225 lea(A2, ptr[A2+LDA*4]);
1226 punpckldq(xmm0, xmm1);
1227 punpckldq(xmm2, xmm3);
1228 punpcklqdq(xmm0, xmm2);
1229 movdqu(xword[B-0x70], xmm0);
1230 movd(xmm0, dword[A2-0x80]);
1231 movd(xmm1, dword[A2+LDA*1-0x80]);
1232 movd(xmm2, dword[A2+LDA*2-0x80]);
1233 movd(xmm3, dword[A2+LDA3*1-0x80]);
1234 lea(A2, ptr[A2+LDA*4]);
1235 punpckldq(xmm0, xmm1);
1236 punpckldq(xmm2, xmm3);
1237 punpcklqdq(xmm0, xmm2);
1238 movdqu(xword[B-0x60], xmm0);
1239 movd(xmm0, dword[A2-0x80]);
1240 movd(xmm1, dword[A2+LDA*1-0x80]);
1241 movd(xmm2, dword[A2+LDA*2-0x80]);
1242 movd(xmm3, dword[A2+LDA3*1-0x80]);
1243 lea(A2, ptr[A2+LDA*4]);
1244 punpckldq(xmm0, xmm1);
1245 punpckldq(xmm2, xmm3);
1246 punpcklqdq(xmm0, xmm2);
1247 movdqu(xword[B-0x50], xmm0);
1248 movd(xmm0, dword[A2-0x80]);
1249 movd(xmm1, dword[A2+LDA*1-0x80]);
1250 movd(xmm2, dword[A2+LDA*2-0x80]);
1251 movd(xmm3, dword[A2+LDA3*1-0x80]);
1252 lea(A2, ptr[A2+LDA*4]);
1253 punpckldq(xmm0, xmm1);
1254 punpckldq(xmm2, xmm3);
1255 punpcklqdq(xmm0, xmm2);
1256 movdqu(xword[B-0x40], xmm0);
1257 movd(xmm0, dword[A2-0x80]);
1258 movd(xmm1, dword[A2+LDA*1-0x80]);
1259 movd(xmm2, dword[A2+LDA*2-0x80]);
1260 movd(xmm3, dword[A2+LDA3*1-0x80]);
1261 lea(A2, ptr[A2+LDA*4]);
1262 punpckldq(xmm0, xmm1);
1263 punpckldq(xmm2, xmm3);
1264 punpcklqdq(xmm0, xmm2);
1265 movdqu(xword[B-0x30], xmm0);
1266 movd(xmm0, dword[A2-0x80]);
1267 movd(xmm1, dword[A2+LDA*1-0x80]);
1268 movd(xmm2, dword[A2+LDA*2-0x80]);
1269 movd(xmm3, dword[A2+LDA3*1-0x80]);
1270 lea(A2, ptr[A2+LDA*4]);
1271 punpckldq(xmm0, xmm1);
1272 punpckldq(xmm2, xmm3);
1273 punpcklqdq(xmm0, xmm2);
1274 movdqu(xword[B-0x20], xmm0);
1275 movd(xmm0, dword[A2-0x80]);
1276 movd(xmm1, dword[A2+LDA*1-0x80]);
1277 movd(xmm2, dword[A2+LDA*2-0x80]);
1278 movd(xmm3, dword[A2+LDA3*1-0x80]);
1279 lea(A2, ptr[A2+LDA*4]);
1280 punpckldq(xmm0, xmm1);
1281 punpckldq(xmm2, xmm3);
1282 punpcklqdq(xmm0, xmm2);
1283 movdqu(xword[B-0x10], xmm0);
// 2-byte-wide path over 32 lines spaced LDA apart.  One 16-bit word is read
// from each line through ax and inserted into successive word lanes of xmm0;
// after every 8 lines the register is stored to B (B-0x80, B-0x70, B-0x60,
// B-0x50).  mov(ax, ...) writes only the low 16 bits of eax, which is the
// part pinsrw takes from its 32-bit source operand.
// A2 starts at A1+4*LDA and is advanced by 4*LDA after every fourth line.
// Lines 0..7 -> B-0x80.
1291 mov(ax, word[A1-0x80]);
1292 pinsrw(xmm0, eax, 0x0);
1293 mov(ax, word[A1+LDA*1-0x80]);
1294 pinsrw(xmm0, eax, 0x1);
1295 mov(ax, word[A1+LDA*2-0x80]);
1296 pinsrw(xmm0, eax, 0x2);
1297 mov(ax, word[A1+LDA3*1-0x80]);
1298 lea(A2, ptr[A1+LDA*4]);
1299 pinsrw(xmm0, eax, 0x3);
1300 mov(ax, word[A2-0x80]);
1301 pinsrw(xmm0, eax, 0x4);
1302 mov(ax, word[A2+LDA*1-0x80]);
1303 pinsrw(xmm0, eax, 0x5);
1304 mov(ax, word[A2+LDA*2-0x80]);
1305 pinsrw(xmm0, eax, 0x6);
1306 mov(ax, word[A2+LDA3*1-0x80]);
1307 lea(A2, ptr[A2+LDA*4]);
1308 pinsrw(xmm0, eax, 0x7);
1309 movdqu(xword[B-0x80], xmm0);
// Lines 8..15 -> B-0x70.
1310 mov(ax, word[A2-0x80]);
1311 pinsrw(xmm0, eax, 0x0);
1312 mov(ax, word[A2+LDA*1-0x80]);
1313 pinsrw(xmm0, eax, 0x1);
1314 mov(ax, word[A2+LDA*2-0x80]);
1315 pinsrw(xmm0, eax, 0x2);
1316 mov(ax, word[A2+LDA3*1-0x80]);
1317 lea(A2, ptr[A2+LDA*4]);
1318 pinsrw(xmm0, eax, 0x3);
1319 mov(ax, word[A2-0x80]);
1320 pinsrw(xmm0, eax, 0x4);
1321 mov(ax, word[A2+LDA*1-0x80]);
1322 pinsrw(xmm0, eax, 0x5);
1323 mov(ax, word[A2+LDA*2-0x80]);
1324 pinsrw(xmm0, eax, 0x6);
1325 mov(ax, word[A2+LDA3*1-0x80]);
1326 pinsrw(xmm0, eax, 0x7);
1327 lea(A2, ptr[A2+LDA*4]);
1328 movdqu(xword[B-0x70], xmm0);
// Lines 16..23 -> B-0x60.
1329 mov(ax, word[A2-0x80]);
1330 pinsrw(xmm0, eax, 0x0);
1331 mov(ax, word[A2+LDA*1-0x80]);
1332 pinsrw(xmm0, eax, 0x1);
1333 mov(ax, word[A2+LDA*2-0x80]);
1334 pinsrw(xmm0, eax, 0x2);
1335 mov(ax, word[A2+LDA3*1-0x80]);
1336 lea(A2, ptr[A2+LDA*4]);
1337 pinsrw(xmm0, eax, 0x3);
1338 mov(ax, word[A2-0x80]);
1339 pinsrw(xmm0, eax, 0x4);
1340 mov(ax, word[A2+LDA*1-0x80]);
1341 pinsrw(xmm0, eax, 0x5);
1342 mov(ax, word[A2+LDA*2-0x80]);
1343 pinsrw(xmm0, eax, 0x6);
1344 mov(ax, word[A2+LDA3*1-0x80]);
1345 pinsrw(xmm0, eax, 0x7);
1346 lea(A2, ptr[A2+LDA*4]);
1347 movdqu(xword[B-0x60], xmm0);
// Lines 24..31 -> B-0x50.
1348 mov(ax, word[A2-0x80]);
1349 pinsrw(xmm0, eax, 0x0);
1350 mov(ax, word[A2+LDA*1-0x80]);
1351 pinsrw(xmm0, eax, 0x1);
1352 mov(ax, word[A2+LDA*2-0x80]);
1353 pinsrw(xmm0, eax, 0x2);
1354 mov(ax, word[A2+LDA3*1-0x80]);
1355 lea(A2, ptr[A2+LDA*4]);
1356 pinsrw(xmm0, eax, 0x3);
1357 mov(ax, word[A2-0x80]);
1358 pinsrw(xmm0, eax, 0x4);
1359 mov(ax, word[A2+LDA*1-0x80]);
1360 pinsrw(xmm0, eax, 0x5);
1361 mov(ax, word[A2+LDA*2-0x80]);
1362 pinsrw(xmm0, eax, 0x6);
1363 mov(ax, word[A2+LDA3*1-0x80]);
1364 pinsrw(xmm0, eax, 0x7);
1365 lea(A2, ptr[A2+LDA*4]);
1366 movdqu(xword[B-0x50], xmm0);
// 1-byte-wide path over 32 lines spaced LDA apart.  One byte is read from
// each line through al and inserted into successive byte lanes of xmm0 with
// pinsrb (which uses only the low byte of its 32-bit source); after every
// 16 lines the register is stored to B (B-0x80, then B-0x70).
// A2 starts at A1+4*LDA and is advanced by 4*LDA after every fourth line.
// Lines 0..15 -> B-0x80.
1374 mov(al, byte[A1-0x80]);
1375 pinsrb(xmm0, eax, 0x0);
1376 mov(al, byte[A1+LDA*1-0x80]);
1377 pinsrb(xmm0, eax, 0x1);
1378 mov(al, byte[A1+LDA*2-0x80]);
1379 pinsrb(xmm0, eax, 0x2);
1380 mov(al, byte[A1+LDA3*1-0x80]);
1381 lea(A2, ptr[A1+LDA*4]);
1382 pinsrb(xmm0, eax, 0x3);
1383 mov(al, byte[A2-0x80]);
1384 pinsrb(xmm0, eax, 0x4);
1385 mov(al, byte[A2+LDA*1-0x80]);
1386 pinsrb(xmm0, eax, 0x5);
1387 mov(al, byte[A2+LDA*2-0x80]);
1388 pinsrb(xmm0, eax, 0x6);
1389 mov(al, byte[A2+LDA3*1-0x80]);
1390 lea(A2, ptr[A2+LDA*4]);
1391 pinsrb(xmm0, eax, 0x7);
1392 mov(al, byte[A2-0x80]);
1393 pinsrb(xmm0, eax, 0x8);
1394 mov(al, byte[A2+LDA*1-0x80]);
1395 pinsrb(xmm0, eax, 0x9);
1396 mov(al, byte[A2+LDA*2-0x80]);
1397 pinsrb(xmm0, eax, 0xa);
1398 mov(al, byte[A2+LDA3*1-0x80]);
1399 lea(A2, ptr[A2+LDA*4]);
1400 pinsrb(xmm0, eax, 0xb);
1401 mov(al, byte[A2-0x80]);
1402 pinsrb(xmm0, eax, 0xc);
1403 mov(al, byte[A2+LDA*1-0x80]);
1404 pinsrb(xmm0, eax, 0xd);
1405 mov(al, byte[A2+LDA*2-0x80]);
1406 pinsrb(xmm0, eax, 0xe);
1407 mov(al, byte[A2+LDA3*1-0x80]);
1408 lea(A2, ptr[A2+LDA*4]);
1409 pinsrb(xmm0, eax, 0xf);
1410 movdqu(xword[B-0x80], xmm0);
// Lines 16..31 -> B-0x70.
1411 mov(al, byte[A2-0x80]);
1412 pinsrb(xmm0, eax, 0x0);
1413 mov(al, byte[A2+LDA*1-0x80]);
1414 pinsrb(xmm0, eax, 0x1);
1415 mov(al, byte[A2+LDA*2-0x80]);
1416 pinsrb(xmm0, eax, 0x2);
1417 mov(al, byte[A2+LDA3*1-0x80]);
1418 lea(A2, ptr[A2+LDA*4]);
1419 pinsrb(xmm0, eax, 0x3);
1420 mov(al, byte[A2-0x80]);
1421 pinsrb(xmm0, eax, 0x4);
1422 mov(al, byte[A2+LDA*1-0x80]);
1423 pinsrb(xmm0, eax, 0x5);
1424 mov(al, byte[A2+LDA*2-0x80]);
1425 pinsrb(xmm0, eax, 0x6);
1426 mov(al, byte[A2+LDA3*1-0x80]);
1427 lea(A2, ptr[A2+LDA*4]);
1428 pinsrb(xmm0, eax, 0x7);
1429 mov(al, byte[A2-0x80]);
1430 pinsrb(xmm0, eax, 0x8);
1431 mov(al, byte[A2+LDA*1-0x80]);
1432 pinsrb(xmm0, eax, 0x9);
1433 mov(al, byte[A2+LDA*2-0x80]);
1434 pinsrb(xmm0, eax, 0xa);
1435 mov(al, byte[A2+LDA3*1-0x80]);
1436 lea(A2, ptr[A2+LDA*4]);
1437 pinsrb(xmm0, eax, 0xb);
1438 mov(al, byte[A2-0x80]);
1439 pinsrb(xmm0, eax, 0xc);
1440 mov(al, byte[A2+LDA*1-0x80]);
1441 pinsrb(xmm0, eax, 0xd);
1442 mov(al, byte[A2+LDA*2-0x80]);
1443 pinsrb(xmm0, eax, 0xe);
1444 mov(al, byte[A2+LDA3*1-0x80]);
1445 lea(A2, ptr[A2+LDA*4]);
1446 pinsrb(xmm0, eax, 0xf);
1447 movdqu(xword[B-0x70], xmm0);
// 16-byte-wide path over 16 lines spaced LDA apart, handled as four groups
// of four lines.  Each group loads 16 bytes per line into xmm0..xmm3, runs
// them through dword and qword unpacks, and writes four 16-byte results to B
// at a 0x40 stride; each following group shifts the destinations by 0x10.
// A2 starts at A1+4*LDA and is advanced by 4*LDA after each later group.
// NOTE(review): within this sequence xmm4 and xmm5 are read by punpckhdq /
// punpcklqdq without being written first, and xmm1 / xmm3 still hold raw line
// data when punpckhqdq consumes them.  A 4x4 dword transpose would need
// copies (xmm4 <- xmm0, xmm5 <- xmm2, xmm1 <- xmm0, xmm3 <- xmm4) ahead of the
// respective unpacks -- confirm those moves are present.
// Group 0: lines 0..3 -> B-0x80, B-0x40, B, B+0x40.
1473 movdqu(xmm0, xword[A1-0x80]);
1474 movdqu(xmm1, xword[A1+LDA*1-0x80]);
1475 movdqu(xmm2, xword[A1+LDA*2-0x80]);
1476 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
1477 lea(A2, ptr[A1+LDA*4]);
1479 punpckldq(xmm0, xmm1);
1480 punpckhdq(xmm4, xmm1);
1482 punpckldq(xmm2, xmm3);
1483 punpckhdq(xmm5, xmm3);
1485 punpcklqdq(xmm0, xmm2);
1486 punpckhqdq(xmm1, xmm2);
1488 punpcklqdq(xmm4, xmm5);
1489 punpckhqdq(xmm3, xmm5);
1490 movdqu(xword[B-0x80], xmm0);
1491 movdqu(xword[B-0x40], xmm1);
1492 movdqu(xword[B], xmm4);
1493 movdqu(xword[B+0x40], xmm3);
// Group 1: lines 4..7 -> B-0x70, B-0x30, B+0x10, B+0x50.
1494 movdqu(xmm0, xword[A2-0x80]);
1495 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1496 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1497 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1498 lea(A2, ptr[A2+LDA*4]);
1500 punpckldq(xmm0, xmm1);
1501 punpckhdq(xmm4, xmm1);
1503 punpckldq(xmm2, xmm3);
1504 punpckhdq(xmm5, xmm3);
1506 punpcklqdq(xmm0, xmm2);
1507 punpckhqdq(xmm1, xmm2);
1509 punpcklqdq(xmm4, xmm5);
1510 punpckhqdq(xmm3, xmm5);
1511 movdqu(xword[B-0x70], xmm0);
1512 movdqu(xword[B-0x30], xmm1);
1513 movdqu(xword[B+0x10], xmm4);
1514 movdqu(xword[B+0x50], xmm3);
// Group 2: lines 8..11 -> B-0x60, B-0x20, B+0x20, B+0x60.
1515 movdqu(xmm0, xword[A2-0x80]);
1516 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1517 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1518 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1519 lea(A2, ptr[A2+LDA*4]);
1521 punpckldq(xmm0, xmm1);
1522 punpckhdq(xmm4, xmm1);
1524 punpckldq(xmm2, xmm3);
1525 punpckhdq(xmm5, xmm3);
1527 punpcklqdq(xmm0, xmm2);
1528 punpckhqdq(xmm1, xmm2);
1530 punpcklqdq(xmm4, xmm5);
1531 punpckhqdq(xmm3, xmm5);
1532 movdqu(xword[B-0x60], xmm0);
1533 movdqu(xword[B-0x20], xmm1);
1534 movdqu(xword[B+0x20], xmm4);
1535 movdqu(xword[B+0x60], xmm3);
// Group 3: lines 12..15 -> B-0x50, B-0x10, B+0x30, B+0x70.
1536 movdqu(xmm0, xword[A2-0x80]);
1537 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1538 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1539 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1540 lea(A2, ptr[A2+LDA*4]);
1542 punpckldq(xmm0, xmm1);
1543 punpckhdq(xmm4, xmm1);
1545 punpckldq(xmm2, xmm3);
1546 punpckhdq(xmm5, xmm3);
1548 punpcklqdq(xmm0, xmm2);
1549 punpckhqdq(xmm1, xmm2);
1551 punpcklqdq(xmm4, xmm5);
1552 punpckhqdq(xmm3, xmm5);
1553 movdqu(xword[B-0x50], xmm0);
1554 movdqu(xword[B-0x10], xmm1);
1555 movdqu(xword[B+0x30], xmm4);
1556 movdqu(xword[B+0x70], xmm3);
// 8-byte-wide path over 16 lines spaced LDA apart, handled as four groups of
// four lines.  Each group loads 8 bytes per line (movq zeroes the upper
// qword), interleaves dwords pairwise, then stores xmm0 and xmm1 to B 0x40
// bytes apart; each following group shifts both destinations by 0x10.
// A2 starts at A1+4*LDA and is advanced by 4*LDA after groups 1 and 2; the
// final group does not advance it.
// NOTE(review): within this sequence xmm1 still holds the raw second-line
// load when punpckhqdq consumes it.  If the second store is meant to be the
// high half of the interleaved data, xmm1 needs a copy of xmm0 taken before
// punpcklqdq -- confirm that move is present.
// Group 0: lines 0..3 -> B-0x80, B-0x40.
1566 movq(xmm0, qword[A1-0x80]);
1567 movq(xmm1, qword[A1+LDA*1-0x80]);
1568 movq(xmm2, qword[A1+LDA*2-0x80]);
1569 movq(xmm3, qword[A1+LDA3*1-0x80]);
1570 lea(A2, ptr[A1+LDA*4]);
1571 punpckldq(xmm0, xmm1);
1572 punpckldq(xmm2, xmm3);
1574 punpcklqdq(xmm0, xmm2);
1575 punpckhqdq(xmm1, xmm2);
1576 movdqu(xword[B-0x80], xmm0);
1577 movdqu(xword[B-0x40], xmm1);
// Group 1: lines 4..7 -> B-0x70, B-0x30.
1578 movq(xmm0, qword[A2-0x80]);
1579 movq(xmm1, qword[A2+LDA*1-0x80]);
1580 movq(xmm2, qword[A2+LDA*2-0x80]);
1581 movq(xmm3, qword[A2+LDA3*1-0x80]);
1582 lea(A2, ptr[A2+LDA*4]);
1583 punpckldq(xmm0, xmm1);
1584 punpckldq(xmm2, xmm3);
1586 punpcklqdq(xmm0, xmm2);
1587 punpckhqdq(xmm1, xmm2);
1588 movdqu(xword[B-0x70], xmm0);
1589 movdqu(xword[B-0x30], xmm1);
// Group 2: lines 8..11 -> B-0x60, B-0x20.
1590 movq(xmm0, qword[A2-0x80]);
1591 movq(xmm1, qword[A2+LDA*1-0x80]);
1592 movq(xmm2, qword[A2+LDA*2-0x80]);
1593 movq(xmm3, qword[A2+LDA3*1-0x80]);
1594 lea(A2, ptr[A2+LDA*4]);
1595 punpckldq(xmm0, xmm1);
1596 punpckldq(xmm2, xmm3);
1598 punpcklqdq(xmm0, xmm2);
1599 punpckhqdq(xmm1, xmm2);
1600 movdqu(xword[B-0x60], xmm0);
1601 movdqu(xword[B-0x20], xmm1);
// Group 3: lines 12..15 -> B-0x50, B-0x10.
1602 movq(xmm0, qword[A2-0x80]);
1603 movq(xmm1, qword[A2+LDA*1-0x80]);
1604 movq(xmm2, qword[A2+LDA*2-0x80]);
1605 movq(xmm3, qword[A2+LDA3*1-0x80]);
1606 punpckldq(xmm0, xmm1);
1607 punpckldq(xmm2, xmm3);
1609 punpcklqdq(xmm0, xmm2);
1610 punpckhqdq(xmm1, xmm2);
1611 movdqu(xword[B-0x50], xmm0);
1612 movdqu(xword[B-0x10], xmm1);
// 4-byte-wide path over 16 lines spaced LDA apart, handled as four groups of
// four lines.  Each group loads one dword per line (movd zeroes the upper
// lanes), packs the four dwords into xmm0 in line order and stores the
// 16 bytes to B; the destinations are B-0x80, B-0x70, B-0x60, B-0x50.
// A2 starts at A1+4*LDA and is advanced by 4*LDA after every group.
// Group 0: lines 0..3 -> B-0x80.
1620 movd(xmm0, dword[A1-0x80]);
1621 movd(xmm1, dword[A1+LDA*1-0x80]);
1622 movd(xmm2, dword[A1+LDA*2-0x80]);
1623 movd(xmm3, dword[A1+LDA3*1-0x80]);
1624 lea(A2, ptr[A1+LDA*4]);
1625 punpckldq(xmm0, xmm1);
1626 punpckldq(xmm2, xmm3);
1627 punpcklqdq(xmm0, xmm2);
1628 movdqu(xword[B-0x80], xmm0);
// Group 1: lines 4..7 -> B-0x70.
1629 movd(xmm0, dword[A2-0x80]);
1630 movd(xmm1, dword[A2+LDA*1-0x80]);
1631 movd(xmm2, dword[A2+LDA*2-0x80]);
1632 movd(xmm3, dword[A2+LDA3*1-0x80]);
1633 lea(A2, ptr[A2+LDA*4]);
1634 punpckldq(xmm0, xmm1);
1635 punpckldq(xmm2, xmm3);
1636 punpcklqdq(xmm0, xmm2);
1637 movdqu(xword[B-0x70], xmm0);
// Group 2: lines 8..11 -> B-0x60.
1638 movd(xmm0, dword[A2-0x80]);
1639 movd(xmm1, dword[A2+LDA*1-0x80]);
1640 movd(xmm2, dword[A2+LDA*2-0x80]);
1641 movd(xmm3, dword[A2+LDA3*1-0x80]);
1642 lea(A2, ptr[A2+LDA*4]);
1643 punpckldq(xmm0, xmm1);
1644 punpckldq(xmm2, xmm3);
1645 punpcklqdq(xmm0, xmm2);
1646 movdqu(xword[B-0x60], xmm0);
// Group 3: lines 12..15 -> B-0x50.
1647 movd(xmm0, dword[A2-0x80]);
1648 movd(xmm1, dword[A2+LDA*1-0x80]);
1649 movd(xmm2, dword[A2+LDA*2-0x80]);
1650 movd(xmm3, dword[A2+LDA3*1-0x80]);
1651 lea(A2, ptr[A2+LDA*4]);
1652 punpckldq(xmm0, xmm1);
1653 punpckldq(xmm2, xmm3);
1654 punpcklqdq(xmm0, xmm2);
1655 movdqu(xword[B-0x50], xmm0);
// 2-byte-wide path over 16 lines spaced LDA apart.  One 16-bit word per line
// is read through ax and inserted into word lanes 0..7 of xmm0 (pinsrw uses
// only the low 16 bits of eax); after every 8 lines the register is stored,
// first to B-0x80 and then to B-0x70.
// A2 starts at A1+4*LDA and is advanced by 4*LDA between four-line steps; the
// last step does not advance it.
// Lines 0..7 -> B-0x80.
1663 mov(ax, word[A1-0x80]);
1664 pinsrw(xmm0, eax, 0x0);
1665 mov(ax, word[A1+LDA*1-0x80]);
1666 pinsrw(xmm0, eax, 0x1);
1667 mov(ax, word[A1+LDA*2-0x80]);
1668 pinsrw(xmm0, eax, 0x2);
1669 mov(ax, word[A1+LDA3*1-0x80]);
1670 lea(A2, ptr[A1+LDA*4]);
1671 pinsrw(xmm0, eax, 0x3);
1672 mov(ax, word[A2-0x80]);
1673 pinsrw(xmm0, eax, 0x4);
1674 mov(ax, word[A2+LDA*1-0x80]);
1675 pinsrw(xmm0, eax, 0x5);
1676 mov(ax, word[A2+LDA*2-0x80]);
1677 pinsrw(xmm0, eax, 0x6);
1678 mov(ax, word[A2+LDA3*1-0x80]);
1679 lea(A2, ptr[A2+LDA*4]);
1680 pinsrw(xmm0, eax, 0x7);
1681 movdqu(xword[B-0x80], xmm0);
// Lines 8..15 -> B-0x70.
1682 mov(ax, word[A2-0x80]);
1683 pinsrw(xmm0, eax, 0x0);
1684 mov(ax, word[A2+LDA*1-0x80]);
1685 pinsrw(xmm0, eax, 0x1);
1686 mov(ax, word[A2+LDA*2-0x80]);
1687 pinsrw(xmm0, eax, 0x2);
1688 mov(ax, word[A2+LDA3*1-0x80]);
1689 lea(A2, ptr[A2+LDA*4]);
1690 pinsrw(xmm0, eax, 0x3);
1691 mov(ax, word[A2-0x80]);
1692 pinsrw(xmm0, eax, 0x4);
1693 mov(ax, word[A2+LDA*1-0x80]);
1694 pinsrw(xmm0, eax, 0x5);
1695 mov(ax, word[A2+LDA*2-0x80]);
1696 pinsrw(xmm0, eax, 0x6);
1697 mov(ax, word[A2+LDA3*1-0x80]);
1698 pinsrw(xmm0, eax, 0x7);
1699 movdqu(xword[B-0x70], xmm0);
// 1-byte-wide path over 16 lines spaced LDA apart.  One byte per line is read
// through al and inserted into byte lanes 0..15 of xmm0 (pinsrb uses only the
// low byte of eax); the filled register is stored once, to B-0x80.
// A2 starts at A1+4*LDA and is advanced by 4*LDA between four-line steps; the
// last step does not advance it.
1707 mov(al, byte[A1-0x80]);
1708 pinsrb(xmm0, eax, 0x0);
1709 mov(al, byte[A1+LDA*1-0x80]);
1710 pinsrb(xmm0, eax, 0x1);
1711 mov(al, byte[A1+LDA*2-0x80]);
1712 pinsrb(xmm0, eax, 0x2);
1713 mov(al, byte[A1+LDA3*1-0x80]);
1714 lea(A2, ptr[A1+LDA*4]);
1715 pinsrb(xmm0, eax, 0x3);
1716 mov(al, byte[A2-0x80]);
1717 pinsrb(xmm0, eax, 0x4);
1718 mov(al, byte[A2+LDA*1-0x80]);
1719 pinsrb(xmm0, eax, 0x5);
1720 mov(al, byte[A2+LDA*2-0x80]);
1721 pinsrb(xmm0, eax, 0x6);
1722 mov(al, byte[A2+LDA3*1-0x80]);
1723 lea(A2, ptr[A2+LDA*4]);
1724 pinsrb(xmm0, eax, 0x7);
1725 mov(al, byte[A2-0x80]);
1726 pinsrb(xmm0, eax, 0x8);
1727 mov(al, byte[A2+LDA*1-0x80]);
1728 pinsrb(xmm0, eax, 0x9);
1729 mov(al, byte[A2+LDA*2-0x80]);
1730 pinsrb(xmm0, eax, 0xa);
1731 mov(al, byte[A2+LDA3*1-0x80]);
1732 lea(A2, ptr[A2+LDA*4]);
1733 pinsrb(xmm0, eax, 0xb);
1734 mov(al, byte[A2-0x80]);
1735 pinsrb(xmm0, eax, 0xc);
1736 mov(al, byte[A2+LDA*1-0x80]);
1737 pinsrb(xmm0, eax, 0xd);
1738 mov(al, byte[A2+LDA*2-0x80]);
1739 pinsrb(xmm0, eax, 0xe);
1740 mov(al, byte[A2+LDA3*1-0x80]);
1741 pinsrb(xmm0, eax, 0xf);
1742 movdqu(xword[B-0x80], xmm0);
// Eight-line section.  A2 is fixed at A1+4*LDA so the eight lines are reached
// as A1+{0,1,2,3}*LDA and A2+{0,1,2,3}*LDA; I receives A1+8*LDA.
// NOTE(review): I is not read anywhere in this section; presumably it is the
// base of the following eight lines -- confirm against the loop control.
1759 lea(A2, ptr[A1+LDA*4]);
1760 lea(I, ptr[A1+LDA*8]);
// 16 bytes per line.  Lines 0..3 (via A1) store to B-0x80, B-0x60, B-0x40,
// B-0x20; lines 4..7 (via A2) store to B-0x70, B-0x50, B-0x30, B-0x10.
// NOTE(review): within this sequence xmm4 and xmm5 are read without being
// written first, and xmm1 / xmm3 hold raw line data when punpckhqdq consumes
// them; a dword transpose would need register copies ahead of the unpacks --
// confirm those moves are present.
1768 movdqu(xmm0, xword[A1-0x80]);
1769 movdqu(xmm1, xword[A1+LDA*1-0x80]);
1770 movdqu(xmm2, xword[A1+LDA*2-0x80]);
1771 movdqu(xmm3, xword[A1+LDA3*1-0x80]);
1774 punpckldq(xmm0, xmm1);
1775 punpckhdq(xmm4, xmm1);
1777 punpckldq(xmm2, xmm3);
1778 punpckhdq(xmm5, xmm3);
1780 punpcklqdq(xmm0, xmm2);
1781 punpckhqdq(xmm1, xmm2);
1783 punpcklqdq(xmm4, xmm5);
1784 punpckhqdq(xmm3, xmm5);
1785 movdqu(xword[B-0x80], xmm0);
1786 movdqu(xword[B-0x60], xmm1);
1787 movdqu(xword[B-0x40], xmm4);
1788 movdqu(xword[B-0x20], xmm3);
1789 movdqu(xmm0, xword[A2-0x80]);
1790 movdqu(xmm1, xword[A2+LDA*1-0x80]);
1791 movdqu(xmm2, xword[A2+LDA*2-0x80]);
1792 movdqu(xmm3, xword[A2+LDA3*1-0x80]);
1795 punpckldq(xmm0, xmm1);
1796 punpckhdq(xmm4, xmm1);
1798 punpckldq(xmm2, xmm3);
1799 punpckhdq(xmm5, xmm3);
1801 punpcklqdq(xmm0, xmm2);
1802 punpckhqdq(xmm1, xmm2);
1804 punpcklqdq(xmm4, xmm5);
1805 punpckhqdq(xmm3, xmm5);
1806 movdqu(xword[B-0x70], xmm0);
1807 movdqu(xword[B-0x50], xmm1);
1808 movdqu(xword[B-0x30], xmm4);
1809 movdqu(xword[B-0x10], xmm3);
// 8 bytes per line.  Lines 0..3 store to B-0x80 and B-0x60; lines 4..7 store
// to B-0x70 and B-0x50.
// NOTE(review): xmm1 holds the raw second-line load when punpckhqdq consumes
// it -- confirm whether a copy of xmm0 is expected there.
1818 movq(xmm0, qword[A1-0x80]);
1819 movq(xmm1, qword[A1+LDA*1-0x80]);
1820 movq(xmm2, qword[A1+LDA*2-0x80]);
1821 movq(xmm3, qword[A1+LDA3*1-0x80]);
1823 punpckldq(xmm0, xmm1);
1824 punpckldq(xmm2, xmm3);
1826 punpcklqdq(xmm0, xmm2);
1827 punpckhqdq(xmm1, xmm2);
1828 movdqu(xword[B-0x80], xmm0);
1829 movdqu(xword[B-0x60], xmm1);
1830 movq(xmm0, qword[A2-0x80]);
1831 movq(xmm1, qword[A2+LDA*1-0x80]);
1832 movq(xmm2, qword[A2+LDA*2-0x80]);
1833 movq(xmm3, qword[A2+LDA3*1-0x80]);
1835 punpckldq(xmm0, xmm1);
1836 punpckldq(xmm2, xmm3);
1838 punpcklqdq(xmm0, xmm2);
1839 punpckhqdq(xmm1, xmm2);
1840 movdqu(xword[B-0x70], xmm0);
1841 movdqu(xword[B-0x50], xmm1);
// 4 bytes per line.  The four dwords of lines 0..3 are packed in line order
// and stored to B-0x80; those of lines 4..7 are stored to B-0x70.
1848 movd(xmm0, dword[A1-0x80]);
1849 movd(xmm1, dword[A1+LDA*1-0x80]);
1850 movd(xmm2, dword[A1+LDA*2-0x80]);
1851 movd(xmm3, dword[A1+LDA3*1-0x80]);
1853 punpckldq(xmm0, xmm1);
1854 punpckldq(xmm2, xmm3);
1855 punpcklqdq(xmm0, xmm2);
1856 movdqu(xword[B-0x80], xmm0);
1857 movd(xmm0, dword[A2-0x80]);
1858 movd(xmm1, dword[A2+LDA*1-0x80]);
1859 movd(xmm2, dword[A2+LDA*2-0x80]);
1860 movd(xmm3, dword[A2+LDA3*1-0x80]);
1862 punpckldq(xmm0, xmm1);
1863 punpckldq(xmm2, xmm3);
1864 punpcklqdq(xmm0, xmm2);
1865 movdqu(xword[B-0x70], xmm0);
// 2 bytes per line.  Eight words gathered through ax into word lanes 0..7 of
// xmm0, stored as 16 bytes to B-0x80.
1872 mov(ax, word[A1-0x80]);
1873 pinsrw(xmm0, eax, 0x0);
1874 mov(ax, word[A1+LDA*1-0x80]);
1875 pinsrw(xmm0, eax, 0x1);
1876 mov(ax, word[A1+LDA*2-0x80]);
1877 pinsrw(xmm0, eax, 0x2);
1878 mov(ax, word[A1+LDA3*1-0x80]);
1880 pinsrw(xmm0, eax, 0x3);
1881 mov(ax, word[A2-0x80]);
1882 pinsrw(xmm0, eax, 0x4);
1883 mov(ax, word[A2+LDA*1-0x80]);
1884 pinsrw(xmm0, eax, 0x5);
1885 mov(ax, word[A2+LDA*2-0x80]);
1886 pinsrw(xmm0, eax, 0x6);
1887 mov(ax, word[A2+LDA3*1-0x80]);
1889 pinsrw(xmm0, eax, 0x7);
1890 movdqu(xword[B-0x80], xmm0);
// 1 byte per line.  Eight bytes gathered through al into byte lanes 0..7 of
// xmm0; only the low 8 bytes are stored (movq) to B-0x80.
1897 mov(al, byte[A1-0x80]);
1898 pinsrb(xmm0, eax, 0x0);
1899 mov(al, byte[A1+LDA*1-0x80]);
1900 pinsrb(xmm0, eax, 0x1);
1901 mov(al, byte[A1+LDA*2-0x80]);
1902 pinsrb(xmm0, eax, 0x2);
1903 mov(al, byte[A1+LDA3*1-0x80]);
1904 pinsrb(xmm0, eax, 0x3);
1905 mov(al, byte[A2-0x80]);
1906 pinsrb(xmm0, eax, 0x4);
1907 mov(al, byte[A2+LDA*1-0x80]);
1908 pinsrb(xmm0, eax, 0x5);
1909 mov(al, byte[A2+LDA*2-0x80]);
1910 pinsrb(xmm0, eax, 0x6);
1911 mov(al, byte[A2+LDA3*1-0x80]);
1912 pinsrb(xmm0, eax, 0x7);
1913 movq(qword[B-0x80], xmm0);
// Four-line section.  A2 is fixed at A1+2*LDA so the four lines are reached
// as A1, A1+LDA, A2 and A2+LDA; I receives A1+4*LDA.
// NOTE(review): I is not read anywhere in this section; presumably it is the
// base of the following four lines -- confirm against the loop control.
1930 lea(A2, ptr[A1+LDA*2]);
1931 lea(I, ptr[A1+LDA*4]);
// 16 bytes per line; four 16-byte results go to B-0x80, B-0x70, B-0x60,
// B-0x50.
// NOTE(review): within this sequence xmm4 and xmm5 are read without being
// written first, and xmm1 / xmm3 hold raw line data when punpckhqdq consumes
// them; a dword transpose would need register copies ahead of the unpacks --
// confirm those moves are present.
1939 movdqu(xmm0, xword[A1-0x80]);
1940 movdqu(xmm1, xword[A1+LDA*1-0x80]);
1942 movdqu(xmm2, xword[A2-0x80]);
1943 movdqu(xmm3, xword[A2+LDA*1-0x80]);
1946 punpckldq(xmm0, xmm1);
1947 punpckhdq(xmm4, xmm1);
1949 punpckldq(xmm2, xmm3);
1950 punpckhdq(xmm5, xmm3);
1952 punpcklqdq(xmm0, xmm2);
1953 punpckhqdq(xmm1, xmm2);
1955 punpcklqdq(xmm4, xmm5);
1956 punpckhqdq(xmm3, xmm5);
1957 movdqu(xword[B-0x80], xmm0);
1958 movdqu(xword[B-0x70], xmm1);
1959 movdqu(xword[B-0x60], xmm4);
1960 movdqu(xword[B-0x50], xmm3);
// 8 bytes per line; two 16-byte results go to B-0x80 and B-0x70.
// NOTE(review): xmm1 holds the raw second-line load when punpckhqdq consumes
// it -- confirm whether a copy of xmm0 is expected there.
1969 movq(xmm0, qword[A1-0x80]);
1970 movq(xmm1, qword[A1+LDA*1-0x80]);
1972 movq(xmm2, qword[A2-0x80]);
1973 movq(xmm3, qword[A2+LDA*1-0x80]);
1975 punpckldq(xmm0, xmm1);
1976 punpckldq(xmm2, xmm3);
1978 punpcklqdq(xmm0, xmm2);
1979 punpckhqdq(xmm1, xmm2);
1980 movdqu(xword[B-0x80], xmm0);
1981 movdqu(xword[B-0x70], xmm1);
// 4 bytes per line; the four dwords are packed in line order and stored as
// 16 bytes to B-0x80.
1988 movd(xmm0, dword[A1-0x80]);
1989 movd(xmm1, dword[A1+LDA*1-0x80]);
1991 movd(xmm2, dword[A2-0x80]);
1992 movd(xmm3, dword[A2+LDA*1-0x80]);
1994 punpckldq(xmm0, xmm1);
1995 punpckldq(xmm2, xmm3);
1996 punpcklqdq(xmm0, xmm2);
1997 movdqu(xword[B-0x80], xmm0);
// 2 bytes per line; four words gathered through ax into word lanes 0..3 and
// the low 8 bytes of xmm0 stored (movq) to B-0x80.
2004 mov(ax, word[A1-0x80]);
2005 pinsrw(xmm0, eax, 0x0);
2006 mov(ax, word[A1+LDA*1-0x80]);
2008 pinsrw(xmm0, eax, 0x1);
2009 mov(ax, word[A2-0x80]);
2010 pinsrw(xmm0, eax, 0x2);
2011 mov(ax, word[A2+LDA*1-0x80]);
2013 pinsrw(xmm0, eax, 0x3);
2014 movq(qword[B-0x80], xmm0);
// 1 byte per line; four bytes gathered through al into byte lanes 0..3 and
// the low 4 bytes of xmm0 stored (movd) to B-0x80.
2021 mov(al, byte[A1-0x80]);
2022 pinsrb(xmm0, eax, 0x0);
2023 mov(al, byte[A1+LDA*1-0x80]);
2024 pinsrb(xmm0, eax, 0x1);
2025 mov(al, byte[A2-0x80]);
2026 pinsrb(xmm0, eax, 0x2);
2027 mov(al, byte[A2+LDA*1-0x80]);
2028 pinsrb(xmm0, eax, 0x3);
2029 movd(dword[B-0x80], xmm0);
// Two-line section.  A2 points at the second line (A1+LDA); I receives
// A1+2*LDA.
// NOTE(review): I is not read anywhere in this section; presumably it is the
// base of the following pair of lines -- confirm against the loop control.
2046 lea(A2, ptr[A1+LDA*1]);
2047 lea(I, ptr[A1+LDA*2]);
// 16 bytes per line: dwords of the two lines are interleaved; the low-half
// interleave goes to B-0x80 and the high-half interleave to B-0x70.
// NOTE(review): within this sequence xmm2 is read by punpckhdq without being
// written first; the high-half interleave needs xmm2 to be a copy of the
// first line taken before punpckldq -- confirm that move is present.
2055 movdqu(xmm0, xword[A1-0x80]);
2057 movdqu(xmm1, xword[A2-0x80]);
2060 punpckldq(xmm0, xmm1);
2061 punpckhdq(xmm2, xmm1);
2062 movdqu(xword[B-0x80], xmm0);
2063 movdqu(xword[B-0x70], xmm2);
// 8 bytes per line: the two dwords of each line are interleaved into one
// 16-byte result stored to B-0x80.
2072 movq(xmm0, qword[A1-0x80]);
2074 movq(xmm1, qword[A2-0x80]);
2076 punpckldq(xmm0, xmm1);
2077 movdqu(xword[B-0x80], xmm0);
// 4 bytes per line: the two dwords are placed side by side and the low
// 8 bytes stored (movq) to B-0x80.
2084 movd(xmm0, dword[A1-0x80]);
2086 movd(xmm1, dword[A2-0x80]);
2088 punpckldq(xmm0, xmm1);
2089 movq(qword[B-0x80], xmm0);
// 2 bytes per line: two words gathered through ax into word lanes 0 and 1,
// low 4 bytes stored (movd) to B-0x80.
2096 mov(ax, word[A1-0x80]);
2098 pinsrw(xmm0, eax, 0x0);
2099 mov(ax, word[A2-0x80]);
2101 pinsrw(xmm0, eax, 0x1);
2102 movd(dword[B-0x80], xmm0);
// 1 byte per line: copied through al straight to consecutive bytes
// B-0x80 and B-0x7f.
2109 mov(al, byte[A1-0x80]);
2110 mov(byte[B-0x80], al);
2111 mov(al, byte[A2-0x80]);
2112 mov(byte[B-0x7f], al);
// Single-line section: with only one source line there is nothing to
// interleave, so each sequence is a plain copy from A1-0x80 to B-0x80.
// 16 bytes.
2136 movdqu(xmm0, xword[A1-0x80]);
2138 movdqu(xword[B-0x80], xmm0);
// 8 bytes.
2147 movq(xmm0, qword[A1-0x80]);
2149 movq(qword[B-0x80], xmm0);
// 4 bytes.
2156 movd(xmm0, dword[A1-0x80]);
2158 movd(dword[B-0x80], xmm0);
// 2 bytes, through ax.
2165 mov(ax, word[A1-0x80]);
2166 mov(word[B-0x80], ax);
// 1 byte, through al.
2174 mov(al, byte[A1-0x80]);
2175 mov(byte[B-0x80], al);