2 * MIPS DSPr2 optimizations for libjpeg-turbo
4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
6 * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
7 * Darko Laus <darko.laus@imgtec.com>
8 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the authors be held liable for any damages
12 * arising from the use of this software.
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute it
16 * freely, subject to the following restrictions:
18 * 1. The origin of this software must not be misrepresented; you must not
19 * claim that you wrote the original software. If you use this software
20 * in a product, an acknowledgment in the product documentation would be
21 * appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must not be
23 * misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source distribution.
27 #include "jsimd_dspr2_asm.h"
30 /*****************************************************************************/
// Null (pass-through) color conversion for compression. The visible loads
// fetch inptr = *input_buf and outptr = output_buf[ci][output_row].
31 LEAF_DSPR2(jsimd_c_null_convert_dspr2)
33 * a0 = cinfo->image_width
38 * 20(sp) = cinfo->num_components
40 * Null conversion for compression
42 SAVE_REGS_ON_STACK 8, s0, s1
// Stack arguments are read 8 bytes above their entry offsets (20(sp) at entry
// is 28(sp) here); presumably SAVE_REGS_ON_STACK 8 lowered sp by 8 -- confirm
// against jsimd_dspr2_asm.h.
44 lw t9, 24(sp) // t9 = num_rows
45 lw s0, 28(sp) // s0 = cinfo->num_components
46 andi t0, a0, 3 // t0 = cinfo->image_width & 3
47 beqz t0, 4f // width is a multiple of 4: branch to 4f (no residual)
55 lwx t5, t3(a2) // t5 = output_buf[ci] (indexed again below)
56 lw t2, 0(a1) // t2 = inptr = *input_buf
58 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
97 lwx t5, t3(a2) // t5 = output_buf[ci] (indexed again below)
98 lw t2, 0(a1) // t2 = inptr = *input_buf
100 lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
126 RESTORE_REGS_FROM_STACK 8, s0, s1
131 END(jsimd_c_null_convert_dspr2)
134 /*****************************************************************************/
136 * jsimd_extrgb_ycc_convert_dspr2
137 * jsimd_extbgr_ycc_convert_dspr2
138 * jsimd_extrgbx_ycc_convert_dspr2
139 * jsimd_extbgrx_ycc_convert_dspr2
140 * jsimd_extxbgr_ycc_convert_dspr2
141 * jsimd_extxrgb_ycc_convert_dspr2
143 * Colorspace conversion RGB -> YCbCr
// Generates jsimd_<colorid>_ycc_convert_dspr2 (RGB -> YCbCr).
//   colorid                - name fragment spliced into the function symbol
//   pixel_size             - bytes the input pointer advances per pixel
//   r_offs, g_offs, b_offs - byte offsets of R, G and B within one pixel
146 .macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
147 r_offs, g_offs, b_offs
// Loads one pixel's R, G and B bytes (lbu zero-extends) and advances inptr
// by pixel_size.
149 .macro DO_RGB_TO_YCC r, g, b, inptr
150 lbu \r, \r_offs(\inptr) // r = inptr[r_offs]
151 lbu \g, \g_offs(\inptr) // g = inptr[g_offs]
152 lbu \b, \b_offs(\inptr) // b = inptr[b_offs]
153 addiu \inptr, \pixel_size // inptr += pixel_size
156 LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
158 * a0 = cinfo->image_width
164 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
166 lw t7, 48(sp) // t7 = num_rows
// Coefficients are scaled by 2^16 (0x8000 = FIX(0.5)); the negative ones are
// written as 32-bit two's-complement constants.
167 li s0, 0x4c8b // FIX(0.29900)
168 li s1, 0x9646 // FIX(0.58700)
169 li s2, 0x1d2f // FIX(0.11400)
170 li s3, 0xffffd4cd // -FIX(0.16874)
171 li s4, 0xffffab33 // -FIX(0.33126)
172 li s5, 0x8000 // FIX(0.50000)
173 li s6, 0xffff94d1 // -FIX(0.41869)
174 li s7, 0xffffeb2f // -FIX(0.08131)
175 li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
178 addiu t7, -1 // --num_rows
179 lw t6, 0(a1) // t6 = input_buf[0]
184 lwx t0, t3(t0) // t0 = output_buf[0][output_row]
185 lwx t1, t3(t1) // t1 = output_buf[1][output_row]
186 lwx t2, t3(t2) // t2 = output_buf[2][output_row]
188 addu t9, t2, a0 // t9 = output_buf[2][output_row] + image_width (end address)
192 DO_RGB_TO_YCC t3, t4, t5, t6
219 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
223 END(jsimd_\colorid\()_ycc_convert_dspr2)
225 .purgem DO_RGB_TO_YCC
229 /*-------------------------------------id -- pix R G B */
230 GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
231 GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
232 GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
233 GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
234 GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
235 GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
238 /*****************************************************************************/
240 * jsimd_ycc_extrgb_convert_dspr2
241 * jsimd_ycc_extbgr_convert_dspr2
242 * jsimd_ycc_extrgbx_convert_dspr2
243 * jsimd_ycc_extbgrx_convert_dspr2
244 * jsimd_ycc_extxbgr_convert_dspr2
245 * jsimd_ycc_extxrgb_convert_dspr2
247 * Colorspace conversion YCbCr -> RGB
// Generates jsimd_ycc_<colorid>_convert_dspr2 (YCbCr -> RGB).
//   colorid                - name fragment spliced into the function symbol
//   pixel_size             - bytes the output pointer advances per pixel
//   r_offs, g_offs, b_offs - byte offsets of R, G and B within one pixel
//   a_offs                 - byte offset of the 4th byte; only referenced
//                            when pixel_size == 4
250 .macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
251 r_offs, g_offs, b_offs, a_offs
// Stores one converted pixel byte-by-byte and advances outptr by pixel_size.
253 .macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
254 sb \scratch0, \r_offs(\outptr)
255 sb \scratch1, \g_offs(\outptr)
256 sb \scratch2, \b_offs(\outptr)
257 .if (\pixel_size == 4)
// NOTE(review): t0 is presumably loaded with the filler/alpha value on a line
// not shown in this listing; it is clobbered here regardless -- confirm.
259 sb t0, \a_offs(\outptr)
261 addiu \outptr, \pixel_size
264 LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
266 * a0 = cinfo->image_width
272 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
// Coefficients are scaled by 2^16; the negative ones are written as 32-bit
// two's-complement constants.
276 li t4, 0x166e9 // FIX(1.40200)
277 li t5, 0x1c5a2 // FIX(1.77200)
278 li t6, 0xffff492e // -FIX(0.71414)
279 li t7, 0xffffa7e6 // -FIX(0.34414)
304 mul t0, t6, s7 // Crgtab[cr]: t0 = -FIX(0.71414) * s7
306 mulq_rs.w t1, t4, s7 // Crrtab[cr]
308 addu t2, t3 // Cbgtab[cb]
311 mulq_rs.w t0, t5, s6 // Cbbtab[cb]
322 addu.ph t2, t2, t8 // clip & store
327 STORE_YCC_TO_RGB t1, t2, t0, s0
334 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
338 END(jsimd_ycc_\colorid\()_convert_dspr2)
340 .purgem STORE_YCC_TO_RGB
// The A column is ignored for pix == 3 because STORE_YCC_TO_RGB only uses
// a_offs inside its ".if (pixel_size == 4)" section.
344 /*-------------------------------------id -- pix R G B A */
345 GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
346 GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
347 GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
348 GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
349 GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
350 GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
353 /*****************************************************************************/
355 * jsimd_extrgb_gray_convert_dspr2
356 * jsimd_extbgr_gray_convert_dspr2
357 * jsimd_extrgbx_gray_convert_dspr2
358 * jsimd_extbgrx_gray_convert_dspr2
359 * jsimd_extxbgr_gray_convert_dspr2
360 * jsimd_extxrgb_gray_convert_dspr2
362 * Colorspace conversion RGB -> GRAY
// Generates jsimd_<colorid>_gray_convert_dspr2 (RGB -> grayscale).
//   colorid                - name fragment spliced into the function symbol
//   pixel_size             - bytes the input pointer advances per pixel
//   r_offs, g_offs, b_offs - byte offsets of R, G and B within one pixel
365 .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
366 r_offs, g_offs, b_offs
// Loads one pixel's R, G and B bytes (lbu zero-extends) and advances inptr
// by pixel_size.
368 .macro DO_RGB_TO_GRAY r, g, b, inptr
369 lbu \r, \r_offs(\inptr) // r = inptr[r_offs]
370 lbu \g, \g_offs(\inptr) // g = inptr[g_offs]
371 lbu \b, \b_offs(\inptr) // b = inptr[b_offs]
372 addiu \inptr, \pixel_size // inptr += pixel_size
375 LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
377 * a0 = cinfo->image_width
383 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
// Luma coefficients scaled by 2^16 (s7 = 0x8000 = FIX(0.5)).
385 li s0, 0x4c8b // s0 = FIX(0.29900)
386 li s1, 0x9646 // s1 = FIX(0.58700)
387 li s2, 0x1d2f // s2 = FIX(0.11400)
388 li s7, 0x8000 // s7 = FIX(0.50000)
393 addiu s6, -1 // --num_rows (s6)
// Pixels are loaded in pairs: first into t3/t4/t5, then into s3/s4/s5.
405 DO_RGB_TO_GRAY t3, t4, t5, t0
406 DO_RGB_TO_GRAY s3, s4, s5, t0
418 DO_RGB_TO_GRAY t3, t4, t5, t0
419 DO_RGB_TO_GRAY s3, s4, s5, t0
445 DO_RGB_TO_GRAY t3, t4, t5, t0
461 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
465 END(jsimd_\colorid\()_gray_convert_dspr2)
467 .purgem DO_RGB_TO_GRAY
471 /*-------------------------------------id -- pix R G B */
472 GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
473 GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
474 GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
475 GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
476 GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
477 GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
480 /*****************************************************************************/
482 * jsimd_h2v2_merged_upsample_dspr2
483 * jsimd_h2v2_extrgb_merged_upsample_dspr2
484 * jsimd_h2v2_extrgbx_merged_upsample_dspr2
485 * jsimd_h2v2_extbgr_merged_upsample_dspr2
486 * jsimd_h2v2_extbgrx_merged_upsample_dspr2
487 * jsimd_h2v2_extxbgr_merged_upsample_dspr2
488 * jsimd_h2v2_extxrgb_merged_upsample_dspr2
490 * Merged h2v2 upsample routines
// Generates jsimd_h2v2_<colorid>_merged_upsample_dspr2 (merged h2v2 upsample
// and color conversion). pixel_size here is the byte size of TWO output
// pixels (6 or 8 in the instantiations below); the *1_offs / *2_offs
// arguments are the byte offsets of each channel of the first / second pixel.
492 .macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
498 .macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
500 sb \scratch0, \r1_offs(\outptr)
501 sb \scratch1, \g1_offs(\outptr)
502 sb \scratch2, \b1_offs(\outptr)
503 sb \scratch3, \r2_offs(\outptr)
504 sb \scratch4, \g2_offs(\outptr)
505 sb \scratch5, \b2_offs(\outptr)
506 .if (\pixel_size == 8)
// scratch0 is reused for both 4th-byte stores; NOTE(review): presumably it is
// reloaded with the filler value on a line not shown in this listing.
508 sb \scratch0, \a1_offs(\outptr)
509 sb \scratch0, \a2_offs(\outptr)
511 addiu \outptr, \pixel_size // advance by two pixels
// Stores a single trailing pixel using the first-pixel offsets only.
514 .macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
515 sb \scratch0, \r1_offs(\outptr)
516 sb \scratch1, \g1_offs(\outptr)
517 sb \scratch2, \b1_offs(\outptr)
519 .if (\pixel_size == 8)
// NOTE(review): t0 is presumably set to the filler value on a line not shown
// in this listing -- confirm.
521 sb t0, \a1_offs(\outptr)
525 LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
527 * a0 = cinfo->output_width
529 * a2 = in_row_group_ctr
531 * 16(sp) = cinfo->sample_range_limit
// ra is in the save list because it is used as a scratch register below.
533 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
535 lw t9, 56(sp) // cinfo->sample_range_limit (16(sp) at entry, read 40 bytes higher)
542 lw t4, 0(a3) // t4 = output_buf[0]
543 lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
544 lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
545 lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
546 lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
547 lw t7, 4(a3) // t7 = output_buf[1]
// The constants are built incrementally from s1, whose initial value is set
// on a line not shown in this listing.
549 addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
550 addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
551 addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
552 xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
555 addu t0, t5, t3 // t0 = end address
560 addiu t3, t3, -128 // (cb - 128)
561 addiu s3, s3, -128 // (cr - 128)
566 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
567 extr_r.w s5, $ac1, 16 // s5 = ac1 >> 16 with rounding (the "cgreen" term below)
568 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
// Each group below adds y (v0) to the three chroma terms, then adds t9
// (sample_range_limit) to form a lookup address.
572 addu t3, v0, s4 // y+cred
573 addu s3, v0, s5 // y+cgreen
574 addu v1, v0, s6 // y+cblue
575 addu t3, t9, t3 // y+cred
576 addu s3, t9, s3 // y+cgreen
577 addu v1, t9, v1 // y+cblue
582 addu t3, v0, s4 // y+cred
583 addu s3, v0, s5 // y+cgreen
584 addu v1, v0, s6 // y+cblue
585 addu t3, t9, t3 // y+cred
586 addu s3, t9, s3 // y+cgreen
587 addu v1, t9, v1 // y+cblue
// AT, s7 and ra carry the first pixel; t3, s3, v1 the second. Output row 0.
593 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
595 addu t3, v0, s4 // y+cred
596 addu s3, v0, s5 // y+cgreen
597 addu v1, v0, s6 // y+cblue
598 addu t3, t9, t3 // y+cred
599 addu s3, t9, s3 // y+cgreen
600 addu v1, t9, v1 // y+cblue
606 addu t3, v0, s4 // y+cred
607 addu s3, v0, s5 // y+cgreen
608 addu v1, v0, s6 // y+cblue
609 addu t3, t9, t3 // y+cred
610 addu s3, t9, s3 // y+cgreen
611 addu v1, t9, v1 // y+cblue
// Same as above for output row 1 (t7 = output_buf[1]).
616 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
625 addiu t3, t3, -128 // (cb - 128)
626 addiu s3, s3, -128 // (cr - 128)
632 extr_r.w s5, $ac1, 16 // s5 = ac1 >> 16 with rounding (the "cgreen" term below)
633 mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
634 mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
635 addu t3, v0, s4 // y+cred
636 addu s3, v0, s5 // y+cgreen
637 addu v1, v0, s6 // y+cblue
638 addu t3, t9, t3 // y+cred
639 addu s3, t9, s3 // y+cgreen
640 addu v1, t9, v1 // y+cblue
646 STORE_H2V2_1_PIXEL t3, s3, v1, t4
648 addu t3, v0, s4 // y+cred
649 addu s3, v0, s5 // y+cgreen
650 addu v1, v0, s6 // y+cblue
651 addu t3, t9, t3 // y+cred
652 addu s3, t9, s3 // y+cgreen
653 addu v1, t9, v1 // y+cblue
658 STORE_H2V2_1_PIXEL t3, s3, v1, t7
660 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
665 END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
667 .purgem STORE_H2V2_1_PIXEL
668 .purgem STORE_H2V2_2_PIXELS
// The A1/A2 columns are only used when pix == 8 (see the .if sections in the
// store macros).
671 /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
672 GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
673 GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
674 GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
675 GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
676 GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
677 GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
680 /*****************************************************************************/
682 * jsimd_h2v1_merged_upsample_dspr2
683 * jsimd_h2v1_extrgb_merged_upsample_dspr2
684 * jsimd_h2v1_extrgbx_merged_upsample_dspr2
685 * jsimd_h2v1_extbgr_merged_upsample_dspr2
686 * jsimd_h2v1_extbgrx_merged_upsample_dspr2
687 * jsimd_h2v1_extxbgr_merged_upsample_dspr2
688 * jsimd_h2v1_extxrgb_merged_upsample_dspr2
690 * Merged h2v1 upsample routines
// Generates jsimd_h2v1_<colorid>_merged_upsample_dspr2 (merged h2v1 upsample
// and color conversion). pixel_size here is the byte size of TWO output
// pixels (6 or 8 in the instantiations below); the *1_offs / *2_offs
// arguments are the byte offsets of each channel of the first / second pixel.
693 .macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
699 .macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
701 sb \scratch0, \r1_offs(\outptr)
702 sb \scratch1, \g1_offs(\outptr)
703 sb \scratch2, \b1_offs(\outptr)
704 sb \scratch3, \r2_offs(\outptr)
705 sb \scratch4, \g2_offs(\outptr)
706 sb \scratch5, \b2_offs(\outptr)
707 .if (\pixel_size == 8)
// NOTE(review): t0 is presumably set to the filler value on a line not shown
// in this listing; it is clobbered here regardless -- confirm.
709 sb t0, \a1_offs(\outptr)
710 sb t0, \a2_offs(\outptr)
712 addiu \outptr, \pixel_size // advance by two pixels
// Stores a single trailing pixel using the first-pixel offsets only.
715 .macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
716 sb \scratch0, \r1_offs(\outptr)
717 sb \scratch1, \g1_offs(\outptr)
718 sb \scratch2, \b1_offs(\outptr)
719 .if (\pixel_size == 8)
721 sb t0, \a1_offs(\outptr)
725 LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
727 * a0 = cinfo->output_width
729 * a2 = in_row_group_ctr
731 * 16(sp) = range_limit
// ra is in the save list because it is used as a scratch register below.
733 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
736 lw t1, 0(a1) // t1 = input_buf[0]
737 lw t2, 4(a1) // t2 = input_buf[1]
738 lw t3, 8(a1) // t3 = input_buf[2]
739 lw t8, 56(sp) // t8 = range_limit (16(sp) at entry, read 40 bytes higher)
// The constants are derived from t0, whose initial value is set on a line not
// shown in this listing.
740 addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
741 addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
742 addiu s0, t0, 0x9916 // s0 = 0x8000
743 addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
744 xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
747 lwx s5, t4(t1) // s5 = inptr0
748 lwx s6, t4(t2) // s6 = inptr1
749 lwx s7, t4(t3) // s7 = inptr2
750 lw t7, 0(a3) // t7 = outptr
752 addu t9, s6, t0 // t9 = end address
754 lbu t2, 0(s6) // t2 = cb
755 lbu t0, 0(s7) // t0 = cr
756 lbu t1, 0(s5) // t1 = y
757 addiu t2, t2, -128 // t2 = cb - 128
758 addiu t0, t0, -128 // t0 = cr - 128
763 mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
764 extr_r.w t5, $ac1, 16 // t5 = ac1 >> 16 with rounding (the "cgreen" term below)
765 mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
768 addu t2, t1, t0 // t2 = y + cred
769 addu t3, t1, t5 // t3 = y + cgreen
770 addu t4, t1, t6 // t4 = y + cblue
// v0, v1 and ra carry the first pixel; t2, t3, t4 the second.
788 STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
800 addiu t2, t2, -128 // (cb - 128)
801 addiu t0, t0, -128 // (cr - 128)
806 mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
807 mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
810 sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
811 addu t2, t1, t0 // y + cred
812 addu t3, t1, t5 // y + cgreen
813 addu t4, t1, t6 // y + cblue
821 STORE_H2V1_1_PIXEL t2, t3, t4, t7
823 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
828 END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
830 .purgem STORE_H2V1_1_PIXEL
831 .purgem STORE_H2V1_2_PIXELS
// The A1/A2 columns are only used when pix == 8 (see the .if sections in the
// store macros).
834 /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
835 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
836 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
837 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
838 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
839 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
840 GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
843 /*****************************************************************************/
845 * jsimd_h2v2_fancy_upsample_dspr2
847 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
// Fancy (weighted) upsampling for 2:1 horizontal and 2:1 vertical. Column
// sums combine two input rows (inptr0 weighted by 3 plus inptr1), and output
// samples are formed from (this*3 + neighbour + bias) >> 4, as the visible
// comments describe.
849 LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
851 * a0 = cinfo->max_v_samp_factor
852 * a1 = downsampled_width
854 * a3 = output_data_ptr
856 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
859 lw s2, 0(a3) // s2 = *output_data_ptr
862 lw s1, -4(a2) // s1 = inptr1
865 lw s0, 0(a2) // s0 = inptr0
867 addiu s5, a1, -2 // s5 = downsampled_width - 2
876 addu t8, s0, t4 // t8 = end address
877 andi s5, s5, 1 // s5 = residual
880 addu t0, t0, t4 // t0 = (*inptr0++) * 3
881 addu t1, t1, t6 // t1 = (*inptr0++) * 3
882 addu t7, t0, t2 // t7 = thiscolsum
883 addu t6, t1, t3 // t6 = nextcolsum
884 sll t0, t7, 2 // t0 = thiscolsum * 4
885 subu t1, t0, t7 // t1 = thiscolsum * 3
892 beq t8, s0, 22f // skip to final iteration if width == 3
895 lh t0, 0(s0) // t0 = A3|A2
896 lh t2, 0(s1) // t2 = B3|B2
899 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
900 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
903 addu.ph t0, t1, t0 // t0 = A3*3|A2*3
904 addu t3, t3, t6 // t3 = this * 3
905 addu.ph t0, t0, t2 // t0 = next2|next1
907 andi t7, t0, 0xFFFF // t7 = next1
909 addu t2, t7, t2 // t2 = next1*3
911 srl t6, t0, 16 // t6 = next2
912 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
915 srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
916 shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
919 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
936 addu t1, t0, t1 // t1 = inptr0 * 3
937 addu t3, t3, t6 // t3 = thiscolsum * 3
951 sll t0, t6, 2 // t0 = thiscolsum * 4
952 subu t1, t0, t6 // t1 = thiscolsum * 3
969 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
973 END(jsimd_h2v2_fancy_upsample_dspr2)
976 /*****************************************************************************/
// Fancy (weighted) upsampling for the 2:1 horizontal case. The visible core
// widens packed bytes to halfwords (preceu.ph.*), forms pixel*3 sums and
// shifts right by 2 to produce output samples.
977 LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
979 * a0 = cinfo->max_v_samp_factor
980 * a1 = downsampled_width
982 * a3 = output_data_ptr
984 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
999 lbu t1, 1(t7) // t1 = inptr[1]
1001 addu t2, t2, t0 // t2 = invalue*3
1009 ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
1011 ulh t2, 4(t7) // t2 = |0|0|P5|P4|
1012 preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
1013 preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
1014 preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
// NOTE(review): t1 presumably holds |P4|P3|P2|P1| from an unaligned load on a
// line not shown in this listing -- confirm.
1015 preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
1016 preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
1019 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
1020 addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
1025 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
1026 shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
1029 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
1030 shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
1050 addu t2, t0, t1 // t2 = invalue
1069 addu t1, t1, t0 // t1 = invalue * 3
1079 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1083 END(jsimd_h2v1_fancy_upsample_dspr2)
1086 /*****************************************************************************/
// 2:1 horizontal downsampling. The visible code averages with addqh.w and
// starts with a bias of 0 in s3.
1087 LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
1089 * a0 = cinfo->image_width
1090 * a1 = cinfo->max_v_samp_factor
1091 * a2 = compptr->v_samp_factor
1092 * a3 = compptr->width_in_blocks
1093 * 16(sp) = input_data
1094 * 20(sp) = output_data
1098 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
// Stack arguments are read 24 bytes above their entry offsets (16(sp) -> 40(sp),
// 20(sp) -> 44(sp)); presumably SAVE_REGS_ON_STACK 24 lowered sp by 24.
1101 lw s1, 44(sp) // s1 = output_data
1102 lw s0, 40(sp) // s0 = input_data
1107 sll t0, a3, 3 // t0 = width_in_blocks * DCTSIZE (<< 3)
1111 andi t6, a0, 1 // t6 = temp_index
1113 lw t4, 0(s1) // t4 = outptr
1114 lw t5, 0(s0) // t5 = inptr0
1115 li s3, 0 // s3 = bias
1116 srl t7, a0, 1 // t7 = image_width1
1154 addqh.w t2, t1, s3 // t2 = pixval1
1156 addqh.w t3, t1, s3 // t3 = pixval2
1159 addu t5, t4, s2 // t5 = loop_end2
1175 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
1179 END(jsimd_h2v1_downsample_dspr2)
1182 /*****************************************************************************/
// 2:1 horizontal and 2:1 vertical downsampling. The visible code reads two
// input rows (inptr0 and inptr1), starts with a bias of 1 in s6 and shifts
// sums right by 2.
1183 LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
1185 * a0 = cinfo->image_width
1186 * a1 = cinfo->max_v_samp_factor
1187 * a2 = compptr->v_samp_factor
1188 * a3 = compptr->width_in_blocks
1189 * 16(sp) = input_data
1190 * 20(sp) = output_data
1194 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
// Stack arguments are read 32 bytes above their entry offsets (16(sp) -> 48(sp),
// 20(sp) -> 52(sp)); presumably SAVE_REGS_ON_STACK 32 lowered sp by 32.
1197 lw s1, 52(sp) // s1 = output_data
1198 lw s0, 48(sp) // s0 = input_data
1200 andi t6, a0, 1 // t6 = temp_index
1202 srl t7, a0, 1 // t7 = image_width1
1209 sll t0, a3, 3 // t0 = width_in_blocks * DCTSIZE (<< 3)
1213 lw t4, 0(s1) // t4 = outptr
1214 lw t5, 0(s0) // t5 = inptr0
1215 lw s7, 4(s0) // s7 = inptr1
1216 li s6, 1 // s6 = bias
1218 ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
1219 ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
1222 precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
1223 ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
1229 precrq.ph.w t7, t2, t3
1267 srl t0, t3, 2 // t0 = pixval1
1270 srl t1, t2, 2 // t1 = pixval2
1288 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1292 END(jsimd_h2v2_downsample_dspr2)
1295 /*****************************************************************************/
// 2:1 x 2:1 downsampling with smoothing. The visible code loads the rows
// inrow-1 .. inrow+2 and special-cases the first and last columns.
1296 LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
1300 * a2 = compptr->v_samp_factor
1301 * a3 = cinfo->max_v_samp_factor
1302 * 16(sp) = cinfo->smoothing_factor
1303 * 20(sp) = compptr->width_in_blocks
1304 * 24(sp) = cinfo->image_width
1308 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
// Stack arguments are read 32 bytes above their entry offsets; presumably
// SAVE_REGS_ON_STACK 32 lowered sp by 32.
1310 lw s7, 52(sp) // compptr->width_in_blocks
1311 lw s0, 56(sp) // cinfo->image_width
1312 lw s6, 48(sp) // cinfo->smoothing_factor
1313 sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
1318 addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
1340 subu t6, v1, v0 // t6 = 16384 - tmp_smooth_f * 80
1341 sll t7, s6, 4 // t7 = tmp_smooth_f * 16
1343 /* Special case for first column: pretend column -1 is same as column 0 */
1345 lwx t8, v0(a1) // outptr = output_data[outrow]
1350 lwx s2, v1(a0) // inptr0 = input_data[inrow]
1351 lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
1352 lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
1353 lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
// Every extr_r.w below extracts ac1 >> 16 with rounding into the named
// register.
1381 extr_r.w v0, $ac1, 16
1390 addu s5, s4, t8 // end address
1419 extr_r.w t2, $ac1, 16
1429 addu s5, s5, t8 // end address
1462 extr_r.w t2, $ac1, 16
1491 extr_r.w t2, $ac1, 16
1520 extr_r.w t3, $ac1, 16
1548 extr_r.w t1, $ac1, 16
1553 /* Special case for last column */
1581 extr_r.w t0, $ac1, 16
1588 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1593 END(jsimd_h2v2_smooth_downsample_dspr2)
1596 /*****************************************************************************/
// Upsampling by integer factors: h_expand horizontally (a0) and v_expand
// vertically (a1).
1597 LEAF_DSPR2(jsimd_int_upsample_dspr2)
1599 * a0 = upsample->h_expand[compptr->component_index]
1600 * a1 = upsample->v_expand[compptr->component_index]
1602 * a3 = output_data_ptr
1603 * 16(sp) = cinfo->output_width
1604 * 20(sp) = cinfo->max_v_samp_factor
1608 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
1610 lw s0, 0(a3) // s0 = output_data
// Stack arguments are read 16 bytes above their entry offsets (16(sp) -> 32(sp),
// 20(sp) -> 36(sp)); presumably SAVE_REGS_ON_STACK 16 lowered sp by 16.
1611 lw s1, 32(sp) // s1 = cinfo->output_width
1612 lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor
1613 li t6, 0 // t6 = inrow
1615 li s3, 0 // s3 = outrow
1619 lw t3, 0(t0) // t3 = inptr
1620 lw t8, 0(t7) // t8 = outptr
1622 addu t5, t8, s1 // t5 = outend
// lb sign-extends the sample. NOTE(review): harmless only if consumers use
// just the low byte (e.g. sb stores) -- confirm against the lines not shown.
1624 lb t2, 0(t3) // t2 = invalue = *inptr++
1627 move t0, a0 // t0 = h_expand
1637 addiu t9, a1, -1 // t9 = v_expand - 1
1645 addu t5, t3, s1 // t5 = end address
1646 andi t7, s1, 0xF // t7 = residual
1677 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1681 END(jsimd_int_upsample_dspr2)
1684 /*****************************************************************************/
// 2:1 horizontal upsampling by pixel replication: the visible core loads 8
// input bytes and duplicates each one with ins. No SAVE_REGS_ON_STACK appears
// in the visible lines; only t registers are used there.
1685 LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
1687 * a0 = cinfo->max_v_samp_factor
1688 * a1 = cinfo->output_width
1690 * a3 = output_data_ptr
1692 lw t7, 0(a3) // t7 = output_data
1693 andi t8, a1, 0xf // t8 = residual
1696 addu t9, t7, t0 // t9 = output_data end address
1698 lw t5, 0(t7) // t5 = outptr
1699 lw t6, 0(a2) // t6 = inptr
1700 addu t3, t5, a1 // t3 = outptr + output_width (end address)
1701 subu t3, t8 // t3 = end address - residual
1705 ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
1706 ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
1707 srl t1, t0, 16 // t1 = |X|X|P3|P2|
// Doubling trick: copy the low halfword into the high halfword, then insert
// the low halfword at bit 8 so each byte appears twice.
1708 ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
1709 ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
1710 ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
1711 ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
1714 srl t0, t2, 16 // t0 = |X|X|P7|P6|
1715 ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
1716 ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
1717 ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
1718 ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
1741 END(jsimd_h2v1_upsample_dspr2)
1744 /*****************************************************************************/
// 2:1 horizontal and 2:1 vertical upsampling. The visible code sets up one
// output row (outptr) and later loads two consecutive row pointers
// (outptr[0], outptr[1]).
1745 LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
1747 * a0 = cinfo->max_v_samp_factor
1748 * a1 = cinfo->output_width
1750 * a3 = output_data_ptr
1754 andi t9, a1, 0xf // t9 = residual (output_width & 15)
1756 lw t6, 0(a2) // t6 = inptr
1757 lw t5, 0(t7) // t5 = outptr
1758 addu t8, t5, a1 // t8 = outptr end address (outptr + output_width)
1759 subu t8, t9 // t8 = end address - residual
1793 lw t6, 0(t7) // t6 = outptr[0]
1794 lw t5, 4(t7) // t5 = outptr[1]
1795 addu t4, t6, a1 // t4 = new end address (outptr[0] + output_width)
1826 END(jsimd_h2v2_upsample_dspr2)
1829 /*****************************************************************************/
// Slow-but-accurate integer inverse DCT. The visible code has two passes: the
// first dequantizes coefficients from inptr (a0) with quantptr (a1), the
// second reads 32-bit intermediates from wsptr (v0). Multiplier constants are
// scaled by 2^13 (CONST_BITS = 13, as the "<< CONST_BITS" shifts show).
1830 LEAF_DSPR2(jsimd_idct_islow_dspr2)
1833 * a1 = compptr->dcttable
1837 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1841 addiu v1, zero, 8 // v1 = DCTSIZE = 8
1843 lh s4, 32(a0) // s4 = inptr[16]
1844 lh s5, 64(a0) // s5 = inptr[32]
1845 lh s6, 96(a0) // s6 = inptr[48]
1846 lh t1, 112(a0) // t1 = inptr[56]
1847 lh t7, 16(a0) // t7 = inptr[8]
1848 lh t5, 80(a0) // t5 = inptr[40]
1849 lh t3, 48(a0) // t3 = inptr[24]
1858 lh s5, 0(a1) // quantptr[DCTSIZE*0]
1859 lh s6, 0(a0) // inptr[DCTSIZE*0]
1860 mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
1876 mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
1877 mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
1878 mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
1879 mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
1884 addu s0, t0, t1 // z3 = tmp0 + tmp2
1885 addu s1, t1, t2 // z2 = tmp1 + tmp2
1886 addu s2, t2, t3 // z4 = tmp1 + tmp3
1887 addu s3, s0, s2 // z3 + z4
1888 addiu t9, zero, 9633 // FIX_1_175875602
1889 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1890 addu t8, t0, t3 // z1 = tmp0 + tmp3
1891 addiu t9, zero, 2446 // FIX_0_298631336
1892 mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1893 addiu t9, zero, 16819 // FIX_2_053119869
1894 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1895 addiu t9, zero, 25172 // FIX_3_072711026
1896 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1897 addiu t9, zero, 12299 // FIX_1_501321110
1898 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
// The next four products use positive constants, so the registers hold the
// negated z values (hence the "-z" comments); the following updates use subu
// to apply them as "+=".
1899 addiu t9, zero, 16069 // FIX_1_961570560
1900 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1901 addiu t9, zero, 3196 // FIX_0_390180644
1902 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1903 addiu t9, zero, 7373 // FIX_0_899976223
1904 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1905 addiu t9, zero, 20995 // FIX_2_562915447
1906 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
1907 subu s0, s3, s0 // z3 += z5
1908 addu t0, t0, s0 // tmp0 += z3
1909 addu t1, t1, s0 // tmp2 += z3
1910 subu s2, s3, s2 // z4 += z5
1911 addu t2, t2, s2 // tmp1 += z4
1912 addu t3, t3, s2 // tmp3 += z4
1913 subu t0, t0, t8 // tmp0 += z1
1914 subu t1, t1, s1 // tmp2 += z2
1915 subu t2, t2, s1 // tmp1 += z2
1916 subu t3, t3, t8 // tmp3 += z1
1917 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
1918 addiu t9, zero, 6270 // FIX_0_765366865
1919 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
1924 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
1925 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
1926 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
1927 addiu t9, zero, 4433 // FIX_0_541196100
1928 addu s3, s0, s1 // z2 + z3
1929 mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1930 addiu t9, zero, 15137 // FIX_1_847759065
1931 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
1934 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
1935 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
1936 addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1937 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
// Second pass: operands come from the 32-bit workspace at v0 (wsptr).
1974 lw t0, 8(v0) // z2 = (JLONG)wsptr[2]
1975 lw t1, 24(v0) // z3 = (JLONG)wsptr[6]
1976 lw t2, 0(v0) // (JLONG)wsptr[0]
1977 lw t3, 16(v0) // (JLONG)wsptr[4]
1978 lw s4, 4(v0) // (JLONG)wsptr[1]
1979 lw s5, 12(v0) // (JLONG)wsptr[3]
1980 lw s6, 20(v0) // (JLONG)wsptr[5]
1981 lw s7, 28(v0) // (JLONG)wsptr[7]
2000 addu t4, t0, t1 // z2 + z3
2001 addiu t8, zero, 4433 // FIX_0_541196100
2002 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2003 addiu t8, zero, 15137 // FIX_1_847759065
2004 mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
2005 addiu t8, zero, 6270 // FIX_0_765366865
2006 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
2007 addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4]
2008 subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4]
2009 sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
2010 sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
2011 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
2012 subu t3, t2, t1 // tmp12 = tmp1 - tmp2
2013 addu t2, t2, t1 // tmp11 = tmp1 + tmp2
2014 addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2015 subu t1, t4, t5 // tmp13 = tmp0 - tmp3
2016 addu t0, t4, t5 // tmp10 = tmp0 + tmp3
2017 lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7]
2018 lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3]
2019 lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5]
2020 lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1]
2021 addu s0, t4, t6 // z3 = tmp0 + tmp2
2022 addiu t8, zero, 9633 // FIX_1_175875602
2023 addu s1, t5, t7 // z4 = tmp1 + tmp3
2024 addu s2, s0, s1 // z3 + z4
2025 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2026 addu s3, t4, t7 // z1 = tmp0 + tmp3
2027 addu t9, t5, t6 // z2 = tmp1 + tmp2
// As in the first pass, the "-z" products are computed with positive
// constants and applied with subu below.
2028 addiu t8, zero, 16069 // FIX_1_961570560
2029 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
2030 addiu t8, zero, 3196 // FIX_0_390180644
2031 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
2032 addiu t8, zero, 2446 // FIX_0_298631336
2033 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2034 addiu t8, zero, 7373 // FIX_0_899976223
2035 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
2036 addiu t8, zero, 16819 // FIX_2_053119869
2037 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2038 addiu t8, zero, 20995 // FIX_2_562915447
2039 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
2040 addiu t8, zero, 25172 // FIX_3_072711026
2041 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2042 addiu t8, zero, 12299 // FIX_1_501321110
2043 mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2044 subu s0, s2, s0 // z3 += z5
2045 subu s1, s2, s1 // z4 += z5
2047 subu t4, t4, s3 // tmp0 (minus the stored -z1, i.e. += z1)
2049 subu t5, t5, t9 // tmp1 (minus the stored -z2, i.e. += z2)
2051 subu t6, t6, t9 // tmp2 (minus the stored -z2, i.e. += z2)
2053 subu t7, t7, s3 // tmp3 (minus the stored -z1, i.e. += z1)
2101 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2106 END(jsimd_idct_islow_dspr2)
2109 /*****************************************************************************/
2110 LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) // ifast IDCT column pass: inptr (a0) times quantptr (a1), results stored to wsptr (a2)
2115 * a3 = mips_idct_ifast_coefs
2117 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2119 addiu t9, a0, 16 // t9 = inptr + 16 bytes = end address
2123 lw s0, 0(a1) // quantptr[DCTSIZE*0]
2124 lw t0, 0(a0) // inptr[DCTSIZE*0]
2125 lw t1, 16(a0) // inptr[DCTSIZE*1]
2126 muleq_s.w.phl v0, t0, s0 // tmp0 ... (product of the left/high halfwords)
2127 lw t2, 32(a0) // inptr[DCTSIZE*2]
2128 lw t3, 48(a0) // inptr[DCTSIZE*3]
2129 lw t4, 64(a0) // inptr[DCTSIZE*4]
2130 lw t5, 80(a0) // inptr[DCTSIZE*5]
2131 muleq_s.w.phr t0, t0, s0 // ... tmp0 ... (product of the right/low halfwords)
2132 lw t6, 96(a0) // inptr[DCTSIZE*6]
2133 lw t7, 112(a0) // inptr[DCTSIZE*7]
2137 ins t0, v0, 16, 16 // ... tmp0 (low 16 bits of v0 packed into the upper half of t0)
2142 sw t0, 0(a2) // wsptr[DCTSIZE*0]
2143 sw t0, 16(a2) // wsptr[DCTSIZE*1]
2144 sw t0, 32(a2) // wsptr[DCTSIZE*2]
2145 sw t0, 48(a2) // wsptr[DCTSIZE*3]
2146 sw t0, 64(a2) // wsptr[DCTSIZE*4]
2147 sw t0, 80(a2) // wsptr[DCTSIZE*5]
2148 sw t0, 96(a2) // wsptr[DCTSIZE*6]
2149 sw t0, 112(a2) // wsptr[DCTSIZE*7]
2155 lw s1, 32(a1) // quantptr[DCTSIZE*2]
2156 lw s2, 64(a1) // quantptr[DCTSIZE*4]
2157 muleq_s.w.phl v0, t2, s1 // tmp1 ...
2158 muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
2159 lw s0, 16(a1) // quantptr[DCTSIZE*1]
2160 lw s1, 48(a1) // quantptr[DCTSIZE*3]
2161 lw s3, 96(a1) // quantptr[DCTSIZE*6]
2162 muleq_s.w.phl v1, t4, s2 // tmp2 ...
2163 muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
2164 lw s2, 80(a1) // quantptr[DCTSIZE*5]
2165 lw t8, 4(AT) // FIX(1.414213562); AT presumably holds the coefs table (a3), setup not visible here
2166 ins t2, v0, 16, 16 // ... tmp1
2167 muleq_s.w.phl v0, t6, s3 // tmp3 ...
2168 muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
2169 ins t4, v1, 16, 16 // ... tmp2
2170 addq.ph s4, t0, t4 // tmp10
2171 subq.ph s5, t0, t4 // tmp11
2172 ins t6, v0, 16, 16 // ... tmp3
2173 subq.ph s6, t2, t6 // tmp12 ...
2174 addq.ph s7, t2, t6 // tmp13
2175 mulq_s.ph s6, s6, t8 // ... tmp12 ...
2176 addq.ph t0, s4, s7 // tmp0
2177 subq.ph t6, s4, s7 // tmp3
2178 muleq_s.w.phl v0, t1, s0 // tmp4 ...
2179 muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
2180 shll_s.ph s6, s6, 1 // x2
2181 lw s3, 112(a1) // quantptr[DCTSIZE*7]
2182 subq.ph s6, s6, s7 // ... tmp12
2183 muleq_s.w.phl v1, t7, s3 // tmp7 ...
2184 muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
2185 ins t1, v0, 16, 16 // ... tmp4
2186 addq.ph t2, s5, s6 // tmp1
2187 subq.ph t4, s5, s6 // tmp2
2188 muleq_s.w.phl v0, t5, s2 // tmp6 ...
2189 muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
2190 ins t7, v1, 16, 16 // ... tmp7
2191 addq.ph s5, t1, t7 // z11
2192 subq.ph s6, t1, t7 // z12
2193 muleq_s.w.phl v1, t3, s1 // tmp5 ...
2194 muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
2195 ins t5, v0, 16, 16 // ... tmp6
2196 ins t3, v1, 16, 16 // ... tmp5
2197 addq.ph s7, t5, t3 // z13
2198 subq.ph v0, t5, t3 // z10
2199 addq.ph t7, s5, s7 // tmp7
2200 subq.ph s5, s5, s7 // tmp11 ...
2201 addq.ph v1, v0, s6 // z5 ...
2202 mulq_s.ph s5, s5, t8 // ... tmp11
2203 lw t8, 8(AT) // FIX(1.847759065)
2204 lw s4, 0(AT) // FIX(1.082392200)
2207 mulq_s.ph v1, v1, t8 // ... z5
2208 shll_s.ph s5, s5, 1 // x2
2209 lw t8, 12(AT) // FIX(-2.613125930)
2210 sw s0, 0(a2) // wsptr[DCTSIZE*0]
2211 shll_s.ph v0, v0, 1 // x2 here (first of two 1-bit shifts; x4 in total)
2212 mulq_s.ph v0, v0, t8 // tmp12 ...
2213 mulq_s.ph s4, s6, s4 // tmp10 ...
2214 shll_s.ph v1, v1, 1 // x2
2217 sw s1, 112(a2) // wsptr[DCTSIZE*7]
2218 shll_s.ph s6, v0, 1 // x2 here (second 1-bit shift; completes the x4)
2219 shll_s.ph s4, s4, 1 // x2
2220 addq.ph s6, s6, v1 // ... tmp12
2221 subq.ph t5, s6, t7 // tmp6
2222 subq.ph s4, s4, v1 // ... tmp10
2223 subq.ph t3, s5, t5 // tmp5
2225 addq.ph t1, s4, t3 // tmp4
2227 sw s2, 16(a2) // wsptr[DCTSIZE*1]
2228 sw s3, 96(a2) // wsptr[DCTSIZE*6]
2231 sw v0, 32(a2) // wsptr[DCTSIZE*2]
2232 sw v1, 80(a2) // wsptr[DCTSIZE*5]
2235 sw v0, 64(a2) // wsptr[DCTSIZE*4]
2236 sw v1, 48(a2) // wsptr[DCTSIZE*3]
2242 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2247 END(jsimd_idct_ifast_cols_dspr2)
2250 /*****************************************************************************/
2251 LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) // ifast IDCT row pass: reads wsptr (a0) two rows at a time, packs results to bytes, stores via outptr (a3)
2256 * a3 = mips_idct_ifast_coefs
2258 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2260 addiu t9, a0, 128 // t9 = wsptr + 128 bytes = end address
2265 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
2266 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
2267 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
2268 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
2269 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
2270 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
2271 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
2272 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
2273 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
2274 precrq.ph.w t1, s0, t0 // B b
2275 ins t0, s0, 16, 16 // A a
2283 shll_s.ph s0, t0, 2 // A a
2286 precrq.ph.w t0, s0, s0 // A A
2287 ins s0, s0, 16, 16 // a a
2290 precrq.qb.ph t0, t0, t0 // A A A A
2291 precrq.qb.ph s0, s0, s0 // a a a a
2305 precrq.ph.w t3, s2, t2 // D d (upper halfwords of s2 and t2)
2307 precrq.ph.w t5, s4, t4 // F f (upper halfwords of s4 and t4)
2309 precrq.ph.w t7, s6, t6 // H h (upper halfwords of s6 and t6)
2311 lw t8, 4(AT) // FIX(1.414213562)
2312 addq.ph s4, t0, t4 // tmp10
2313 subq.ph s5, t0, t4 // tmp11
2314 subq.ph s6, t2, t6 // tmp12 ...
2315 addq.ph s7, t2, t6 // tmp13
2316 mulq_s.ph s6, s6, t8 // ... tmp12 ...
2317 addq.ph t0, s4, s7 // tmp0
2318 subq.ph t6, s4, s7 // tmp3
2319 shll_s.ph s6, s6, 1 // x2
2320 subq.ph s6, s6, s7 // ... tmp12
2321 addq.ph t2, s5, s6 // tmp1
2322 subq.ph t4, s5, s6 // tmp2
2323 addq.ph s5, t1, t7 // z11
2324 subq.ph s6, t1, t7 // z12
2325 addq.ph s7, t5, t3 // z13
2326 subq.ph v0, t5, t3 // z10
2327 addq.ph t7, s5, s7 // tmp7
2328 subq.ph s5, s5, s7 // tmp11 ...
2329 addq.ph v1, v0, s6 // z5 ...
2330 mulq_s.ph s5, s5, t8 // ... tmp11
2331 lw t8, 8(AT) // FIX(1.847759065)
2332 lw s4, 0(AT) // FIX(1.082392200)
2333 addq.ph s0, t0, t7 // tmp0 + tmp7
2334 subq.ph s7, t0, t7 // tmp0 - tmp7
2335 mulq_s.ph v1, v1, t8 // ... z5
2337 lw t8, 12(AT) // FIX(-2.613125930)
2338 shll_s.ph s5, s5, 1 // x2
2340 shll_s.ph v0, v0, 1 // x2 here (first of two 1-bit shifts; x4 in total)
2341 mulq_s.ph v0, v0, t8 // tmp12 ...
2342 mulq_s.ph s4, s6, s4 // tmp10 ...
2343 shll_s.ph v1, v1, 1 // x2
2346 shll_s.ph s6, v0, 1 // x2 here (second 1-bit shift; completes the x4)
2347 shll_s.ph s4, s4, 1 // x2
2348 addq.ph s6, s6, v1 // ... tmp12
2350 subq.ph t5, s6, t7 // tmp6
2351 subq.ph s4, s4, v1 // ... tmp10
2352 subq.ph t3, s5, t5 // tmp5
2354 addq.ph t1, s4, t3 // tmp4
2355 addq.ph s1, t2, t5 // tmp1 + tmp6
2356 subq.ph s6, t2, t5 // tmp1 - tmp6
2357 addq.ph s2, t4, t3 // tmp2 + tmp5
2358 subq.ph s5, t4, t3 // tmp2 - tmp5
2359 addq.ph s4, t6, t1 // tmp3 + tmp4
2360 subq.ph s3, t6, t1 // tmp3 - tmp4
2367 precrq.ph.w t0, s1, s0 // B A
2368 ins s0, s1, 16, 16 // b a
2369 precrq.ph.w t2, s3, s2 // D C
2370 ins s2, s3, 16, 16 // d c
2371 precrq.ph.w t4, s5, s4 // F E
2372 ins s4, s5, 16, 16 // f e
2373 precrq.ph.w t6, s7, s6 // H G
2374 ins s6, s7, 16, 16 // h g
2375 precrq.qb.ph t0, t2, t0 // D C B A
2376 precrq.qb.ph s0, s2, s0 // d c b a
2377 precrq.qb.ph t4, t6, t4 // H G F E
2378 precrq.qb.ph s4, s6, s4 // h g f e
2381 sw s0, 0(a3) // outptr[0/1/2/3] d c b a
2382 sw s4, 4(a3) // outptr[4/5/6/7] h g f e
2387 sw t0, 0(a3) // outptr[0/1/2/3] D C B A
2389 sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2393 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2398 END(jsimd_idct_ifast_rows_dspr2)
2401 /*****************************************************************************/
2402 LEAF_DSPR2(jsimd_fdct_islow_dspr2) // forward DCT, islow variant: a packed-halfword stage (descale by 11) followed by a scalar stage (descale by 15)
2406 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2431 lw s0, 0(a1) // tmp0 = 1|0
2432 lw s1, 4(a1) // tmp1 = 3|2
2433 lw s2, 8(a1) // tmp2 = 5|4
2434 lw s3, 12(a1) // tmp3 = 7|6
2435 packrl.ph s1, s1, s1 // tmp1 = 2|3
2436 packrl.ph s3, s3, s3 // tmp3 = 6|7
2437 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
2438 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
2439 mult $0, $0 // ac0 = 0
2440 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
2441 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
2442 mult $ac1, $0, $0 // ac1 = 0
2443 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
2444 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
2445 mult $ac2, $0, $0 // ac2 = 0
2446 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
2447 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
2448 mult $ac3, $0, $0 // ac3 = 0
2449 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
2450 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
2451 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
2452 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
2453 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2454 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2455 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
2456 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
2457 addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
2458 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
2463 mult $0, $0 // ac0 = 0
2464 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
2465 mult $ac1, $0, $0 // ac1 = 0
2466 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
2467 sra s4, s5, 16 // tmp4 = t11
2470 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2471 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2472 addu s2, s5, s4 // tmp2 = t10 + t11
2473 subu s3, s5, s4 // tmp3 = t10 - t11
2474 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
2475 sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
2502 lh s7, 112(a0) // element 56 (byte offset 112 = 56 * 2)
2503 addu s2, v0, s5 // tmp2 = 16 + 40
2504 subu s5, v0, s5 // tmp5 = 16 - 40
2505 addu s3, v1, s4 // tmp3 = 24 + 32
2506 subu s4, v1, s4 // tmp4 = 24 - 32
2507 addu s0, a2, s7 // tmp0 = 0 + 56
2508 subu s7, a2, s7 // tmp7 = 0 - 56
2509 addu s1, a3, s6 // tmp1 = 8 + 48
2510 subu s6, a3, s6 // tmp6 = 8 - 48
2511 addu a2, s0, s3 // tmp10 = tmp0 + tmp3
2512 subu v1, s0, s3 // tmp13 = tmp0 - tmp3
2513 addu a3, s1, s2 // tmp11 = tmp1 + tmp2
2514 subu v0, s1, s2 // tmp12 = tmp1 - tmp2
2515 mult s7, t1 // ac0 = tmp7 * c1
2516 madd s4, t0 // ac0 += tmp4 * c0
2517 madd s5, t4 // ac0 += tmp5 * c4
2518 madd s6, t2 // ac0 += tmp6 * c2
2519 mult $ac1, s7, t2 // ac1 = tmp7 * c2
2520 msub $ac1, s4, t3 // ac1 -= tmp4 * c3
2521 msub $ac1, s5, t6 // ac1 -= tmp5 * c6
2522 msub $ac1, s6, t7 // ac1 -= tmp6 * c7
2523 mult $ac2, s7, t4 // ac2 = tmp7 * c4
2524 madd $ac2, s4, t2 // ac2 += tmp4 * c2
2525 madd $ac2, s5, t5 // ac2 += tmp5 * c5
2526 msub $ac2, s6, t6 // ac2 -= tmp6 * c6
2527 mult $ac3, s7, t0 // ac3 = tmp7 * c0
2528 msub $ac3, s4, t1 // ac3 -= tmp4 * c1
2529 madd $ac3, s5, t2 // ac3 += tmp5 * c2
2530 msub $ac3, s6, t3 // ac3 -= tmp6 * c3
2531 extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
2532 extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
2533 extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
2534 extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
2536 addu s4, a2, a3 // tmp4 = tmp10 + tmp11
2537 subu s5, a2, a3 // tmp5 = tmp10 - tmp11
2542 mult v0, t8 // ac0 = tmp12 * c8
2543 madd v1, t9 // ac0 += tmp13 * c9
2544 mult $ac1, v1, t8 // ac1 = tmp13 * c8
2545 msub $ac1, v0, a1 // ac1 -= tmp12 * c10
2547 extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
2548 extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
2549 shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
2550 shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
2557 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2562 END(jsimd_fdct_islow_dspr2)
2565 /**************************************************************************/
2566 LEAF_DSPR2(jsimd_fdct_ifast_dspr2) // forward DCT, ifast variant: constants are scaled by 2^8 and products are descaled with extr.w/sra by 8
2572 SAVE_REGS_ON_STACK 8, s0, s1
2574 li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2575 li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2576 li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2577 li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2580 addiu v1, v0, 128 // v1 = v0 + 128 bytes = end address
2583 lw t0, 0(v0) // tmp0 = 1|0
2584 lw t1, 4(v0) // tmp1 = 3|2
2585 lw t2, 8(v0) // tmp2 = 5|4
2586 lw t3, 12(v0) // tmp3 = 7|6
2587 packrl.ph t1, t1, t1 // tmp1 = 2|3
2588 packrl.ph t3, t3, t3 // tmp3 = 6|7
2589 subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
2590 subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
2591 addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
2592 addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
2593 addq.ph t8, t4, t6 // tmp5 = t1+t2|t0+t3 = t11|t10
2594 subq.ph t9, t4, t6 // tmp7 = t1-t2|t0-t3 = t12|t13
2595 sra t4, t8, 16 // tmp4 = t11
2596 mult $0, $0 // ac0 = 0
2597 dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
2598 mult $ac1, $0, $0 // ac1 = 0
2599 dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
2600 dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98 (dpsx subtracts the cross dot product)
2601 mult $ac2, $0, $0 // ac2 = 0
2602 dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
2603 mult $ac3, $0, $0 // ac3 = 0
2604 dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
2605 precrq.ph.w t0, t5, t7 // t0 = t6|t5 (upper halfwords of registers t5 and t7)
2606 addq.ph t2, t8, t4 // tmp2 = t10 + t11
2607 subq.ph t3, t8, t4 // tmp3 = t10 - t11
2609 mult $0, $0 // ac0 = 0
2610 dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
2611 extr.w t0, $ac1, 8 // t0 = z5
2612 extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
2613 extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
2614 extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
2615 add t6, t1, t0 // t6 = z2
2616 add t7, t7, t0 // t7 = z4
2617 subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
2618 addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
2619 addq.ph t1, t0, t6 // t1 = z13 + z2
2620 subq.ph t6, t0, t6 // t6 = z13 - z2
2621 addq.ph t0, t8, t7 // t0 = z11 + z4
2622 subq.ph t7, t8, t7 // t7 = z11 - z4
2647 lh t7, 112(v0) // element 56 (byte offset 112 = 56 * 2)
2648 add t8, t0, t7 // t8 = tmp0
2649 sub t7, t0, t7 // t7 = tmp7
2650 add t0, t1, t6 // t0 = tmp1
2651 sub t1, t1, t6 // t1 = tmp6
2652 add t6, t2, t5 // t6 = tmp2
2653 sub t5, t2, t5 // t5 = tmp5
2654 add t2, t3, t4 // t2 = tmp3
2655 sub t3, t3, t4 // t3 = tmp4
2656 add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
2657 sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
2658 sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
2659 ins t8, s0, 16, 16 // t8 = tmp12|tmp13
2660 add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
2661 mult $0, $0 // ac0 = 0
2662 dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
2663 add s0, t4, t2 // s0 = tmp10+tmp11
2664 sub t4, t4, t2 // t4 = tmp10-tmp11
2667 extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
2668 addq.ph t4, t8, t2 // t4 = tmp13 + z1
2669 subq.ph t8, t8, t2 // t8 = tmp13 - z1
2672 add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
2673 add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
2674 add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
2677 sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2678 ins t1, t3, 16, 16 // t1 = tmp10|tmp12
2679 mult $0, $0 // ac0 = 0
2680 mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
2681 extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
2682 add t2, t7, t8 // t2 = tmp7 + z5
2683 sub t7, t7, t8 // t7 = tmp7 - z5
2686 sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2689 sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2690 add t0, t6, t8 // t0 = z3 + z2
2691 sub t1, t6, t8 // t1 = z3 - z2
2692 add t3, t6, s0 // t3 = z3 + z4
2693 sub t4, t6, s0 // t4 = z3 - z4
2694 sub t5, t2, t1 // t5 = dataptr[5]
2695 sub t6, t7, t0 // t6 = dataptr[3]
2696 add t3, t2, t3 // t3 = dataptr[1]
2697 add t4, t7, t4 // t4 = dataptr[7]
2706 RESTORE_REGS_FROM_STACK 8, s0, s1
2710 END(jsimd_fdct_ifast_dspr2)
2713 /*****************************************************************************/
2714 LEAF_DSPR2(jsimd_quantize_dspr2) // quantization routine; only the prologue, end pointer and epilogue are visible in this excerpt
2722 SAVE_REGS_ON_STACK 16, s0, s1, s2
2724 addiu v0, a2, 124 // v0 = workspace_end (a2 + 124 bytes)
2805 RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2810 END(jsimd_quantize_dspr2)
2813 #ifndef __mips_soft_float
2815 /*****************************************************************************/
2816 LEAF_DSPR2(jsimd_quantize_float_dspr2) // floating-point quantization; processes four values per step in f2/f4/f6/f8
2824 li t1, 0x46800100 // IEEE-754 single-precision bit pattern of 16384.5 (0x46800000 = 16384.0)
2836 madd.s f2, f0, f2, f10 // f2 = f2 * f10 + f0 (f0 presumably holds 16384.5 - TODO confirm)
2837 madd.s f4, f0, f4, f12 // f4 = f4 * f12 + f0
2838 madd.s f6, f0, f6, f14 // f6 = f6 * f14 + f0
2839 madd.s f8, f0, f8, f16 // f8 = f8 * f16 + f0
2856 madd.s f2, f0, f2, f10 // f2 = f2 * f10 + f0
2857 madd.s f4, f0, f4, f12 // f4 = f4 * f12 + f0
2858 madd.s f6, f0, f6, f14 // f6 = f6 * f14 + f0
2859 madd.s f8, f0, f8, f16 // f8 = f8 * f16 + f0
2860 addiu t1, t1, -16384 // subtract 16384 (presumably removes the integer part of the 16384.5 bias)
2861 addiu t2, t2, -16384 // same adjustment for t2
2862 addiu t3, t3, -16384 // same adjustment for t3
2863 addiu t4, t4, -16384 // same adjustment for t4
2879 addiu t1, t1, -16384 // subtract 16384 again for the next group of four values
2880 addiu t2, t2, -16384 // same adjustment for t2
2881 addiu t3, t3, -16384 // same adjustment for t3
2882 addiu t4, t4, -16384 // same adjustment for t4
2893 END(jsimd_quantize_float_dspr2)
2898 /*****************************************************************************/
2899 LEAF_DSPR2(jsimd_idct_2x2_dspr2) // reduced-size 2x2 IDCT: dequantizes inptr (a1) with quantptr (a0) and accumulates odd terms in $ac0
2901 * a0 = compptr->dct_table
2908 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2912 addiu s2, zero, 29692 // presumably FIX(3.624509785) (29692 / 8192 = 3.6245)
2913 addiu s3, zero, -10426 // presumably -FIX(1.272758580) (10426 / 8192 = 1.2727)
2914 addiu s4, zero, 6967 // presumably FIX(0.850430095) (6967 / 8192 = 0.8505)
2915 addiu s5, zero, -5906 // presumably -FIX(0.720959822) (5906 / 8192 = 0.7210)
2916 lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2917 lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2918 lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2919 lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2921 lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2922 lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2925 lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2926 lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2927 lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2928 lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2932 li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2933 li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2934 ins t6, t5, 16, 16 // t6 = t5|t6
2936 dpa.w.ph $ac0, t6, s0 // ac0 += t6.hi * 29692 + t6.lo * (-10426)
2939 ins t8, t7, 16, 16 // t8 = t7|t8
2940 dpa.w.ph $ac0, t8, s1 // ac0 += t8.hi * 6967 + t8.lo * (-5906)
2964 dpa.w.ph $ac0, t7, s0 // ac0 += t7.hi * 29692 + t7.lo * (-10426)
2968 dpa.w.ph $ac0, t3, s1 // ac0 += t3.hi * 6967 + t3.lo * (-5906)
2992 dpa.w.ph $ac0, t7, s0 // same dot product for the next column
2996 dpa.w.ph $ac0, t3, s1
3020 dpa.w.ph $ac0, t7, s0
3024 dpa.w.ph $ac0, t3, s1
3048 dpa.w.ph $ac0, t7, s0
3050 dpa.w.ph $ac0, t3, s1
3110 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3115 END(jsimd_idct_2x2_dspr2)
3118 /*****************************************************************************/
3119 LEAF_DSPR2(jsimd_idct_4x4_dspr2) // reduced-size 4x4 IDCT: column stage descales by 12 into the workspace, row stage descales by 19 and writes output_buf rows
3121 * a0 = compptr->dct_table
3125 * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes
3129 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3141 lh s6, 32(t0) // inptr[DCTSIZE*2]
3142 lh t6, 32(a0) // quantptr[DCTSIZE*2]
3143 lh s7, 96(t0) // inptr[DCTSIZE*6]
3144 lh t7, 96(a0) // quantptr[DCTSIZE*6]
3145 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3146 lh s4, 0(t0) // inptr[DCTSIZE*0]
3147 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3148 lh s5, 0(a0) // quantptr[0]
3151 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3152 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3153 lh t5, 112(t0) // inptr[DCTSIZE*7]
3154 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3155 lh s4, 112(a0) // quantptr[DCTSIZE*7]
3156 lh v0, 80(t0) // inptr[DCTSIZE*5]
3157 lh s5, 80(a0) // quantptr[DCTSIZE*5]
3158 lh s6, 48(a0) // quantptr[DCTSIZE*3]
3159 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3160 lh s7, 16(a0) // quantptr[DCTSIZE*1]
3161 lh t8, 16(t0) // inptr[DCTSIZE*1]
3162 subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3163 lh t7, 48(t0) // inptr[DCTSIZE*3]
3164 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3165 mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3166 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3167 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3168 addu t3, t2, t6 // tmp10 = tmp0 + z2
3169 subu t4, t2, t6 // tmp12 = tmp0 - z2
3170 mult $ac0, zero, zero
3171 mult $ac1, zero, zero
3175 dpa.w.ph $ac0, t5, s0
3176 dpa.w.ph $ac0, t7, s1
3177 dpa.w.ph $ac1, t5, s2
3178 dpa.w.ph $ac1, t7, s3
3188 shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
3189 shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
3190 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3191 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3197 // second loop three pass
3200 lh s6, 34(t0) // inptr[DCTSIZE*2]
3201 lh t6, 34(a0) // quantptr[DCTSIZE*2]
3202 lh s7, 98(t0) // inptr[DCTSIZE*6]
3203 lh t7, 98(a0) // quantptr[DCTSIZE*6]
3204 mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3205 lh s4, 2(t0) // inptr[DCTSIZE*0]
3206 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3207 lh s5, 2(a0) // quantptr[DCTSIZE*0]
3210 mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3211 mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3212 lh t5, 114(t0) // inptr[DCTSIZE*7]
3213 mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3214 lh s4, 114(a0) // quantptr[DCTSIZE*7]
3215 lh s5, 82(a0) // quantptr[DCTSIZE*5]
3216 lh t6, 82(t0) // inptr[DCTSIZE*5]
3217 sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3218 lh s6, 50(a0) // quantptr[DCTSIZE*3]
3219 lh t8, 18(t0) // inptr[DCTSIZE*1]
3220 subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3221 lh t7, 50(t0) // inptr[DCTSIZE*3]
3222 lh s7, 18(a0) // quantptr[DCTSIZE*1]
3223 mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3224 mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3225 mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3226 mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3227 addu t3, t2, v0 // tmp10 = tmp0 + z2
3228 subu t4, t2, v0 // tmp12 = tmp0 - z2
3229 mult $ac0, zero, zero
3230 mult $ac1, zero, zero
3233 dpa.w.ph $ac0, t5, s0
3234 dpa.w.ph $ac0, t7, s1
3235 dpa.w.ph $ac1, t5, s2
3236 dpa.w.ph $ac1, t7, s3
3247 shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3248 shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3249 shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3250 shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3258 lw s6, 8(t1) // wsptr[2]
3260 lw s7, 24(t1) // wsptr[6]
3261 mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3262 lw t2, 0(t1) // wsptr[0]
3263 mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3264 lh t5, 28(t1) // wsptr[7]
3265 lh t6, 20(t1) // wsptr[5]
3266 lh t7, 12(t1) // wsptr[3]
3267 lh t8, 4(t1) // wsptr[1]
3270 mult $ac0, zero, zero
3271 dpa.w.ph $ac0, t5, s0
3272 dpa.w.ph $ac0, t7, s1
3273 mult $ac1, zero, zero
3274 dpa.w.ph $ac1, t5, s2
3275 dpa.w.ph $ac1, t7, s3
3276 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3278 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3280 addu t3, t2, s4 // tmp10 = tmp0 + z2
3282 subu t4, t2, s4 // tmp12 = tmp0 - z2
3287 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3288 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3289 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3290 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3292 lw v0, 0(a2) // output_buf[ctr]
3301 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3312 lw s6, 40(t1) // wsptr[2]
3314 lw s7, 56(t1) // wsptr[6]
3315 mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3316 lw t2, 32(t1) // wsptr[0]
3317 mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3318 lh t5, 60(t1) // wsptr[7]
3319 lh t6, 52(t1) // wsptr[5]
3320 lh t7, 44(t1) // wsptr[3]
3321 lh t8, 36(t1) // wsptr[1]
3324 mult $ac0, zero, zero
3325 dpa.w.ph $ac0, t5, s0
3326 dpa.w.ph $ac0, t7, s1
3327 mult $ac1, zero, zero
3328 dpa.w.ph $ac1, t5, s2
3329 dpa.w.ph $ac1, t7, s3
3330 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3332 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3334 addu t3, t2, s4 // tmp10 = tmp0 + z2
3336 subu t4, t2, s4 // tmp12 = tmp0 - z2
3341 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3342 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3343 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3344 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3346 lw v0, 4(a2) // output_buf[ctr]
3355 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3366 lw s6, 72(t1) // wsptr[2]
3368 lw s7, 88(t1) // wsptr[6]
3369 mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3370 lw t2, 64(t1) // wsptr[0]
3371 mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3372 lh t5, 92(t1) // wsptr[7]
3373 lh t6, 84(t1) // wsptr[5]
3374 lh t7, 76(t1) // wsptr[3]
3375 lh t8, 68(t1) // wsptr[1]
3378 mult $ac0, zero, zero
3379 dpa.w.ph $ac0, t5, s0
3380 dpa.w.ph $ac0, t7, s1
3381 mult $ac1, zero, zero
3382 dpa.w.ph $ac1, t5, s2
3383 dpa.w.ph $ac1, t7, s3
3384 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3386 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3388 addu t3, t2, s4 // tmp10 = tmp0 + z2
3390 subu t4, t2, s4 // tmp12 = tmp0 - z2
3395 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3396 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3397 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3398 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3400 lw v0, 8(a2) // output_buf[ctr]
3409 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3419 lw s6, 104(t1) // wsptr[2]
3421 lw s7, 120(t1) // wsptr[6]
3422 mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3423 lw t2, 96(t1) // wsptr[0]
3424 mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3425 lh t5, 124(t1) // wsptr[7]
3426 lh t6, 116(t1) // wsptr[5]
3427 lh t7, 108(t1) // wsptr[3]
3428 lh t8, 100(t1) // wsptr[1]
3431 mult $ac0, zero, zero
3432 dpa.w.ph $ac0, t5, s0
3433 dpa.w.ph $ac0, t7, s1
3434 mult $ac1, zero, zero
3435 dpa.w.ph $ac1, t5, s2
3436 dpa.w.ph $ac1, t7, s3
3437 sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3439 // MULTIPLY(wsptr[2], FIX_1_847759065) + MULTIPLY(wsptr[6], -FIX_0_765366865)
3441 addu t3, t2, s4 // tmp10 = tmp0 + z2;
3443 subu t4, t2, s4 // tmp12 = tmp0 - z2;
3448 shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3449 shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3450 shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3451 shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3453 lw v0, 12(a2) // output_buf[ctr]
3462 addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3472 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3476 END(jsimd_idct_4x4_dspr2)
3479 /*****************************************************************************/
3480 LEAF_DSPR2(jsimd_idct_6x6_dspr2) // reduced-size 6x6 IDCT: column stage shown here dequantizes inptr (a1) with quantptr (a0) and descales by 11
3482 * a0 = compptr->dct_table
3489 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3494 addiu t9, zero, 5793 // presumably FIX(0.707106781) (5793 / 8192 = 0.7071)
3495 addiu s0, zero, 10033 // presumably FIX(1.224744871) (10033 / 8192 = 1.2247)
3496 addiu s1, zero, 2998 // presumably FIX(0.366025404) (2998 / 8192 = 0.3660)
3499 lh s2, 0(a0) // q0 = quantptr[ 0]
3500 lh s3, 32(a0) // q1 = quantptr[16]
3501 lh s4, 64(a0) // q2 = quantptr[32]
3502 lh t2, 64(a1) // tmp2 = inptr[32]
3503 lh t1, 32(a1) // tmp1 = inptr[16]
3504 lh t0, 0(a1) // tmp0 = inptr[ 0]
3505 mul t2, t2, s4 // tmp2 = tmp2 * q2
3506 mul t1, t1, s3 // tmp1 = tmp1 * q1
3507 mul t0, t0, s2 // tmp0 = tmp0 * q0
3508 lh t6, 16(a1) // z1 = inptr[ 8]
3509 lh t8, 80(a1) // z3 = inptr[40]
3510 lh t7, 48(a1) // z2 = inptr[24]
3511 lh s2, 16(a0) // q0 = quantptr[ 8]
3512 lh s4, 80(a0) // q2 = quantptr[40]
3513 lh s3, 48(a0) // q1 = quantptr[24]
3514 mul t2, t2, t9 // tmp2 = tmp2 * 5793
3515 mul t1, t1, s0 // tmp1 = tmp1 * 10033
3516 sll t0, t0, 13 // tmp0 = tmp0 << 13
3517 mul t6, t6, s2 // z1 = z1 * q0
3518 mul t8, t8, s4 // z3 = z3 * q2
3519 mul t7, t7, s3 // z2 = z2 * q1
3520 addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3521 sll t2, t2, 1 // tmp2 = tmp2 << 1 (doubles tmp2 for the subtraction below)
3522 subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3523 subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3524 addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3525 addu t1, t6, t8 // tmp1 = z1 + z3
3526 mul t1, t1, s1 // tmp1 = tmp1 * 2998
3527 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3528 subu t2, t6, t8 // tmp2 = z1 - z3
3529 subu t2, t2, t7 // tmp2 = tmp2 - z2
3530 sll t2, t2, 2 // tmp2 = tmp2 << 2
3531 addu t0, t6, t7 // tmp0 = z1 + z2
3532 sll t0, t0, 13 // tmp0 = tmp0 << 13
3533 subu s2, t8, t7 // q0 = z3 - z2
3534 sll s2, s2, 13 // q0 = q0 << 13
3535 addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3536 addu t1, s2, t1 // tmp1 = q0 + tmp1
3537 addu s2, t4, t2 // q0 = tmp11 + tmp2
3538 subu s3, t4, t2 // q1 = tmp11 - tmp2
3539 addu t6, t3, t0 // z1 = tmp10 + tmp0
3540 subu t7, t3, t0 // z2 = tmp10 - tmp0
3541 addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3542 subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3543 shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3544 shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3545 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3546 shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3558 /* Pass 2: process 6 rows from work array, store into output array. */
3627 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3632 END(jsimd_idct_6x6_dspr2)
3635 /*****************************************************************************/
3636 LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) // first pass of the 12x12 IDCT: odd part (z1..z4) followed by even part on dequantized inputs
3638 * a0 = compptr->dct_table
3642 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3656 mul t0, t0, t1 // z2
3657 mul t1, t2, t3 // z1
3658 mul t2, t4, t5 // z3
3659 mul t3, t6, t7 // z4
3660 li t4, 10703 // FIX(1.306562965)
3661 li t5, 4433 // FIX_0_541196100
3662 li t6, 7053 // FIX(0.860918669)
3663 mul t4, t0, t4 // tmp11
3664 mul t5, t0, t5 // -tmp14
3665 addu t7, t1, t2 // tmp10
3666 addu t8, t7, t3 // tmp10 + z4
3667 mul t6, t6, t8 // tmp15
3668 li t8, 2139 // FIX(0.261052384)
3669 mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3670 li t7, 2295 // FIX(0.280143716)
3671 mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3672 addu t9, t2, t3 // z3 + z4
3673 li s0, 8565 // FIX(1.045510580)
3674 mul t9, t9, s0 // -tmp13
3675 li s0, 12112 // FIX(1.478575242)
3676 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3677 li s1, 12998 // FIX(1.586706681)
3678 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3679 li s2, 5540 // FIX(0.676326758)
3680 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3681 li s3, 16244 // FIX(1.982889723)
3682 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3683 subu t1, t1, t3 // z1-=z4
3684 subu t0, t0, t2 // z2-=z3
3685 addu t2, t0, t1 // z1+z2
3686 li t3, 4433 // FIX_0_541196100
3687 mul t2, t2, t3 // z3
3688 li t3, 6270 // FIX_0_765366865
3689 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3690 li t3, 15137 // FIX_1_847759065
3691 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3692 addu t8, t6, t8 // tmp12
3693 addu t3, t8, t4 // tmp12 + tmp11
3694 addu t3, t3, t7 // tmp10
3695 subu t8, t8, t9 // tmp12 + tmp13 (t9 holds -tmp13, hence the subtract)
3697 subu t8, t8, s0 // tmp12
3700 addu t9, t9, s1 // tmp13
3703 subu t6, t6, s3 // tmp15
3713 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
3714 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
3715 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
3716 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
3718 addu t1, t2, t1 // tmp11
3719 subu t0, t2, t0 // tmp14
3720 // update counter and pointers
3727 mul t4, t4, s1 // z4
3728 mul s1, t5, s2 // z4
3729 sll t5, t5, 13 // z1
3731 addiu t7, t7, 1024 // z3
3732 sll s0, s0, 13 // z2
3733 addu s2, t7, t4 // tmp10
3734 subu t4, t7, t4 // tmp11
3735 subu s3, t5, s0 // tmp12
3736 addu t2, t7, s3 // tmp21
3737 subu s3, t7, s3 // tmp24
3738 addu t7, s1, s0 // tmp12
3739 addu v0, s2, t7 // tmp20
3740 subu s2, s2, t7 // tmp25
3741 subu s1, s1, t5 // z4 - z1
3742 subu s1, s1, s0 // tmp12
3743 addu s0, t4, s1 // tmp22
3744 subu t4, t4, s1 // tmp23
3745 // final output stage
3785 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3790 END(jsimd_idct_12x12_pass1_dspr2)
3793 /*****************************************************************************/
3794 LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) // second pass of the 12x12 IDCT: same odd/even butterfly structure as pass 1, with 13-bit scaled constants
3799 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3809 li t4, 10703 // FIX(1.306562965)
3810 li t5, 4433 // FIX_0_541196100
3811 mul t4, t0, t4 // tmp11
3812 mul t5, t0, t5 // -tmp14
3813 addu t6, t1, t2 // tmp10
3814 li t7, 2139 // FIX(0.261052384)
3815 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
3816 addu t6, t6, t3 // tmp10 + z4
3817 li t8, 7053 // FIX(0.860918669)
3818 mul t6, t6, t8 // tmp15
3819 li t8, 2295 // FIX(0.280143716)
3820 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
3821 addu t9, t2, t3 // z3 + z4
3822 li s0, 8565 // FIX(1.045510580)
3823 mul t9, t9, s0 // -tmp13
3824 li s0, 12112 // FIX(1.478575242)
3825 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3826 li s1, 12998 // FIX(1.586706681)
3827 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3828 li s2, 5540 // FIX(0.676326758)
3829 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3830 li s3, 16244 // FIX(1.982889723)
3831 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3832 subu t1, t1, t3 // z1 -= z4
3833 subu t0, t0, t2 // z2 -= z3
3834 addu t2, t1, t0 // z1 + z2
3835 li t3, 4433 // FIX_0_541196100
3836 mul t2, t2, t3 // z3
3837 li t3, 6270 // FIX_0_765366865
3838 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3839 li t3, 15137 // FIX_1_847759065
3840 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3841 addu t3, t6, t7 // tmp12
3843 addu t7, t7, t8 // tmp10
3846 subu t3, t3, s0 // tmp12
3849 addu t9, t9, s1 // tmp13
3852 subu t6, t6, s3 // tmp15
3853 addu t1, t2, t1 // tmp11
3854 subu t0, t2, t0 // tmp14
3860 li s0, 10033 // FIX(1.224744871)
3861 li s1, 11190 // FIX(1.366025404)
3862 mul t2, t2, s0 // z4
3863 mul s0, t4, s1 // z4
3865 sll t5, t5, 13 // z3
3866 sll t4, t4, 13 // z1
3867 sll t8, t8, 13 // z2
3868 subu s1, t4, t8 // tmp12
3869 addu s2, t5, t2 // tmp10
3870 subu t2, t5, t2 // tmp11
3871 addu s3, t5, s1 // tmp21
3872 subu s1, t5, s1 // tmp24
3873 addu t5, s0, t8 // tmp12
3874 addu v0, s2, t5 // tmp20
3875 subu t5, s2, t5 // tmp25
3877 subu t4, t4, t8 // tmp12
3878 addu t8, t2, t4 // tmp22
3879 subu t2, t2, t4 // tmp23
3880 // increment counter and pointers
3960 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3965 END(jsimd_idct_12x12_pass2_dspr2)
3968 /*****************************************************************************/
3969 LEAF_DSPR2(jsimd_convsamp_dspr2) // sample conversion: widens packed unsigned bytes in t1/t2 to halfword pairs, repeated for eight groups
3980 preceu.ph.qbr t3, t1 // t3 = two right (low) bytes of t1, zero-extended to halfwords
3981 preceu.ph.qbl t4, t1 // t4 = two left (high) bytes of t1, zero-extended to halfwords
3983 preceu.ph.qbr t5, t2 // t5 = two right (low) bytes of t2, zero-extended to halfwords
3984 preceu.ph.qbl t6, t2 // t6 = two left (high) bytes of t2, zero-extended to halfwords
3994 preceu.ph.qbr t3, t1 // same widening for the next group (and for each group below)
3995 preceu.ph.qbl t4, t1
4000 preceu.ph.qbr t5, t2
4001 preceu.ph.qbl t6, t2
4011 preceu.ph.qbr t3, t1
4012 preceu.ph.qbl t4, t1
4017 preceu.ph.qbr t5, t2
4018 preceu.ph.qbl t6, t2
4028 preceu.ph.qbr t3, t1
4029 preceu.ph.qbl t4, t1
4034 preceu.ph.qbr t5, t2
4035 preceu.ph.qbl t6, t2
4045 preceu.ph.qbr t3, t1
4046 preceu.ph.qbl t4, t1
4051 preceu.ph.qbr t5, t2
4052 preceu.ph.qbl t6, t2
4062 preceu.ph.qbr t3, t1
4063 preceu.ph.qbl t4, t1
4068 preceu.ph.qbr t5, t2
4069 preceu.ph.qbl t6, t2
4079 preceu.ph.qbr t3, t1
4080 preceu.ph.qbl t4, t1
4085 preceu.ph.qbr t5, t2
4086 preceu.ph.qbl t6, t2
4096 preceu.ph.qbr t3, t1
4097 preceu.ph.qbl t4, t1
4100 preceu.ph.qbr t5, t2
4101 preceu.ph.qbl t6, t2
4114 END(jsimd_convsamp_dspr2)
4117 #ifndef __mips_soft_float
4119 /*****************************************************************************/
4120 LEAF_DSPR2(jsimd_convsamp_float_dspr2) // float sample conversion; body not visible here, presumably built only when __mips_soft_float is undefined
4475 END(jsimd_convsamp_float_dspr2)
4479 /*****************************************************************************/