- add sources.
[platform/framework/web/crosswalk.git] / src / skia / ext / convolver_mips_dspr2.cc
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <algorithm>
6 #include "skia/ext/convolver.h"
7 #include "skia/ext/convolver_mips_dspr2.h"
8 #include "third_party/skia/include/core/SkTypes.h"
9
10 namespace skia {
11 // Convolves horizontally along a single row. The row data is given in
12 // |src_data| and continues for the num_values() of the filter.
13 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,
14                                      const ConvolutionFilter1D& filter,
15                                      unsigned char* out_row,
16                                      bool has_alpha) {
17 #if SIMD_MIPS_DSPR2
18   int row_to_filter = 0;
19   int num_values = filter.num_values();
20   if (has_alpha) {
21     for (int out_x = 0; out_x < num_values; out_x++) {
22       // Get the filter that determines the current output pixel.
23       int filter_offset, filter_length;
24       const ConvolutionFilter1D::Fixed* filter_values =
25         filter.FilterForValue(out_x, &filter_offset, &filter_length);
26       int filter_x = 0;
27
28       __asm__ __volatile__ (
29         ".set push                                  \n"
30         ".set noreorder                             \n"
31
32         "beqz            %[filter_len], 3f          \n"
33         " sll            $t0, %[filter_offset], 2   \n"
34         "addu            %[rtf], %[src_data], $t0   \n"
35         "mtlo            $0, $ac0                   \n"
36         "mtlo            $0, $ac1                   \n"
37         "mtlo            $0, $ac2                   \n"
38         "mtlo            $0, $ac3                   \n"
39         "srl             $t7, %[filter_len], 2      \n"
40         "beqz            $t7, 2f                    \n"
41         " li             %[fx], 0                   \n"
42
43         "11:                                        \n"
44         "addu            $t4, %[filter_val], %[fx]  \n"
45         "sll             $t5, %[fx], 1              \n"
46         "ulw             $t6, 0($t4)                \n" // t6 = |cur[1]|cur[0]|
47         "ulw             $t8, 4($t4)                \n" // t8 = |cur[3]|cur[2]|
48         "addu            $t0, %[rtf], $t5           \n"
49         "lw              $t1, 0($t0)                \n" // t1 = |a0|b0|g0|r0|
50         "lw              $t2, 4($t0)                \n" // t2 = |a1|b1|g1|r1|
51         "lw              $t3, 8($t0)                \n" // t3 = |a2|b2|g2|r2|
52         "lw              $t4, 12($t0)               \n" // t4 = |a3|b3|g3|r3|
53         "precrq.qb.ph    $t0, $t2, $t1              \n" // t0 = |a1|g1|a0|g0|
54         "precr.qb.ph     $t5, $t2, $t1              \n" // t5 = |b1|r1|b0|r0|
55         "preceu.ph.qbla  $t1, $t0                   \n" // t1 = |0|a1|0|a0|
56         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g1|0|g0|
57         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b1|0|b0|
58         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r1|0|r0|
59         "dpa.w.ph        $ac0, $t1, $t6             \n" // ac0+(cur*a1)+(cur*a0)
60         "dpa.w.ph        $ac1, $t0, $t6             \n" // ac1+(cur*b1)+(cur*b0)
61         "dpa.w.ph        $ac2, $t2, $t6             \n" // ac2+(cur*g1)+(cur*g0)
62         "dpa.w.ph        $ac3, $t5, $t6             \n" // ac3+(cur*r1)+(cur*r0)
63         "precrq.qb.ph    $t0, $t4, $t3              \n" // t0 = |a3|g3|a2|g2|
64         "precr.qb.ph     $t5, $t4, $t3              \n" // t5 = |b3|r3|b2|r2|
65         "preceu.ph.qbla  $t1, $t0                   \n" // t1 = |0|a3|0|a2|
66         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g3|0|g2|
67         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b3|0|b2|
68         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r3|0|r2|
69         "dpa.w.ph        $ac0, $t1, $t8             \n" // ac0+(cur*a3)+(cur*a2)
70         "dpa.w.ph        $ac1, $t0, $t8             \n" // ac1+(cur*b3)+(cur*b2)
71         "dpa.w.ph        $ac2, $t2, $t8             \n" // ac2+(cur*g3)+(cur*g2)
72         "dpa.w.ph        $ac3, $t5, $t8             \n" // ac3+(cur*r3)+(cur*r2)
73         "addiu           $t7, $t7, -1               \n"
74         "bgtz            $t7, 11b                   \n"
75         " addiu          %[fx], %[fx], 8            \n"
76
77         "2:                                         \n"
78         "andi            $t7, %[filter_len], 0x3    \n" // residual
79         "beqz            $t7, 3f                    \n"
80         " nop                                       \n"
81
82         "21:                                        \n"
83         "sll             $t1, %[fx], 1              \n"
84         "addu            $t2, %[filter_val], %[fx]  \n"
85         "addu            $t0, %[rtf], $t1           \n"
86         "lh              $t6, 0($t2)                \n" // t6 = filter_val[fx]
87         "lbu             $t1, 0($t0)                \n" // t1 = row[fx * 4 + 0]
88         "lbu             $t2, 1($t0)                \n" // t2 = row[fx * 4 + 1]
89         "lbu             $t3, 2($t0)                \n" // t3 = row[fx * 4 + 2]
90         "lbu             $t4, 3($t0)                \n" // t4 = row[fx * 4 + 2]
91         "maddu           $ac3, $t6, $t1             \n"
92         "maddu           $ac2, $t6, $t2             \n"
93         "maddu           $ac1, $t6, $t3             \n"
94         "maddu           $ac0, $t6, $t4             \n"
95         "addiu           $t7, $t7, -1               \n"
96         "bgtz            $t7, 21b                   \n"
97         " addiu          %[fx], %[fx], 2            \n"
98
99         "3:                                         \n"
100         "extrv.w         $t0, $ac0, %[kShiftBits]   \n" // a >> kShiftBits
101         "extrv.w         $t1, $ac1, %[kShiftBits]   \n" // b >> kShiftBits
102         "extrv.w         $t2, $ac2, %[kShiftBits]   \n" // g >> kShiftBits
103         "extrv.w         $t3, $ac3, %[kShiftBits]   \n" // r >> kShiftBits
104         "sll             $t5, %[out_x], 2           \n"
105         "repl.ph         $t6, 128                   \n" // t6 = | 128 | 128 |
106         "addu            $t5, %[out_row], $t5       \n"
107         "append          $t2, $t3, 16               \n"
108         "append          $t0, $t1, 16               \n"
109         "subu.ph         $t1, $t0, $t6              \n"
110         "shll_s.ph       $t1, $t1, 8                \n"
111         "shra.ph         $t1, $t1, 8                \n"
112         "addu.ph         $t1, $t1, $t6              \n"
113         "subu.ph         $t3, $t2, $t6              \n"
114         "shll_s.ph       $t3, $t3, 8                \n"
115         "shra.ph         $t3, $t3, 8                \n"
116         "addu.ph         $t3, $t3, $t6              \n"
117         "precr.qb.ph     $t0, $t1, $t3              \n"
118         "usw             $t0, 0($t5)                \n"
119
120         ".set pop                                   \n"
121       : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
122         [rtf] "+r" (row_to_filter)
123       : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
124         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
125         [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
126       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
127         "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
128       );
129     }
130   } else {
131     for (int out_x = 0; out_x < num_values; out_x++) {
132       // Get the filter that determines the current output pixel.
133       int filter_offset, filter_length;
134       const ConvolutionFilter1D::Fixed* filter_values =
135         filter.FilterForValue(out_x, &filter_offset, &filter_length);
136       int filter_x = 0;
137       __asm__ __volatile__ (
138         ".set push                                  \n"
139         ".set noreorder                             \n"
140
141         "beqz            %[filter_len], 3f          \n"
142         " sll            $t0, %[filter_offset], 2   \n"
143         "addu            %[rtf], %[src_data], $t0   \n"
144         "mtlo            $0, $ac1                   \n"
145         "mtlo            $0, $ac2                   \n"
146         "mtlo            $0, $ac3                   \n"
147         "srl             $t7, %[filter_len], 2      \n"
148         "beqz            $t7, 2f                    \n"
149         " li             %[fx], 0                   \n"
150
151         "11:                                        \n"
152         "addu            $t4, %[filter_val], %[fx]  \n"
153         "sll             $t5, %[fx], 1              \n"
154         "ulw             $t6, 0($t4)                \n" // t6 = |cur[1]|cur[0]|
155         "ulw             $t8, 4($t4)                \n" // t8 = |cur[3]|cur[2]|
156         "addu            $t0, %[rtf], $t5           \n"
157         "lw              $t1, 0($t0)                \n" // t1 = |a0|b0|g0|r0|
158         "lw              $t2, 4($t0)                \n" // t2 = |a1|b1|g1|r1|
159         "lw              $t3, 8($t0)                \n" // t3 = |a2|b2|g2|r2|
160         "lw              $t4, 12($t0)               \n" // t4 = |a3|b3|g3|r3|
161         "precrq.qb.ph    $t0, $t2, $t1              \n" // t0 = |a1|g1|a0|g0|
162         "precr.qb.ph     $t5, $t2, $t1              \n" // t5 = |b1|r1|b0|r0|
163         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g1|0|g0|
164         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b1|0|b0|
165         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r1|0|r0|
166         "dpa.w.ph        $ac1, $t0, $t6             \n" // ac1+(cur*b1)+(cur*b0)
167         "dpa.w.ph        $ac2, $t2, $t6             \n" // ac2+(cur*g1)+(cur*g0)
168         "dpa.w.ph        $ac3, $t5, $t6             \n" // ac3+(cur*r1)+(cur*r0)
169         "precrq.qb.ph    $t0, $t4, $t3              \n" // t0 = |a3|g3|a2|g2|
170         "precr.qb.ph     $t5, $t4, $t3              \n" // t5 = |b3|r3|b2|r2|
171         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g3|0|g2|
172         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b3|0|b2|
173         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r3|0|r2|
174         "dpa.w.ph        $ac1, $t0, $t8             \n" // ac1+(cur*b3)+(cur*b2)
175         "dpa.w.ph        $ac2, $t2, $t8             \n" // ac2+(cur*g3)+(cur*g2)
176         "dpa.w.ph        $ac3, $t5, $t8             \n" // ac3+(cur*r3)+(cur*r2)
177         "addiu           $t7, $t7, -1               \n"
178         "bgtz            $t7, 11b                   \n"
179         " addiu          %[fx], %[fx], 8            \n"
180
181         "2:                                         \n"
182         "andi            $t7, %[filter_len], 0x3    \n" // residual
183         "beqz            $t7, 3f                    \n"
184         " nop                                       \n"
185
186         "21:                                        \n"
187         "sll             $t1, %[fx], 1              \n"
188         "addu            $t2, %[filter_val], %[fx]  \n"
189         "addu            $t0, %[rtf], $t1           \n"
190         "lh              $t6, 0($t2)                \n" // t6 = filter_val[fx]
191         "lbu             $t1, 0($t0)                \n" // t1 = row[fx * 4 + 0]
192         "lbu             $t2, 1($t0)                \n" // t2 = row[fx * 4 + 1]
193         "lbu             $t3, 2($t0)                \n" // t3 = row[fx * 4 + 2]
194         "maddu           $ac3, $t6, $t1             \n"
195         "maddu           $ac2, $t6, $t2             \n"
196         "maddu           $ac1, $t6, $t3             \n"
197         "addiu           $t7, $t7, -1               \n"
198         "bgtz            $t7, 21b                   \n"
199         " addiu          %[fx], %[fx], 2            \n"
200
201         "3:                                         \n"
202         "extrv.w         $t1, $ac1, %[kShiftBits]   \n" // b >> kShiftBits
203         "extrv.w         $t2, $ac2, %[kShiftBits]   \n" // g >> kShiftBits
204         "extrv.w         $t3, $ac3, %[kShiftBits]   \n" // r >> kShiftBits
205         "repl.ph         $t6, 128                   \n" // t6 = | 128 | 128 |
206         "sll             $t8, %[out_x], 2           \n"
207         "addu            $t8, %[out_row], $t8       \n"
208         "append          $t2, $t3, 16               \n"
209         "andi            $t1, 0xFFFF                \n"
210         "subu.ph         $t5, $t1, $t6              \n"
211         "shll_s.ph       $t5, $t5, 8                \n"
212         "shra.ph         $t5, $t5, 8                \n"
213         "addu.ph         $t5, $t5, $t6              \n"
214         "subu.ph         $t4, $t2, $t6              \n"
215         "shll_s.ph       $t4, $t4, 8                \n"
216         "shra.ph         $t4, $t4, 8                \n"
217         "addu.ph         $t4, $t4, $t6              \n"
218         "precr.qb.ph     $t0, $t5, $t4              \n"
219         "usw             $t0, 0($t8)                \n"
220
221         ".set pop                                   \n"
222       : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
223         [rtf] "+r" (row_to_filter)
224       : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
225         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
226         [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
227       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
228         "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
229       );
230     }
231   }
232 #endif
233 }
234 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,
235                                    int filter_length,
236                                    unsigned char* const* source_data_rows,
237                                    int pixel_width,
238                                    unsigned char* out_row,
239                                    bool has_alpha) {
240 #if SIMD_MIPS_DSPR2
241   // We go through each column in the output and do a vertical convolution,
242   // generating one output pixel each time.
243   int byte_offset;
244   int cnt;
245   int filter_y;
246   if (has_alpha) {
247     for (int out_x = 0; out_x < pixel_width; out_x++) {
248       __asm__ __volatile__ (
249         ".set push                                   \n"
250         ".set noreorder                              \n"
251
252         "beqz            %[filter_len], 3f           \n"
253         " sll            %[offset], %[out_x], 2      \n"
254         "mtlo            $0, $ac0                    \n"
255         "mtlo            $0, $ac1                    \n"
256         "mtlo            $0, $ac2                    \n"
257         "mtlo            $0, $ac3                    \n"
258         "srl             %[cnt], %[filter_len], 2    \n"
259         "beqz            %[cnt], 2f                  \n"
260         " li             %[fy], 0                    \n"
261
262         "11:                                         \n"
263         "sll             $t1, %[fy], 1               \n"
264         "addu            $t0, %[src_data_rows], $t1  \n"
265         "lw              $t1, 0($t0)                 \n"
266         "lw              $t2, 4($t0)                 \n"
267         "lw              $t3, 8($t0)                 \n"
268         "lw              $t4, 12($t0)                \n"
269         "addu            $t1, $t1, %[offset]         \n"
270         "addu            $t2, $t2, %[offset]         \n"
271         "addu            $t3, $t3, %[offset]         \n"
272         "addu            $t4, $t4, %[offset]         \n"
273         "lw              $t1, 0($t1)                 \n" // t1 = |a0|b0|g0|r0|
274         "lw              $t2, 0($t2)                 \n" // t2 = |a1|b1|g1|r1|
275         "lw              $t3, 0($t3)                 \n" // t3 = |a0|b0|g0|r0|
276         "lw              $t4, 0($t4)                 \n" // t4 = |a1|b1|g1|r1|
277         "precrq.qb.ph    $t5, $t2, $t1               \n" // t5 = |a1|g1|a0|g0|
278         "precr.qb.ph     $t6, $t2, $t1               \n" // t6 = |b1|r1|b0|r0|
279         "preceu.ph.qbla  $t0, $t5                    \n" // t0 = |0|a1|0|a0|
280         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g1|0|g0|
281         "preceu.ph.qbla  $t2, $t6                    \n" // t2 = |0|b1|0|b0|
282         "preceu.ph.qbra  $t5, $t6                    \n" // t5 = |0|r1|0|r0|
283         "addu            $t6, %[filter_val], %[fy]   \n"
284         "ulw             $t7, 0($t6)                 \n" // t7 = |cur_1|cur_0|
285         "ulw             $t6, 4($t6)                 \n" // t6 = |cur_3|cur_2|
286         "dpa.w.ph        $ac0, $t5, $t7              \n" // (cur*r1)+(cur*r0)
287         "dpa.w.ph        $ac1, $t1, $t7              \n" // (cur*g1)+(cur*g0)
288         "dpa.w.ph        $ac2, $t2, $t7              \n" // (cur*b1)+(cur*b0)
289         "dpa.w.ph        $ac3, $t0, $t7              \n" // (cur*a1)+(cur*a0)
290         "precrq.qb.ph    $t5, $t4, $t3               \n" // t5 = |a3|g3|a2|g2|
291         "precr.qb.ph     $t7, $t4, $t3               \n" // t7 = |b3|r3|b2|r2|
292         "preceu.ph.qbla  $t0, $t5                    \n" // t0 = |0|a3|0|a2|
293         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g3|0|g2|
294         "preceu.ph.qbla  $t2, $t7                    \n" // t2 = |0|b3|0|b2|
295         "preceu.ph.qbra  $t5, $t7                    \n" // t5 = |0|r3|0|r2|
296         "dpa.w.ph        $ac0, $t5, $t6              \n" // (cur*r3)+(cur*r2)
297         "dpa.w.ph        $ac1, $t1, $t6              \n" // (cur*g3)+(cur*g2)
298         "dpa.w.ph        $ac2, $t2, $t6              \n" // (cur*b3)+(cur*b2)
299         "dpa.w.ph        $ac3, $t0, $t6              \n" // (cur*a3)+(cur*a2)
300         "addiu           %[cnt], %[cnt], -1          \n"
301         "bgtz            %[cnt], 11b                 \n"
302         " addiu          %[fy], %[fy], 8             \n"
303
304         "2:                                          \n"
305         "andi            %[cnt], %[filter_len], 0x3  \n" // residual
306         "beqz            %[cnt], 3f                  \n"
307         " nop                                        \n"
308
309         "21:                                         \n"
310         "addu            $t0, %[filter_val], %[fy]   \n"
311         "lh              $t4, 0($t0)                 \n" // t4=filter_val[fx]
312         "sll             $t1, %[fy], 1               \n"
313         "addu            $t0, %[src_data_rows], $t1  \n"
314         "lw              $t1, 0($t0)                 \n"
315         "addu            $t0, $t1, %[offset]         \n"
316         "lbu             $t1, 0($t0)                 \n" // t1 = row[fx*4 + 0]
317         "lbu             $t2, 1($t0)                 \n" // t2 = row[fx*4 + 1]
318         "lbu             $t3, 2($t0)                 \n" // t3 = row[fx*4 + 2]
319         "lbu             $t0, 3($t0)                 \n" // t4 = row[fx*4 + 2]
320         "maddu           $ac0, $t4, $t1              \n"
321         "maddu           $ac1, $t4, $t2              \n"
322         "maddu           $ac2, $t4, $t3              \n"
323         "maddu           $ac3, $t4, $t0              \n"
324         "addiu           %[cnt], %[cnt], -1          \n"
325         "bgtz            %[cnt], 21b                 \n"
326         " addiu          %[fy], %[fy], 2             \n"
327
328         "3:                                          \n"
329         "extrv.w         $t3, $ac0, %[kShiftBits]    \n" // a >> kShiftBits
330         "extrv.w         $t2, $ac1, %[kShiftBits]    \n" // b >> kShiftBits
331         "extrv.w         $t1, $ac2, %[kShiftBits]    \n" // g >> kShiftBits
332         "extrv.w         $t0, $ac3, %[kShiftBits]    \n" // r >> kShiftBits
333         "repl.ph         $t4, 128                    \n" // t4 = | 128 | 128 |
334         "addu            $t5, %[out_row], %[offset]  \n"
335         "append          $t2, $t3, 16                \n" // t2 = |0|g|0|r|
336         "append          $t0, $t1, 16                \n" // t0 = |0|a|0|b|
337         "subu.ph         $t1, $t0, $t4               \n"
338         "shll_s.ph       $t1, $t1, 8                 \n"
339         "shra.ph         $t1, $t1, 8                 \n"
340         "addu.ph         $t1, $t1, $t4               \n" // Clamp(a)|Clamp(b)
341         "subu.ph         $t2, $t2, $t4               \n"
342         "shll_s.ph       $t2, $t2, 8                 \n"
343         "shra.ph         $t2, $t2, 8                 \n"
344         "addu.ph         $t2, $t2, $t4               \n" // Clamp(g)|Clamp(r)
345         "andi            $t3, $t1, 0xFF              \n" // t3 = ClampTo8(b)
346         "cmp.lt.ph       $t3, $t2                    \n" // cmp b, g, r
347         "pick.ph         $t0, $t2, $t3               \n"
348         "andi            $t3, $t0, 0xFF              \n"
349         "srl             $t4, $t0, 16                \n"
350         "cmp.lt.ph       $t3, $t4                    \n"
351         "pick.ph         $t0, $t4, $t3               \n" // t0 = max_color_ch
352         "srl             $t3, $t1, 16                \n" // t1 = ClampTo8(a)
353         "cmp.lt.ph       $t3, $t0                    \n"
354         "pick.ph         $t0, $t0, $t3               \n"
355         "ins             $t1, $t0, 16, 8             \n"
356         "precr.qb.ph     $t0, $t1, $t2               \n" // t0 = |a|b|g|r|
357         "usw             $t0, 0($t5)                 \n"
358
359         ".set pop                                    \n"
360       : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
361         [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
362         [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
363       : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
364         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
365       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
366         "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory"
367       );
368     }
369   } else {
370     for (int out_x = 0; out_x < pixel_width; out_x++) {
371       __asm__ __volatile__ (
372         ".set push                                   \n"
373         ".set noreorder                              \n"
374
375         "beqz            %[filter_len], 3f           \n"
376         " sll            %[offset], %[out_x], 2      \n"
377         "mtlo            $0, $ac0                    \n"
378         "mtlo            $0, $ac1                    \n"
379         "mtlo            $0, $ac2                    \n"
380         "srl             %[cnt], %[filter_len], 2    \n"
381         "beqz            %[cnt], 2f                  \n"
382         " li             %[fy], 0                    \n"
383
384         "11:                                         \n"
385         "sll             $t1, %[fy], 1               \n"
386         "addu            $t0, %[src_data_rows], $t1  \n"
387         "lw              $t1, 0($t0)                 \n"
388         "lw              $t2, 4($t0)                 \n"
389         "lw              $t3, 8($t0)                 \n"
390         "lw              $t4, 12($t0)                \n"
391         "addu            $t1, $t1, %[offset]         \n"
392         "addu            $t2, $t2, %[offset]         \n"
393         "addu            $t3, $t3, %[offset]         \n"
394         "addu            $t4, $t4, %[offset]         \n"
395         "lw              $t1, 0($t1)                 \n" // t1 = |a0|b0|g0|r0|
396         "lw              $t2, 0($t2)                 \n" // t2 = |a1|b1|g1|r1|
397         "lw              $t3, 0($t3)                 \n" // t3 = |a0|b0|g0|r0|
398         "lw              $t4, 0($t4)                 \n" // t4 = |a1|b1|g1|r1|
399         "precrq.qb.ph    $t5, $t2, $t1               \n" // t5 = |a1|g1|a0|g0|
400         "precr.qb.ph     $t6, $t2, $t1               \n" // t6 = |b1|r1|b0|r0|
401         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g1|0|g0|
402         "preceu.ph.qbla  $t2, $t6                    \n" // t2 = |0|b1|0|b0|
403         "preceu.ph.qbra  $t5, $t6                    \n" // t5 = |0|r1|0|r0|
404         "addu            $t6, %[filter_val], %[fy]   \n"
405         "ulw             $t0, 0($t6)                 \n" // t0 = |cur_1|cur_0|
406         "ulw             $t6, 4($t6)                 \n" // t6 = |cur_1|cur_0|
407         "dpa.w.ph        $ac0, $t5, $t0              \n" // (cur*r1)+(cur*r0)
408         "dpa.w.ph        $ac1, $t1, $t0              \n" // (cur*g1)+(cur*g0)
409         "dpa.w.ph        $ac2, $t2, $t0              \n" // (cur*b1)+(cur*b0)
410         "precrq.qb.ph    $t5, $t4, $t3               \n" // t5 = |a3|g3|a2|g2|
411         "precr.qb.ph     $t0, $t4, $t3               \n" // t0 = |b3|r3|b2|r2|
412         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g3|0|g2|
413         "preceu.ph.qbla  $t2, $t0                    \n" // t2 = |0|b3|0|b2|
414         "preceu.ph.qbra  $t5, $t0                    \n" // t5 = |0|r3|0|r2|
415         "dpa.w.ph        $ac0, $t5, $t6              \n" // (cur*r1)+(cur*r0)
416         "dpa.w.ph        $ac1, $t1, $t6              \n" // (cur*g1)+(cur*g0)
417         "dpa.w.ph        $ac2, $t2, $t6              \n" // (cur*b1)+(cur*b0)
418         "addiu           %[cnt], %[cnt], -1          \n"
419         "bgtz            %[cnt], 11b                 \n"
420         " addiu          %[fy], %[fy], 8             \n"
421
422         "2:                                          \n"
423         "andi            %[cnt], %[filter_len], 0x3  \n" // residual
424         "beqz            %[cnt], 3f                  \n"
425         " nop                                        \n"
426
427         "21:                                         \n"
428         "addu            $t0, %[filter_val], %[fy]   \n"
429         "lh              $t4, 0($t0)                 \n" // filter_val[fx]
430         "sll             $t1, %[fy], 1               \n"
431         "addu            $t0, %[src_data_rows], $t1  \n"
432         "lw              $t1, 0($t0)                 \n"
433         "addu            $t0, $t1, %[offset]         \n"
434         "lbu             $t1, 0($t0)                 \n" // t1 = row[fx*4 + 0]
435         "lbu             $t2, 1($t0)                 \n" // t2 = row[fx*4 + 1]
436         "lbu             $t3, 2($t0)                 \n" // t3 = row[fx*4 + 2]
437         "maddu           $ac0, $t4, $t1              \n"
438         "maddu           $ac1, $t4, $t2              \n"
439         "maddu           $ac2, $t4, $t3              \n"
440         "addiu           %[cnt], %[cnt], -1          \n"
441         "bgtz            %[cnt], 21b                 \n"
442         " addiu          %[fy], %[fy], 2             \n"
443
444         "3:                                          \n"
445         "extrv.w         $t3, $ac0, %[kShiftBits]    \n" // r >> kShiftBits
446         "extrv.w         $t2, $ac1, %[kShiftBits]    \n" // g >> kShiftBits
447         "extrv.w         $t1, $ac2, %[kShiftBits]    \n" // b >> kShiftBits
448         "repl.ph         $t6, 128                    \n" // t6 = | 128 | 128 |
449         "addu            $t5, %[out_row], %[offset]  \n"
450         "append          $t2, $t3, 16                \n" // t2 = |0|g|0|r|
451         "andi            $t1, $t1, 0xFFFF            \n"
452         "subu.ph         $t1, $t1, $t6               \n"
453         "shll_s.ph       $t1, $t1, 8                 \n"
454         "shra.ph         $t1, $t1, 8                 \n"
455         "addu.ph         $t1, $t1, $t6               \n" // Clamp(a)|Clamp(b)
456         "subu.ph         $t2, $t2, $t6               \n"
457         "shll_s.ph       $t2, $t2, 8                 \n"
458         "shra.ph         $t2, $t2, 8                 \n"
459         "addu.ph         $t2, $t2, $t6               \n" // Clamp(g)|Clamp(r)
460         "li              $t0, 0xFF                   \n"
461         "ins             $t1, $t0, 16, 8             \n"
462         "precr.qb.ph     $t0, $t1, $t2               \n" // t0 = |a|b|g|r|
463         "usw             $t0, 0($t5)                 \n"
464
465         ".set pop                                    \n"
466       : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
467         [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
468         [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
469       : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
470         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
471       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
472         "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"
473       );
474     }
475   }
476 #endif
477 }
478 } // namespace skia