Upstream version 9.38.198.0
[platform/framework/web/crosswalk.git] / src / third_party / libyuv / source / rotate_neon.cc
1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "libyuv/row.h"
12
13 #include "libyuv/basic_types.h"
14
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19
20 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
21
22 static uvec8 kVTbl4x4Transpose =
23   { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
24
25 void TransposeWx8_NEON(const uint8* src, int src_stride,
26                        uint8* dst, int dst_stride,
27                        int width) {
28   const uint8* src_temp = NULL;
29   asm volatile (
30     // loops are on blocks of 8. loop will stop when
31     // counter gets to or below 0. starting the counter
32     // at w-8 allow for this
33     "sub         %5, #8                        \n"
34
35     // handle 8x8 blocks. this should be the majority of the plane
36     ".p2align  2                               \n"
37     "1:                                        \n"
38       "mov         %0, %1                      \n"
39
40       MEMACCESS(0)
41       "vld1.8      {d0}, [%0], %2              \n"
42       MEMACCESS(0)
43       "vld1.8      {d1}, [%0], %2              \n"
44       MEMACCESS(0)
45       "vld1.8      {d2}, [%0], %2              \n"
46       MEMACCESS(0)
47       "vld1.8      {d3}, [%0], %2              \n"
48       MEMACCESS(0)
49       "vld1.8      {d4}, [%0], %2              \n"
50       MEMACCESS(0)
51       "vld1.8      {d5}, [%0], %2              \n"
52       MEMACCESS(0)
53       "vld1.8      {d6}, [%0], %2              \n"
54       MEMACCESS(0)
55       "vld1.8      {d7}, [%0]                  \n"
56
57       "vtrn.8      d1, d0                      \n"
58       "vtrn.8      d3, d2                      \n"
59       "vtrn.8      d5, d4                      \n"
60       "vtrn.8      d7, d6                      \n"
61
62       "vtrn.16     d1, d3                      \n"
63       "vtrn.16     d0, d2                      \n"
64       "vtrn.16     d5, d7                      \n"
65       "vtrn.16     d4, d6                      \n"
66
67       "vtrn.32     d1, d5                      \n"
68       "vtrn.32     d0, d4                      \n"
69       "vtrn.32     d3, d7                      \n"
70       "vtrn.32     d2, d6                      \n"
71
72       "vrev16.8    q0, q0                      \n"
73       "vrev16.8    q1, q1                      \n"
74       "vrev16.8    q2, q2                      \n"
75       "vrev16.8    q3, q3                      \n"
76
77       "mov         %0, %3                      \n"
78
79     MEMACCESS(0)
80       "vst1.8      {d1}, [%0], %4              \n"
81     MEMACCESS(0)
82       "vst1.8      {d0}, [%0], %4              \n"
83     MEMACCESS(0)
84       "vst1.8      {d3}, [%0], %4              \n"
85     MEMACCESS(0)
86       "vst1.8      {d2}, [%0], %4              \n"
87     MEMACCESS(0)
88       "vst1.8      {d5}, [%0], %4              \n"
89     MEMACCESS(0)
90       "vst1.8      {d4}, [%0], %4              \n"
91     MEMACCESS(0)
92       "vst1.8      {d7}, [%0], %4              \n"
93     MEMACCESS(0)
94       "vst1.8      {d6}, [%0]                  \n"
95
96       "add         %1, #8                      \n"  // src += 8
97       "add         %3, %3, %4, lsl #3          \n"  // dst += 8 * dst_stride
98       "subs        %5,  #8                     \n"  // w   -= 8
99       "bge         1b                          \n"
100
101     // add 8 back to counter. if the result is 0 there are
102     // no residuals.
103     "adds        %5, #8                        \n"
104     "beq         4f                            \n"
105
106     // some residual, so between 1 and 7 lines left to transpose
107     "cmp         %5, #2                        \n"
108     "blt         3f                            \n"
109
110     "cmp         %5, #4                        \n"
111     "blt         2f                            \n"
112
113     // 4x8 block
114     "mov         %0, %1                        \n"
115     MEMACCESS(0)
116     "vld1.32     {d0[0]}, [%0], %2             \n"
117     MEMACCESS(0)
118     "vld1.32     {d0[1]}, [%0], %2             \n"
119     MEMACCESS(0)
120     "vld1.32     {d1[0]}, [%0], %2             \n"
121     MEMACCESS(0)
122     "vld1.32     {d1[1]}, [%0], %2             \n"
123     MEMACCESS(0)
124     "vld1.32     {d2[0]}, [%0], %2             \n"
125     MEMACCESS(0)
126     "vld1.32     {d2[1]}, [%0], %2             \n"
127     MEMACCESS(0)
128     "vld1.32     {d3[0]}, [%0], %2             \n"
129     MEMACCESS(0)
130     "vld1.32     {d3[1]}, [%0]                 \n"
131
132     "mov         %0, %3                        \n"
133
134     MEMACCESS(6)
135     "vld1.8      {q3}, [%6]                    \n"
136
137     "vtbl.8      d4, {d0, d1}, d6              \n"
138     "vtbl.8      d5, {d0, d1}, d7              \n"
139     "vtbl.8      d0, {d2, d3}, d6              \n"
140     "vtbl.8      d1, {d2, d3}, d7              \n"
141
142     // TODO(frkoenig): Rework shuffle above to
143     // write out with 4 instead of 8 writes.
144     MEMACCESS(0)
145     "vst1.32     {d4[0]}, [%0], %4             \n"
146     MEMACCESS(0)
147     "vst1.32     {d4[1]}, [%0], %4             \n"
148     MEMACCESS(0)
149     "vst1.32     {d5[0]}, [%0], %4             \n"
150     MEMACCESS(0)
151     "vst1.32     {d5[1]}, [%0]                 \n"
152
153     "add         %0, %3, #4                    \n"
154     MEMACCESS(0)
155     "vst1.32     {d0[0]}, [%0], %4             \n"
156     MEMACCESS(0)
157     "vst1.32     {d0[1]}, [%0], %4             \n"
158     MEMACCESS(0)
159     "vst1.32     {d1[0]}, [%0], %4             \n"
160     MEMACCESS(0)
161     "vst1.32     {d1[1]}, [%0]                 \n"
162
163     "add         %1, #4                        \n"  // src += 4
164     "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
165     "subs        %5,  #4                       \n"  // w   -= 4
166     "beq         4f                            \n"
167
168     // some residual, check to see if it includes a 2x8 block,
169     // or less
170     "cmp         %5, #2                        \n"
171     "blt         3f                            \n"
172
173     // 2x8 block
174     "2:                                        \n"
175     "mov         %0, %1                        \n"
176     MEMACCESS(0)
177     "vld1.16     {d0[0]}, [%0], %2             \n"
178     MEMACCESS(0)
179     "vld1.16     {d1[0]}, [%0], %2             \n"
180     MEMACCESS(0)
181     "vld1.16     {d0[1]}, [%0], %2             \n"
182     MEMACCESS(0)
183     "vld1.16     {d1[1]}, [%0], %2             \n"
184     MEMACCESS(0)
185     "vld1.16     {d0[2]}, [%0], %2             \n"
186     MEMACCESS(0)
187     "vld1.16     {d1[2]}, [%0], %2             \n"
188     MEMACCESS(0)
189     "vld1.16     {d0[3]}, [%0], %2             \n"
190     MEMACCESS(0)
191     "vld1.16     {d1[3]}, [%0]                 \n"
192
193     "vtrn.8      d0, d1                        \n"
194
195     "mov         %0, %3                        \n"
196
197     MEMACCESS(0)
198     "vst1.64     {d0}, [%0], %4                \n"
199     MEMACCESS(0)
200     "vst1.64     {d1}, [%0]                    \n"
201
202     "add         %1, #2                        \n"  // src += 2
203     "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
204     "subs        %5,  #2                       \n"  // w   -= 2
205     "beq         4f                            \n"
206
207     // 1x8 block
208     "3:                                        \n"
209     MEMACCESS(1)
210     "vld1.8      {d0[0]}, [%1], %2             \n"
211     MEMACCESS(1)
212     "vld1.8      {d0[1]}, [%1], %2             \n"
213     MEMACCESS(1)
214     "vld1.8      {d0[2]}, [%1], %2             \n"
215     MEMACCESS(1)
216     "vld1.8      {d0[3]}, [%1], %2             \n"
217     MEMACCESS(1)
218     "vld1.8      {d0[4]}, [%1], %2             \n"
219     MEMACCESS(1)
220     "vld1.8      {d0[5]}, [%1], %2             \n"
221     MEMACCESS(1)
222     "vld1.8      {d0[6]}, [%1], %2             \n"
223     MEMACCESS(1)
224     "vld1.8      {d0[7]}, [%1]                 \n"
225
226     MEMACCESS(3)
227     "vst1.64     {d0}, [%3]                    \n"
228
229     "4:                                        \n"
230
231     : "+r"(src_temp),          // %0
232       "+r"(src),               // %1
233       "+r"(src_stride),        // %2
234       "+r"(dst),               // %3
235       "+r"(dst_stride),        // %4
236       "+r"(width)              // %5
237     : "r"(&kVTbl4x4Transpose)  // %6
238     : "memory", "cc", "q0", "q1", "q2", "q3"
239   );
240 }
241
242 static uvec8 kVTbl4x4TransposeDi =
243   { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
244
245 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
246                          uint8* dst_a, int dst_stride_a,
247                          uint8* dst_b, int dst_stride_b,
248                          int width) {
249   const uint8* src_temp = NULL;
250   asm volatile (
251     // loops are on blocks of 8. loop will stop when
252     // counter gets to or below 0. starting the counter
253     // at w-8 allow for this
254     "sub         %7, #8                        \n"
255
256     // handle 8x8 blocks. this should be the majority of the plane
257     ".p2align  2                               \n"
258     "1:                                        \n"
259       "mov         %0, %1                      \n"
260
261       MEMACCESS(0)
262       "vld2.8      {d0,  d1},  [%0], %2        \n"
263       MEMACCESS(0)
264       "vld2.8      {d2,  d3},  [%0], %2        \n"
265       MEMACCESS(0)
266       "vld2.8      {d4,  d5},  [%0], %2        \n"
267       MEMACCESS(0)
268       "vld2.8      {d6,  d7},  [%0], %2        \n"
269       MEMACCESS(0)
270       "vld2.8      {d16, d17}, [%0], %2        \n"
271       MEMACCESS(0)
272       "vld2.8      {d18, d19}, [%0], %2        \n"
273       MEMACCESS(0)
274       "vld2.8      {d20, d21}, [%0], %2        \n"
275       MEMACCESS(0)
276       "vld2.8      {d22, d23}, [%0]            \n"
277
278       "vtrn.8      q1, q0                      \n"
279       "vtrn.8      q3, q2                      \n"
280       "vtrn.8      q9, q8                      \n"
281       "vtrn.8      q11, q10                    \n"
282
283       "vtrn.16     q1, q3                      \n"
284       "vtrn.16     q0, q2                      \n"
285       "vtrn.16     q9, q11                     \n"
286       "vtrn.16     q8, q10                     \n"
287
288       "vtrn.32     q1, q9                      \n"
289       "vtrn.32     q0, q8                      \n"
290       "vtrn.32     q3, q11                     \n"
291       "vtrn.32     q2, q10                     \n"
292
293       "vrev16.8    q0, q0                      \n"
294       "vrev16.8    q1, q1                      \n"
295       "vrev16.8    q2, q2                      \n"
296       "vrev16.8    q3, q3                      \n"
297       "vrev16.8    q8, q8                      \n"
298       "vrev16.8    q9, q9                      \n"
299       "vrev16.8    q10, q10                    \n"
300       "vrev16.8    q11, q11                    \n"
301
302       "mov         %0, %3                      \n"
303
304     MEMACCESS(0)
305       "vst1.8      {d2},  [%0], %4             \n"
306     MEMACCESS(0)
307       "vst1.8      {d0},  [%0], %4             \n"
308     MEMACCESS(0)
309       "vst1.8      {d6},  [%0], %4             \n"
310     MEMACCESS(0)
311       "vst1.8      {d4},  [%0], %4             \n"
312     MEMACCESS(0)
313       "vst1.8      {d18}, [%0], %4             \n"
314     MEMACCESS(0)
315       "vst1.8      {d16}, [%0], %4             \n"
316     MEMACCESS(0)
317       "vst1.8      {d22}, [%0], %4             \n"
318     MEMACCESS(0)
319       "vst1.8      {d20}, [%0]                 \n"
320
321       "mov         %0, %5                      \n"
322
323     MEMACCESS(0)
324       "vst1.8      {d3},  [%0], %6             \n"
325     MEMACCESS(0)
326       "vst1.8      {d1},  [%0], %6             \n"
327     MEMACCESS(0)
328       "vst1.8      {d7},  [%0], %6             \n"
329     MEMACCESS(0)
330       "vst1.8      {d5},  [%0], %6             \n"
331     MEMACCESS(0)
332       "vst1.8      {d19}, [%0], %6             \n"
333     MEMACCESS(0)
334       "vst1.8      {d17}, [%0], %6             \n"
335     MEMACCESS(0)
336       "vst1.8      {d23}, [%0], %6             \n"
337     MEMACCESS(0)
338       "vst1.8      {d21}, [%0]                 \n"
339
340       "add         %1, #8*2                    \n"  // src   += 8*2
341       "add         %3, %3, %4, lsl #3          \n"  // dst_a += 8 * dst_stride_a
342       "add         %5, %5, %6, lsl #3          \n"  // dst_b += 8 * dst_stride_b
343       "subs        %7,  #8                     \n"  // w     -= 8
344       "bge         1b                          \n"
345
346     // add 8 back to counter. if the result is 0 there are
347     // no residuals.
348     "adds        %7, #8                        \n"
349     "beq         4f                            \n"
350
351     // some residual, so between 1 and 7 lines left to transpose
352     "cmp         %7, #2                        \n"
353     "blt         3f                            \n"
354
355     "cmp         %7, #4                        \n"
356     "blt         2f                            \n"
357
358     // TODO(frkoenig): Clean this up
359     // 4x8 block
360     "mov         %0, %1                        \n"
361     MEMACCESS(0)
362     "vld1.64     {d0}, [%0], %2                \n"
363     MEMACCESS(0)
364     "vld1.64     {d1}, [%0], %2                \n"
365     MEMACCESS(0)
366     "vld1.64     {d2}, [%0], %2                \n"
367     MEMACCESS(0)
368     "vld1.64     {d3}, [%0], %2                \n"
369     MEMACCESS(0)
370     "vld1.64     {d4}, [%0], %2                \n"
371     MEMACCESS(0)
372     "vld1.64     {d5}, [%0], %2                \n"
373     MEMACCESS(0)
374     "vld1.64     {d6}, [%0], %2                \n"
375     MEMACCESS(0)
376     "vld1.64     {d7}, [%0]                    \n"
377
378     MEMACCESS(8)
379     "vld1.8      {q15}, [%8]                   \n"
380
381     "vtrn.8      q0, q1                        \n"
382     "vtrn.8      q2, q3                        \n"
383
384     "vtbl.8      d16, {d0, d1}, d30            \n"
385     "vtbl.8      d17, {d0, d1}, d31            \n"
386     "vtbl.8      d18, {d2, d3}, d30            \n"
387     "vtbl.8      d19, {d2, d3}, d31            \n"
388     "vtbl.8      d20, {d4, d5}, d30            \n"
389     "vtbl.8      d21, {d4, d5}, d31            \n"
390     "vtbl.8      d22, {d6, d7}, d30            \n"
391     "vtbl.8      d23, {d6, d7}, d31            \n"
392
393     "mov         %0, %3                        \n"
394
395     MEMACCESS(0)
396     "vst1.32     {d16[0]},  [%0], %4           \n"
397     MEMACCESS(0)
398     "vst1.32     {d16[1]},  [%0], %4           \n"
399     MEMACCESS(0)
400     "vst1.32     {d17[0]},  [%0], %4           \n"
401     MEMACCESS(0)
402     "vst1.32     {d17[1]},  [%0], %4           \n"
403
404     "add         %0, %3, #4                    \n"
405     MEMACCESS(0)
406     "vst1.32     {d20[0]}, [%0], %4            \n"
407     MEMACCESS(0)
408     "vst1.32     {d20[1]}, [%0], %4            \n"
409     MEMACCESS(0)
410     "vst1.32     {d21[0]}, [%0], %4            \n"
411     MEMACCESS(0)
412     "vst1.32     {d21[1]}, [%0]                \n"
413
414     "mov         %0, %5                        \n"
415
416     MEMACCESS(0)
417     "vst1.32     {d18[0]}, [%0], %6            \n"
418     MEMACCESS(0)
419     "vst1.32     {d18[1]}, [%0], %6            \n"
420     MEMACCESS(0)
421     "vst1.32     {d19[0]}, [%0], %6            \n"
422     MEMACCESS(0)
423     "vst1.32     {d19[1]}, [%0], %6            \n"
424
425     "add         %0, %5, #4                    \n"
426     MEMACCESS(0)
427     "vst1.32     {d22[0]},  [%0], %6           \n"
428     MEMACCESS(0)
429     "vst1.32     {d22[1]},  [%0], %6           \n"
430     MEMACCESS(0)
431     "vst1.32     {d23[0]},  [%0], %6           \n"
432     MEMACCESS(0)
433     "vst1.32     {d23[1]},  [%0]               \n"
434
435     "add         %1, #4*2                      \n"  // src   += 4 * 2
436     "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
437     "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
438     "subs        %7,  #4                       \n"  // w     -= 4
439     "beq         4f                            \n"
440
441     // some residual, check to see if it includes a 2x8 block,
442     // or less
443     "cmp         %7, #2                        \n"
444     "blt         3f                            \n"
445
446     // 2x8 block
447     "2:                                        \n"
448     "mov         %0, %1                        \n"
449     MEMACCESS(0)
450     "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
451     MEMACCESS(0)
452     "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
453     MEMACCESS(0)
454     "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
455     MEMACCESS(0)
456     "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
457     MEMACCESS(0)
458     "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
459     MEMACCESS(0)
460     "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
461     MEMACCESS(0)
462     "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
463     MEMACCESS(0)
464     "vld2.16     {d1[3], d3[3]}, [%0]          \n"
465
466     "vtrn.8      d0, d1                        \n"
467     "vtrn.8      d2, d3                        \n"
468
469     "mov         %0, %3                        \n"
470
471     MEMACCESS(0)
472     "vst1.64     {d0}, [%0], %4                \n"
473     MEMACCESS(0)
474     "vst1.64     {d2}, [%0]                    \n"
475
476     "mov         %0, %5                        \n"
477
478     MEMACCESS(0)
479     "vst1.64     {d1}, [%0], %6                \n"
480     MEMACCESS(0)
481     "vst1.64     {d3}, [%0]                    \n"
482
483     "add         %1, #2*2                      \n"  // src   += 2 * 2
484     "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
485     "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
486     "subs        %7,  #2                       \n"  // w     -= 2
487     "beq         4f                            \n"
488
489     // 1x8 block
490     "3:                                        \n"
491     MEMACCESS(1)
492     "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
493     MEMACCESS(1)
494     "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
495     MEMACCESS(1)
496     "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
497     MEMACCESS(1)
498     "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
499     MEMACCESS(1)
500     "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
501     MEMACCESS(1)
502     "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
503     MEMACCESS(1)
504     "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
505     MEMACCESS(1)
506     "vld2.8      {d0[7], d1[7]}, [%1]          \n"
507
508     MEMACCESS(3)
509     "vst1.64     {d0}, [%3]                    \n"
510     MEMACCESS(5)
511     "vst1.64     {d1}, [%5]                    \n"
512
513     "4:                                        \n"
514
515     : "+r"(src_temp),            // %0
516       "+r"(src),                 // %1
517       "+r"(src_stride),          // %2
518       "+r"(dst_a),               // %3
519       "+r"(dst_stride_a),        // %4
520       "+r"(dst_b),               // %5
521       "+r"(dst_stride_b),        // %6
522       "+r"(width)                // %7
523     : "r"(&kVTbl4x4TransposeDi)  // %8
524     : "memory", "cc",
525       "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
526   );
527 }
528 #endif
529
530 #ifdef __cplusplus
531 }  // extern "C"
532 }  // namespace libyuv
533 #endif