/*
 * LIBOIL - Library of Optimized Inner Loops
 * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <liboil/liboil.h>
/*
 * zigzag_order in zigzag8x8_c.c is defined in terms of where the
 * elements are moving from. We rewrite the matrix in terms of
 * where the elements are moving _to_ (which is the same as each
 * element's position in the output scan):
 *
 *   0,  1,  5,  6, 14, 15, 27, 28,
 *   2,  4,  7, 13, 16, 26, 29, 42,
 *   3,  8, 12, 17, 25, 30, 41, 43,
 *   9, 11, 18, 24, 31, 40, 44, 53,
 *  10, 19, 23, 32, 39, 45, 52, 54,
 *  20, 22, 33, 38, 46, 51, 55, 60,
 *  21, 34, 37, 47, 50, 56, 59, 61,
 *  35, 36, 48, 49, 57, 58, 62, 63,
 *
 * If we shift each successive row to the right by one place, we
 * get:
 *
 *   0,  1,  5,  6, 14, 15, 27, 28,
 *  42,  2,  4,  7, 13, 16, 26, 29,
 *  41, 43,  3,  8, 12, 17, 25, 30,
 *  40, 44, 53,  9, 11, 18, 24, 31,
 *  39, 45, 52, 54, 10, 19, 23, 32,
 *  38, 46, 51, 55, 60, 20, 22, 33,
 *  47, 50, 56, 59, 61, 21, 34, 37,
 *  36, 48, 49, 57, 58, 62, 63, 35,
 *
 * Transposing this matrix,
 *
 *   0, 42, 41, 40, 39, 38, 47, 36,
 *   1,  2, 43, 44, 45, 46, 50, 48,
 *   5,  4,  3, 53, 52, 51, 56, 49,
 *   6,  7,  8,  9, 54, 55, 59, 57,
 *  14, 13, 12, 11, 10, 60, 61, 58,
 *  15, 16, 17, 18, 19, 20, 21, 62,
 *  27, 26, 25, 24, 23, 22, 34, 63,
 *  28, 29, 30, 31, 32, 33, 37, 35,
 *
 * we see that groups of numbers tend to be collected on the same
 * line. In fact, aside from a few exceptions, rows of our final
 * matrix can be created simply by merging two rows of the above
 * matrix. For example, the first line can (almost) be created
 * by merging lines 2 (called 'a') and 3 ('b') above:
 *
 *  xx, a0, a1, b2, b1, b0, xx, xx,
 *
 * (where 'xx' denotes elements that cannot be created.)
 */
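/*
 * The derivation above can be checked in plain C. The block below is an
 * illustrative sketch only (compiled out, not part of liboil); the function
 * name check_zigzag_derivation, the shift[] table and the printing are
 * assumptions made for the example. It rotates each row of the
 * destination-index matrix by the amount used in the second matrix above,
 * transposes the result, and prints the third matrix.
 */
#if 0
#include <stdio.h>

static void
check_zigzag_derivation (void)
{
  /* destination index of each source element (first matrix above) */
  static const int dest_idx[8][8] = {
    {  0,  1,  5,  6, 14, 15, 27, 28 },
    {  2,  4,  7, 13, 16, 26, 29, 42 },
    {  3,  8, 12, 17, 25, 30, 41, 43 },
    {  9, 11, 18, 24, 31, 40, 44, 53 },
    { 10, 19, 23, 32, 39, 45, 52, 54 },
    { 20, 22, 33, 38, 46, 51, 55, 60 },
    { 21, 34, 37, 47, 50, 56, 59, 61 },
    { 35, 36, 48, 49, 57, 58, 62, 63 }
  };
  /* right-rotation applied to each row, as read off the second matrix */
  static const int shift[8] = { 0, 1, 2, 3, 4, 5, 5, 7 };
  int rot[8][8], out[8][8];
  int i, j;

  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      rot[i][(j + shift[i]) & 7] = dest_idx[i][j];

  /* transposing the rotated matrix gives the third matrix above */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      out[i][j] = rot[j][i];

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      printf ("%3d,", out[i][j]);
    printf ("\n");
  }
}
#endif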
/* 00 indicates that the element can't be handled by vperm, and needs
 * to be fixed up later. */
static const uint8_t mangle[128] __attribute__ ((__aligned__ (16))) = {
  00,00, 2, 3, 0, 1,16,17,18,19,20,21,00,00,00,00, /* 1, 2 */
   2, 3, 0, 1,16,17,18,19,20,21,22,23,24,25,00,00, /* 3, 4 */
   8, 9, 6, 7, 4, 5, 2, 3, 0, 1,16,17,18,19,20,21, /* 5, 6 */
   6, 7, 8, 9,10,11,12,13,30,31,28,29,26,27,24,25, /* 6, 7 */
   6, 7, 4, 5, 2, 3, 0, 1,18,19,20,21,22,23,24,25, /* 7, 0 */
  10,11,12,13,14,15,30,31,28,29,26,27,24,25,22,23, /* 0, 1 */
  00,00, 6, 7, 8, 9,10,11,12,13,14,15,30,31,28,29, /* 2, 3 */
  00,00,00,00,10,11,12,13,14,15,30,31,28,29,00,00, /* 4, 5 */
};
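/*
 * vperm builds each byte of its destination by indexing into the 32-byte
 * concatenation of its two source vectors, so each 16-byte row of mangle[]
 * selects eight 16-bit halfwords from a pair of input rows. The sketch
 * below is a scalar model of that selection for one row; it is illustrative
 * only and compiled out, and emulate_vperm_row is a made-up helper, not an
 * AltiVec intrinsic or a liboil function. Control bytes of 00 simply select
 * byte 0 of the first source, which is why those positions have to be
 * rewritten by the scalar fixups at the end of the real function.
 */
#if 0
#include <stdint.h>

static void
emulate_vperm_row (const uint8_t *ctrl, const uint8_t *va, const uint8_t *vb,
    uint8_t *out)
{
  int i;

  for (i = 0; i < 16; i++) {
    int idx = ctrl[i] & 31;

    /* indices 0-15 pick from the first source, 16-31 from the second */
    out[i] = (idx < 16) ? va[idx] : vb[idx - 16];
  }
}
#endif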
static void
zigzag8x8_s16_a16_altivec(int16_t *dest, int dstr, int16_t *src, int sstr)
{
  __asm__ __volatile__(
    : "b" (0), "b" (8), "b" (sstr)
122 /* "slide" vectors to right */
123 __asm__ __volatile__(
  __asm__ __volatile__(
    "\tvmrghh 8, 0, 4\n"
    "\tvmrglh 9, 0, 4\n"
    "\tvmrghh 10, 1, 5\n"
    "\tvmrglh 11, 1, 5\n"
    "\tvmrghh 12, 2, 6\n"
    "\tvmrglh 13, 2, 6\n"
    "\tvmrghh 14, 3, 7\n"
    "\tvmrglh 15, 3, 7\n"

    "\tvmrghh 16, 8, 12\n"
    "\tvmrglh 17, 8, 12\n"
    "\tvmrghh 18, 9, 13\n"
    "\tvmrglh 19, 9, 13\n"
    "\tvmrghh 20, 10, 14\n"
    "\tvmrglh 21, 10, 14\n"
    "\tvmrghh 22, 11, 15\n"
    "\tvmrglh 23, 11, 15\n"

    "\tvmrghh 0, 16, 20\n"
    "\tvmrglh 1, 16, 20\n"
    "\tvmrghh 2, 17, 21\n"
    "\tvmrglh 3, 17, 21\n"
    "\tvmrghh 4, 18, 22\n"
    "\tvmrglh 5, 18, 22\n"
    "\tvmrghh 6, 19, 23\n"
    "\tvmrglh 7, 19, 23\n"
  );
  sl_altivec_load8_8(mangle,16);

  __asm__ __volatile__(
    "\tvperm 16,1,2,8\n"
    "\tvperm 17,3,4,9\n"
    "\tvperm 18,5,6,10\n"
    "\tvperm 19,6,7,11\n"
    "\tvperm 20,7,0,12\n"
    "\tvperm 21,0,1,13\n"
    "\tvperm 22,2,3,14\n"
    "\tvperm 23,4,5,15\n"
  );
  sl_altivec_store8_16(dest,16);

  /* fix up the elements that were missed */
  block8x8_s16(dest,dstr,0,0) = block8x8_s16(src,sstr,0,0);
  block8x8_s16(dest,dstr,0,6) = block8x8_s16(src,sstr,3,0);
  block8x8_s16(dest,dstr,0,7) = block8x8_s16(src,sstr,2,1);
  block8x8_s16(dest,dstr,1,7) = block8x8_s16(src,sstr,5,0);

  block8x8_s16(dest,dstr,6,0) = block8x8_s16(src,sstr,2,7);
  block8x8_s16(dest,dstr,7,0) = block8x8_s16(src,sstr,5,6);
  block8x8_s16(dest,dstr,7,1) = block8x8_s16(src,sstr,4,7);
  block8x8_s16(dest,dstr,7,7) = block8x8_s16(src,sstr,7,7);
}

OIL_DEFINE_IMPL_FULL (zigzag8x8_s16_a16_altivec, zigzag8x8_s16, OIL_FLAG_ALTIVEC);
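/*
 * The scalar fixups in the function above index the strided 8x8 blocks
 * with block8x8_s16(), which is defined elsewhere in liboil. A typical
 * definition of this kind of strided accessor looks roughly like the
 * compiled-out sketch below; block8x8_s16_sketch is a made-up name for
 * illustration, not the library's actual macro.
 */
#if 0
#define block8x8_s16_sketch(ptr, stride, row, column) \
  (*((int16_t *)((uint8_t *)(ptr) + (stride) * (row)) + (column)))
#endif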