/*
 * LIBOIL - Library of Optimized Inner Loops
 * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <liboil/liboil.h>
/*
 * zigzag_order in zigzag8x8_c.c is defined in terms of where the
 * elements are moving from. We rewrite the matrix in terms of
 * where the elements are moving _to_ (which is the same as each
 * element's position in the output scan):
 *
 *   0,  1,  5,  6, 14, 15, 27, 28,
 *   2,  4,  7, 13, 16, 26, 29, 42,
 *   3,  8, 12, 17, 25, 30, 41, 43,
 *   9, 11, 18, 24, 31, 40, 44, 53,
 *  10, 19, 23, 32, 39, 45, 52, 54,
 *  20, 22, 33, 38, 46, 51, 55, 60,
 *  21, 34, 37, 47, 50, 56, 59, 61,
 *  35, 36, 48, 49, 57, 58, 62, 63,
 *
 * If we shift each successive row to the right by one place, we
 * get:
 *
 *   0,  1,  5,  6, 14, 15, 27, 28,
 *  42,  2,  4,  7, 13, 16, 26, 29,
 *  41, 43,  3,  8, 12, 17, 25, 30,
 *  40, 44, 53,  9, 11, 18, 24, 31,
 *  39, 45, 52, 54, 10, 19, 23, 32,
 *  38, 46, 51, 55, 60, 20, 22, 33,
 *  47, 50, 56, 59, 61, 21, 34, 37,
 *  36, 48, 49, 57, 58, 62, 63, 35,
 *
 * Transposing this matrix,
 *
 *   0, 42, 41, 40, 39, 38, 47, 36,
 *   1,  2, 43, 44, 45, 46, 50, 48,
 *   5,  4,  3, 53, 52, 51, 56, 49,
 *   6,  7,  8,  9, 54, 55, 59, 57,
 *  14, 13, 12, 11, 10, 60, 61, 58,
 *  15, 16, 17, 18, 19, 20, 21, 62,
 *  27, 26, 25, 24, 23, 22, 34, 63,
 *  28, 29, 30, 31, 32, 33, 37, 35,
 *
 * we see that groups of numbers tend to be collected on the same
 * line. In fact, aside from a few exceptions, rows of our final
 * matrix can be created simply by merging two rows of the above
 * matrix. For example, the first line can (almost) be created
 * by merging lines 2 (called 'a') and 3 ('b') above:
 *
 *  xx, a0, a1, b2, b1, b0, xx, xx,
 *
 * (where 'xx' denotes elements that cannot be created.)
 */
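/*
 * The derivation above can be checked in plain C. The block below is an
 * illustrative sketch only (compiled out, not part of liboil); the function
 * name check_zigzag_derivation, the shift[] table and the printing are
 * assumptions made for the example. It rotates each row of the
 * destination-index matrix by the amount used in the second matrix above,
 * transposes the result, and prints the third matrix.
 */
#if 0
#include <stdio.h>

static void
check_zigzag_derivation (void)
{
  /* destination index of each source element (first matrix above) */
  static const int dest_idx[8][8] = {
    {  0,  1,  5,  6, 14, 15, 27, 28 },
    {  2,  4,  7, 13, 16, 26, 29, 42 },
    {  3,  8, 12, 17, 25, 30, 41, 43 },
    {  9, 11, 18, 24, 31, 40, 44, 53 },
    { 10, 19, 23, 32, 39, 45, 52, 54 },
    { 20, 22, 33, 38, 46, 51, 55, 60 },
    { 21, 34, 37, 47, 50, 56, 59, 61 },
    { 35, 36, 48, 49, 57, 58, 62, 63 }
  };
  /* right-rotation applied to each row, as read off the second matrix */
  static const int shift[8] = { 0, 1, 2, 3, 4, 5, 5, 7 };
  int rot[8][8], out[8][8];
  int i, j;

  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      rot[i][(j + shift[i]) & 7] = dest_idx[i][j];

  /* transposing the rotated matrix gives the third matrix above */
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      out[i][j] = rot[j][i];

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      printf ("%3d,", out[i][j]);
    printf ("\n");
  }
}
#endif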
/* 00 indicates that the element can't be handled by vperm, and needs
 * to be fixed up later. */
static const uint8_t mangle[128] __attribute__ ((__aligned__ (16))) = {
  00,00, 2, 3, 0, 1,16,17,18,19,20,21,00,00,00,00, /* 1, 2 */
   2, 3, 0, 1,16,17,18,19,20,21,22,23,24,25,00,00, /* 3, 4 */
   8, 9, 6, 7, 4, 5, 2, 3, 0, 1,16,17,18,19,20,21, /* 5, 6 */
   6, 7, 8, 9,10,11,12,13,30,31,28,29,26,27,24,25, /* 6, 7 */
   6, 7, 4, 5, 2, 3, 0, 1,18,19,20,21,22,23,24,25, /* 7, 0 */
  10,11,12,13,14,15,30,31,28,29,26,27,24,25,22,23, /* 0, 1 */
  00,00, 6, 7, 8, 9,10,11,12,13,14,15,30,31,28,29, /* 2, 3 */
  00,00,00,00,10,11,12,13,14,15,30,31,28,29,00,00, /* 4, 5 */
};
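/*
 * vperm builds each byte of its destination by indexing into the 32-byte
 * concatenation of its two source vectors, so each 16-byte row of mangle[]
 * selects eight 16-bit halfwords from a pair of input rows. The sketch
 * below is a scalar model of that selection for one row; it is illustrative
 * only and compiled out, and emulate_vperm_row is a made-up helper, not an
 * AltiVec intrinsic or a liboil function. Control bytes of 00 simply select
 * byte 0 of the first source, which is why those positions have to be
 * rewritten by the scalar fixups at the end of the real function.
 */
#if 0
#include <stdint.h>

static void
emulate_vperm_row (const uint8_t *ctrl, const uint8_t *va, const uint8_t *vb,
    uint8_t *out)
{
  int i;

  for (i = 0; i < 16; i++) {
    int idx = ctrl[i] & 31;

    /* indices 0-15 pick from the first source, 16-31 from the second */
    out[i] = (idx < 16) ? va[idx] : vb[idx - 16];
  }
}
#endif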
static void
zigzag8x8_s16_a16_altivec(int16_t *dest, int dstr, int16_t *src, int sstr)
{
  __asm__ __volatile__(
    : "b" (0), "b" (8), "b" (sstr)
122 /* "slide" vectors to right */
123 __asm__ __volatile__(
  __asm__ __volatile__(
    "\tvmrghh 8, 0, 4\n"
    "\tvmrglh 9, 0, 4\n"
    "\tvmrghh 10, 1, 5\n"
    "\tvmrglh 11, 1, 5\n"
    "\tvmrghh 12, 2, 6\n"
    "\tvmrglh 13, 2, 6\n"
    "\tvmrghh 14, 3, 7\n"
    "\tvmrglh 15, 3, 7\n"

    "\tvmrghh 16, 8, 12\n"
    "\tvmrglh 17, 8, 12\n"
    "\tvmrghh 18, 9, 13\n"
    "\tvmrglh 19, 9, 13\n"
    "\tvmrghh 20, 10, 14\n"
    "\tvmrglh 21, 10, 14\n"
    "\tvmrghh 22, 11, 15\n"
    "\tvmrglh 23, 11, 15\n"

    "\tvmrghh 0, 16, 20\n"
    "\tvmrglh 1, 16, 20\n"
    "\tvmrghh 2, 17, 21\n"
    "\tvmrglh 3, 17, 21\n"
    "\tvmrghh 4, 18, 22\n"
    "\tvmrglh 5, 18, 22\n"
    "\tvmrghh 6, 19, 23\n"
    "\tvmrglh 7, 19, 23\n"
  );
  sl_altivec_load8_8(mangle,16);

  __asm__ __volatile__(
    "\tvperm 16,1,2,8\n"
    "\tvperm 17,3,4,9\n"
    "\tvperm 18,5,6,10\n"
    "\tvperm 19,6,7,11\n"
    "\tvperm 20,7,0,12\n"
    "\tvperm 21,0,1,13\n"
    "\tvperm 22,2,3,14\n"
    "\tvperm 23,4,5,15\n"
  );
  sl_altivec_store8_16(dest,16);

  /* fix up the elements that were missed */
  block8x8_s16(dest,dstr,0,0) = block8x8_s16(src,sstr,0,0);
  block8x8_s16(dest,dstr,0,6) = block8x8_s16(src,sstr,3,0);
  block8x8_s16(dest,dstr,0,7) = block8x8_s16(src,sstr,2,1);
  block8x8_s16(dest,dstr,1,7) = block8x8_s16(src,sstr,5,0);

  block8x8_s16(dest,dstr,6,0) = block8x8_s16(src,sstr,2,7);
  block8x8_s16(dest,dstr,7,0) = block8x8_s16(src,sstr,5,6);
  block8x8_s16(dest,dstr,7,1) = block8x8_s16(src,sstr,4,7);
  block8x8_s16(dest,dstr,7,7) = block8x8_s16(src,sstr,7,7);
}

OIL_DEFINE_IMPL_FULL (zigzag8x8_s16_a16_altivec, zigzag8x8_s16, OIL_FLAG_ALTIVEC);
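/*
 * The scalar fixups in the function above index the strided 8x8 blocks
 * with block8x8_s16(), which is defined elsewhere in liboil. A typical
 * definition of this kind of strided accessor looks roughly like the
 * compiled-out sketch below; block8x8_s16_sketch is a made-up name for
 * illustration, not the library's actual macro.
 */
#if 0
#define block8x8_s16_sketch(ptr, stride, row, column) \
  (*((int16_t *)((uint8_t *)(ptr) + (stride) * (row)) + (column)))
#endif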