libavcodec/ppc/h264chroma_template.c

   1 /*
   2  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/mem_internal.h"
  22 #include "libavutil/ppc/util_altivec.h"
  23
  24 /* this code assumes that stride % 16 == 0 */
  25
  26 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
  27         vsrc2ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc2uc);\
  28         vsrc3ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc3uc);\
  29 \
  30         psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
  31         psum = vec_mladd(vB, vsrc1ssH, psum);\
  32         psum = vec_mladd(vC, vsrc2ssH, psum);\
  33         psum = vec_mladd(vD, vsrc3ssH, psum);\
  34         psum = BIAS2(psum);\
  35         psum = vec_sr(psum, v6us);\
  36 \
  37         vdst = vec_ld(0, dst);\
  38         ppsum = (vec_u8)vec_pack(psum, psum);\
  39         vfdst = vec_perm(vdst, ppsum, fperm);\
  40 \
  41         OP_U8_ALTIVEC(fsum, vfdst, vdst);\
  42 \
  43         vec_st(fsum, 0, dst);\
  44 \
  45         vsrc0ssH = vsrc2ssH;\
  46         vsrc1ssH = vsrc3ssH;\
  47 \
  48         dst += stride;\
  49         src += stride;
  50
  51 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
  52 \
  53         vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);\
  54         vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);\
  55 \
  56         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
  57         psum = vec_mladd(vE, vsrc1ssH, psum);\
  58         psum = vec_sr(psum, v6us);\
  59 \
  60         vdst = vec_ld(0, dst);\
  61         ppsum = (vec_u8)vec_pack(psum, psum);\
  62         vfdst = vec_perm(vdst, ppsum, fperm);\
  63 \
  64         OP_U8_ALTIVEC(fsum, vfdst, vdst);\
  65 \
  66         vec_st(fsum, 0, dst);\
  67 \
  68         dst += stride;\
  69         src += stride;
  70
  71 #define noop(a) a
  72 #define add28(a) vec_add(v28ss, a)
  73
  74 #if HAVE_BIGENDIAN
  75 #define GET_VSRC1(vs0, off, b, perm0, s){    \
  76     vec_u8 vsrcCuc, vsrcDuc;                 \
  77     vsrcCuc = vec_ld(off, s);                \
  78     if (loadSecond){                         \
  79         vsrcDuc = vec_ld(off + b, s);        \
  80     } else                                   \
  81         vsrcDuc = vsrcCuc;                   \
  82                                              \
  83     vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0); \
  84 }
  85 #define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
  86     vec_u8 vsrcCuc, vsrcDuc;                         \
  87     vsrcCuc = vec_ld(off, s);                        \
  88     if (loadSecond){                                 \
  89         vsrcDuc = vec_ld(off + b, s);                \
  90     } else                                           \
  91         vsrcDuc = vsrcCuc;                           \
  92                                                      \
  93     vs0 = vec_perm(vsrcCuc, vsrcDuc, perm0);         \
  94     if (reallyBadAlign){                             \
  95         vs1 = vsrcDuc;                               \
  96     } else                                           \
  97         vs1 = vec_perm(vsrcCuc, vsrcDuc, perm1);     \
  98  }
  99
 100 #else
 101
 102 #define GET_VSRC1(vs0, off, b, perm0, s){            \
 103     vs0 = vec_vsx_ld(off, s);                        \
 104  }
 105 #define GET_VSRC(vs0, vs1, off, b, perm0, perm1, s){ \
 106     vs0 = vec_vsx_ld(off, s);                        \
 107     vs1 = vec_vsx_ld(off + 1, s);                    \
 108  }
 109 #endif /* HAVE_BIGENDIAN */
 110
 111 #ifdef PREFIX_h264_chroma_mc8_altivec
 112 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, const uint8_t * src,
 113                                            ptrdiff_t stride, int h,
 114                                            int x, int y)
 115 {
 116     DECLARE_ALIGNED(16, signed int, ABCD)[4] =
 117                         {((8 - x) * (8 - y)),
 118                          ((    x) * (8 - y)),
 119                          ((8 - x) * (    y)),
 120                          ((    x) * (    y))};
 121     register int i;
 122     vec_u8 fperm;
 123     LOAD_ZERO;
 124     const vec_s32 vABCD = vec_ld(0, ABCD);
 125     const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
 126     const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
 127     const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
 128     const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
 129     const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
 130     const vec_u16 v6us = vec_splat_u16(6);
 131
 132     vec_u8 vsrcperm0, vsrcperm1;
 133     vec_u8 vsrc0uc, vsrc1uc;
 134     vec_s16 vsrc0ssH, vsrc1ssH;
 135     vec_u8 vsrc2uc, vsrc3uc;
 136     vec_s16 vsrc2ssH, vsrc3ssH, psum;
 137     vec_u8 vdst, ppsum, vfdst, fsum;
 138 #if HAVE_BIGENDIAN
 139     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
 140     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 141     vsrcperm0 = vec_lvsl(0, src);
 142     vsrcperm1 = vec_lvsl(1, src);
 143 #endif
 144
 145     if (((unsigned long)dst) % 16 == 0) {
 146         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
 147                          0x14, 0x15, 0x16, 0x17,
 148                          0x08, 0x09, 0x0A, 0x0B,
 149                          0x0C, 0x0D, 0x0E, 0x0F};
 150     } else {
 151         fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
 152                          0x04, 0x05, 0x06, 0x07,
 153                          0x18, 0x19, 0x1A, 0x1B,
 154                          0x1C, 0x1D, 0x1E, 0x1F};
 155     }
 156
 157     GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 158
 159     vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc0uc);
 160     vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v,(vec_u8)vsrc1uc);
 161
 162     if (ABCD[3]) {
 163         for (i = 0 ; i < h ; i++) {
 164             GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
 165             CHROMA_MC8_ALTIVEC_CORE(v32ss, noop);
 166         }
 167     } else {
 168         const vec_s16 vE = vec_add(vB, vC);
 169         if (ABCD[2]) { // x == 0 B == 0
 170             for (i = 0 ; i < h ; i++) {
 171                 GET_VSRC1(vsrc1uc, stride, 15, vsrcperm0, src);
 172                 CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
 173                 vsrc0uc = vsrc1uc;
 174             }
 175         } else { // y == 0 C == 0
 176             for (i = 0 ; i < h ; i++) {
 177                GET_VSRC(vsrc0uc, vsrc1uc, 0, 15, vsrcperm0, vsrcperm1, src);
 178                CHROMA_MC8_ALTIVEC_CORE_SIMPLE;
 179             }
 180         }
 181     }
 182 }
 183 #endif
 184
 185 /* this code assumes that stride % 16 == 0 */
 186 #ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
 187 static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t *dst, const uint8_t *src,
 188                                                  ptrdiff_t stride, int h,
 189                                                  int x, int y)
 190 {
 191    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
 192                         {((8 - x) * (8 - y)),
 193                          ((    x) * (8 - y)),
 194                          ((8 - x) * (    y)),
 195                          ((    x) * (    y))};
 196     register int i;
 197     vec_u8 fperm;
 198     LOAD_ZERO;
 199     const vec_s32 vABCD = vec_ld(0, ABCD);
 200     const vec_s16 vA = VEC_SPLAT16(vABCD, 1);
 201     const vec_s16 vB = VEC_SPLAT16(vABCD, 3);
 202     const vec_s16 vC = VEC_SPLAT16(vABCD, 5);
 203     const vec_s16 vD = VEC_SPLAT16(vABCD, 7);
 204     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
 205     const vec_u16 v6us  = vec_splat_u16(6);
 206
 207     vec_u8 vsrcperm0, vsrcperm1;
 208     vec_u8 vsrc0uc, vsrc1uc;
 209     vec_s16 vsrc0ssH, vsrc1ssH;
 210     vec_u8 vsrc2uc, vsrc3uc;
 211     vec_s16 vsrc2ssH, vsrc3ssH, psum;
 212     vec_u8 vdst, ppsum, vfdst, fsum;
 213 #if HAVE_BIGENDIAN
 214     register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
 215     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 216     vsrcperm0 = vec_lvsl(0, src);
 217     vsrcperm1 = vec_lvsl(1, src);
 218 #endif
 219
 220     if (((unsigned long)dst) % 16 == 0) {
 221         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
 222                          0x14, 0x15, 0x16, 0x17,
 223                          0x08, 0x09, 0x0A, 0x0B,
 224                          0x0C, 0x0D, 0x0E, 0x0F};
 225     } else {
 226         fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
 227                          0x04, 0x05, 0x06, 0x07,
 228                          0x18, 0x19, 0x1A, 0x1B,
 229                          0x1C, 0x1D, 0x1E, 0x1F};
 230     }
 231
 232     GET_VSRC(vsrc0uc, vsrc1uc, 0, 16, vsrcperm0, vsrcperm1, src);
 233
 234     vsrc0ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc0uc);
 235     vsrc1ssH = (vec_s16)VEC_MERGEH(zero_u8v, (vec_u8)vsrc1uc);
 236
 237     for (i = 0 ; i < h ; i++) {
 238         GET_VSRC(vsrc2uc, vsrc3uc, stride, 16, vsrcperm0, vsrcperm1, src);
 239         CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28);
 240     }
 241 }
 242 #endif
 243
 244 #undef noop
 245 #undef add28
 246 #undef CHROMA_MC8_ALTIVEC_CORE