/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *sse, int *sum);

void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *sse, int *sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *sse, int *sum);

unsigned int vp9_sub_pixel_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, int height, unsigned int *sse);

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse);

static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse0;
  int sum0, i, j;

  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      // processing 16 rows horizontally each call
      var_fn(src_ptr + source_stride * i + j, source_stride,
             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

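/*
 * Illustrative note, added for clarity (not part of the original libvpx
 * source): variance_avx2() walks the w x h block in 16-row strips of
 * `block_size` columns and lets the AVX2 kernels accumulate the sum of
 * squared differences (sse) and the sum of differences (sum). The callers
 * below then form
 *
 *   variance = sse - sum^2 / N,  N = w * h,
 *
 * with the division done as a right shift: >> 8 for 16x16 (N = 256),
 * >> 10 for 32x32 (N = 1024), >> 12 for 64x64 (N = 4096), and so on.
 * A minimal scalar sketch of what each var_fn call accumulates (the helper
 * name get_var_c is hypothetical, for illustration only):
 *
 *   static void get_var_c(const uint8_t *src, int src_stride,
 *                         const uint8_t *ref, int ref_stride,
 *                         int w, int h, unsigned int *sse, int *sum) {
 *     int i, j;
 *     *sse = 0;
 *     *sum = 0;
 *     for (i = 0; i < h; ++i) {
 *       for (j = 0; j < w; ++j) {
 *         const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
 *         *sum += diff;
 *         *sse += diff * diff;
 *       }
 *     }
 *   }
 */
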
unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride, unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
                &var, &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr, int source_stride,
                               const unsigned char *ref_ptr, int recon_stride,
                               unsigned int *sse) {
  unsigned int sse0;
  int sum0;

  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
                       &sum0);
  *sse = sse0;
  return sse0;
}

unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 10));
}

unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 9));
}

unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 12));
}

unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 11));
}

unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           64, &sse);
  // processing the next 32 elements in parallel
  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                            x_offset, y_offset,
                                            dst + 32, dst_stride,
                                            64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

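/*
 * Note, added for clarity (not in the original source): the 64-wide sub-pixel
 * case above filters the left and right 32-column halves separately and then
 * combines the partial results, i.e. with N = 64 * 64 = 4096
 *
 *   variance = (sse + sse2) - ((se + se2)^2 >> 12).
 *
 * The 32x32 variant below needs only a single 32xh pass and shifts by 10
 * (N = 1024).
 */
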
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}

unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride, int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse, sse2;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 64, 64, &sse);
  // processing the next 32 elements in parallel
  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                                y_offset, dst + 32, dst_stride,
                                                sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride, int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}