vp8/encoder/mips/msa/temporal_filter_msa.c

   1 /*
   2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "./vp8_rtcd.h"
  12 #include "vp8/common/mips/msa/vp8_macros_msa.h"
  13
  14 static void temporal_filter_apply_16size_msa(
  15     uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
  16     int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
  17   uint32_t row;
  18   v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b;
  19   v16u8 frame_l, frame_h;
  20   v16i8 zero = { 0 };
  21   v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
  22   v8i16 diff0, diff1, cnt0, cnt1;
  23   v4i32 const3, const16, filter_wt, strength;
  24   v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
  25   v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
  26   v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
  27   v4i32 acc0, acc1, acc2, acc3;
  28
  29   filter_wt = __msa_fill_w(filter_wt_in);
  30   strength = __msa_fill_w(strength_in);
  31   const3 = __msa_ldi_w(3);
  32   const16 = __msa_ldi_w(16);
  33
  34   for (row = 8; row--;) {
  35     frame1_0_b = LD_SB(frame1_ptr);
  36     frame2_0_b = LD_SB(frame2_ptr);
  37     frame1_ptr += stride;
  38     frame2_ptr += 16;
  39     frame1_1_b = LD_SB(frame1_ptr);
  40     frame2_1_b = LD_SB(frame2_ptr);
  41     LD_SW2(acc, 4, acc0, acc1);
  42     LD_SW2(acc + 8, 4, acc2, acc3);
  43     LD_SH2(cnt, 8, cnt0, cnt1);
  44     ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h);
  45     HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
  46     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
  47     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
  48     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
  49          mod0_w, mod1_w, mod2_w, mod3_w);
  50     MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
  51          mod1_w, mod2_w, mod3_w);
  52     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
  53     diff0_r = (mod0_w < const16);
  54     diff0_l = (mod1_w < const16);
  55     diff1_r = (mod2_w < const16);
  56     diff1_l = (mod3_w < const16);
  57     SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
  58          mod0_w, mod1_w, mod2_w, mod3_w);
  59     mod0_w = diff0_r & mod0_w;
  60     mod1_w = diff0_l & mod1_w;
  61     mod2_w = diff1_r & mod2_w;
  62     mod3_w = diff1_l & mod3_w;
  63     MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
  64          filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
  65     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h)
  66     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
  67     ST_SH2(mod0_h, mod1_h, cnt, 8);
  68     cnt += 16;
  69     ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h);
  70     UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
  71     UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
  72     MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
  73          mod0_w, mod1_w, mod2_w, mod3_w);
  74     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
  75          mod2_w, mod3_w);
  76     ST_SW2(mod0_w, mod1_w, acc, 4);
  77     ST_SW2(mod2_w, mod3_w, acc + 8, 4);
  78     acc += 16;
  79     LD_SW2(acc, 4, acc0, acc1);
  80     LD_SW2(acc + 8, 4, acc2, acc3);
  81     LD_SH2(cnt, 8, cnt0, cnt1);
  82     ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h);
  83     HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
  84     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
  85     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
  86     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
  87          mod0_w, mod1_w, mod2_w, mod3_w);
  88     MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
  89          mod1_w, mod2_w, mod3_w);
  90     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
  91     diff0_r = (mod0_w < const16);
  92     diff0_l = (mod1_w < const16);
  93     diff1_r = (mod2_w < const16);
  94     diff1_l = (mod3_w < const16);
  95     SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
  96          mod0_w, mod1_w, mod2_w, mod3_w);
  97     mod0_w = diff0_r & mod0_w;
  98     mod1_w = diff0_l & mod1_w;
  99     mod2_w = diff1_r & mod2_w;
 100     mod3_w = diff1_l & mod3_w;
 101     MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
 102          filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
 103     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
 104     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
 105     ST_SH2(mod0_h, mod1_h, cnt, 8);
 106     cnt += 16;
 107
 108     UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h);
 109     UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
 110     UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
 111     MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
 112          mod0_w, mod1_w, mod2_w, mod3_w);
 113     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
 114          mod2_w, mod3_w);
 115     ST_SW2(mod0_w, mod1_w, acc, 4);
 116     ST_SW2(mod2_w, mod3_w, acc + 8, 4);
 117     acc += 16;
 118     frame1_ptr += stride;
 119     frame2_ptr += 16;
 120   }
 121 }
 122
 123 static void temporal_filter_apply_8size_msa(
 124     uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr,
 125     int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) {
 126   uint32_t row;
 127   uint64_t f0, f1, f2, f3, f4, f5, f6, f7;
 128   v16i8 frame1 = { 0 };
 129   v16i8 frame2 = { 0 };
 130   v16i8 frame3 = { 0 };
 131   v16i8 frame4 = { 0 };
 132   v16u8 frame_l, frame_h;
 133   v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h;
 134   v8i16 diff0, diff1, cnt0, cnt1;
 135   v4i32 const3, const16;
 136   v4i32 filter_wt, strength;
 137   v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
 138   v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
 139   v4i32 frame2_0, frame2_1, frame2_2, frame2_3;
 140   v4i32 acc0, acc1, acc2, acc3;
 141
 142   filter_wt = __msa_fill_w(filter_wt_in);
 143   strength = __msa_fill_w(strength_in);
 144   const3 = __msa_ldi_w(3);
 145   const16 = __msa_ldi_w(16);
 146
 147   for (row = 2; row--;) {
 148     LD2(frame1_ptr, stride, f0, f1);
 149     frame1_ptr += (2 * stride);
 150     LD2(frame2_ptr, 8, f2, f3);
 151     frame2_ptr += 16;
 152     LD2(frame1_ptr, stride, f4, f5);
 153     frame1_ptr += (2 * stride);
 154     LD2(frame2_ptr, 8, f6, f7);
 155     frame2_ptr += 16;
 156
 157     LD_SW2(acc, 4, acc0, acc1);
 158     LD_SW2(acc + 8, 4, acc2, acc3);
 159     LD_SH2(cnt, 8, cnt0, cnt1);
 160     INSERT_D2_SB(f0, f1, frame1);
 161     INSERT_D2_SB(f2, f3, frame2);
 162     INSERT_D2_SB(f4, f5, frame3);
 163     INSERT_D2_SB(f6, f7, frame4);
 164     ILVRL_B2_UB(frame1, frame2, frame_l, frame_h);
 165     HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
 166     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
 167     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
 168     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
 169          mod0_w, mod1_w, mod2_w, mod3_w);
 170     MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
 171          mod1_w, mod2_w, mod3_w);
 172     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
 173     diff0_r = (mod0_w < const16);
 174     diff0_l = (mod1_w < const16);
 175     diff1_r = (mod2_w < const16);
 176     diff1_l = (mod3_w < const16);
 177     SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
 178          mod0_w, mod1_w, mod2_w, mod3_w);
 179     mod0_w = diff0_r & mod0_w;
 180     mod1_w = diff0_l & mod1_w;
 181     mod2_w = diff1_r & mod2_w;
 182     mod3_w = diff1_l & mod3_w;
 183     MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
 184          filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
 185     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
 186     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
 187     ST_SH2(mod0_h, mod1_h, cnt, 8);
 188     cnt += 16;
 189
 190     UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h);
 191     UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
 192     UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
 193     MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
 194          mod0_w, mod1_w, mod2_w, mod3_w);
 195     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
 196          mod2_w, mod3_w);
 197     ST_SW2(mod0_w, mod1_w, acc, 4);
 198     ST_SW2(mod2_w, mod3_w, acc + 8, 4);
 199     acc += 16;
 200
 201     LD_SW2(acc, 4, acc0, acc1);
 202     LD_SW2(acc + 8, 4, acc2, acc3);
 203     LD_SH2(cnt, 8, cnt0, cnt1);
 204     ILVRL_B2_UB(frame3, frame4, frame_l, frame_h);
 205     HSUB_UB2_SH(frame_l, frame_h, diff0, diff1);
 206     UNPCK_SH_SW(diff0, diff0_r, diff0_l);
 207     UNPCK_SH_SW(diff1, diff1_r, diff1_l);
 208     MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
 209          mod0_w, mod1_w, mod2_w, mod3_w);
 210     MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w,
 211          mod1_w, mod2_w, mod3_w);
 212     SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
 213     diff0_r = (mod0_w < const16);
 214     diff0_l = (mod1_w < const16);
 215     diff1_r = (mod2_w < const16);
 216     diff1_l = (mod3_w < const16);
 217     SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w,
 218          mod0_w, mod1_w, mod2_w, mod3_w);
 219     mod0_w = diff0_r & mod0_w;
 220     mod1_w = diff0_l & mod1_w;
 221     mod2_w = diff1_r & mod2_w;
 222     mod3_w = diff1_l & mod3_w;
 223     MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w,
 224          filter_wt, mod0_w, mod1_w, mod2_w, mod3_w);
 225     PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
 226     ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
 227     ST_SH2(mod0_h, mod1_h, cnt, 8);
 228     cnt += 16;
 229
 230     UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h);
 231     UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1);
 232     UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3);
 233     MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3,
 234          mod0_w, mod1_w, mod2_w, mod3_w);
 235     ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
 236          mod2_w, mod3_w);
 237     ST_SW2(mod0_w, mod1_w, acc, 4);
 238     ST_SW2(mod2_w, mod3_w, acc + 8, 4);
 239     acc += 16;
 240   }
 241 }
 242
 243 void vp8_temporal_filter_apply_msa(uint8_t *frame1, uint32_t stride,
 244                                    uint8_t *frame2, uint32_t block_size,
 245                                    int32_t strength, int32_t filter_weight,
 246                                    uint32_t *accumulator, uint16_t *count) {
 247   if (8 == block_size) {
 248     temporal_filter_apply_8size_msa(frame1, stride, frame2, strength,
 249                                     filter_weight, accumulator, count);
 250   } else if (16 == block_size) {
 251     temporal_filter_apply_16size_msa(frame1, stride, frame2, strength,
 252                                      filter_weight, accumulator, count);
 253   } else {
 254     uint32_t i, j, k;
 255     int32_t modifier;
 256     int32_t byte = 0;
 257     const int32_t rounding = strength > 0 ? 1 << (strength - 1) : 0;
 258
 259     for (i = 0, k = 0; i < block_size; ++i) {
 260       for (j = 0; j < block_size; ++j, ++k) {
 261         int src_byte = frame1[byte];
 262         int pixel_value = *frame2++;
 263
 264         modifier = src_byte - pixel_value;
 265         modifier *= modifier;
 266         modifier *= 3;
 267         modifier += rounding;
 268         modifier >>= strength;
 269
 270         if (modifier > 16) modifier = 16;
 271
 272         modifier = 16 - modifier;
 273         modifier *= filter_weight;
 274
 275         count[k] += modifier;
 276         accumulator[k] += modifier * pixel_value;
 277
 278         byte++;
 279       }
 280
 281       byte += stride - block_size;
 282     }
 283   }
 284 }