v16i8 src0, src1, src2, src3;
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
v8i16 avg = { 0 };
v4i32 vec, var = { 0 };
filtval = LH(filter);
filt0 = (v16u8)__msa_fill_h(filtval);
- const255 = (v8u16)__msa_ldi_h(255);
-
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
src0, src1, src2, src3);
ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
v16u8 filt0, out, ref0, ref1, ref2, ref3;
v16i8 src0, src1, src2, src3;
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3, const255;
+ v8u16 vec0, vec1, vec2, vec3;
v8i16 avg = { 0 };
v4i32 vec, var = { 0 };
filtval = LH(filter);
filt0 = (v16u8)__msa_fill_h(filtval);
- const255 = (v8u16)__msa_ldi_h(255);
-
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
src0, src1, src2, src3);
out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
v16u8 dst0, dst1, dst2, dst3, filt0;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
v8i16 avg = { 0 };
v4i32 vec, var = { 0 };
filtval = LH(filter);
filt0 = (v16u8)__msa_fill_h(filtval);
- const255 = (v8u16)__msa_ldi_h(255);
-
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
out4, out5, out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
src0, src1, src2, src3);
CALC_MSE_AVG_B(src0, dst0, var, avg);
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
CALC_MSE_AVG_B(out, ref, var, avg);
src0 = src4;
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
CALC_MSE_AVG_B(src0, ref0, var, avg);
CALC_MSE_AVG_B(src1, ref1, var, avg);
ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
src0 = src4;
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
CALC_MSE_AVG_B(out, ref, var, avg);
src0 = src4;
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp1 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
tmp2 = __msa_dotp_u_h(vec0, filt_vt);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp3 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
CALC_MSE_AVG_B(out0, ref0, var, avg);
CALC_MSE_AVG_B(out1, ref1, var, avg);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
CALC_MSE_AVG_B(src0, ref0, var, avg);
v16i8 src0, src1, src2, src3;
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
v8i16 avg = { 0 };
v4i32 vec, var = { 0 };
filtval = LH(filter);
filt0 = (v16u8)__msa_fill_h(filtval);
- const255 = (v8u16)__msa_ldi_h(255);
-
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
src0, src1, src2, src3);
ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
v16i8 src0, src1, src2, src3;
v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
v8u16 vec0, vec1, vec2, vec3;
- v8u16 const255;
v8i16 avg = { 0 };
v4i32 vec, var = { 0 };
filtval = LH(filter);
filt0 = (v16u8)__msa_fill_h(filtval);
- const255 = (v8u16)__msa_ldi_h(255);
-
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
src0, src1, src2, src3);
out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
v16u8 pred0, pred1, pred2, pred3, filt0;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8u16 const255;
v8i16 avg = { 0 };
v4i32 vec, var = { 0 };
filtval = LH(filter);
filt0 = (v16u8)__msa_fill_h(filtval);
- const255 = (v8u16)__msa_ldi_h(255);
-
for (loop_cnt = (height >> 2); loop_cnt--;) {
LD_SB4(src, src_stride, src0, src2, src4, src6);
LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
out4, out5, out6, out7);
SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- MIN_UH4_UH(out0, out1, out2, out3, const255);
- MIN_UH4_UH(out4, out5, out6, out7, const255);
PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
tmp0, tmp1, tmp2, tmp3);
AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
out = __msa_aver_u_b(out, pred);
DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
CALC_MSE_AVG_B(src0, ref0, var, avg);
ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
src0 = src4;
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
out = __msa_aver_u_b(out, pred);
CALC_MSE_AVG_B(out, ref, var, avg);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp1 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
tmp3 = __msa_dotp_u_h(vec0, filt_vt);
SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- SAT_UH2_UH(tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- SAT_UH2_UH(tmp0, tmp1, 7);
out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);