filt = __lsx_vsadd_b(filt, q0_sub_p0); \
filt = __lsx_vand_v(filt, mask); \
t1 = __lsx_vsadd_b(filt, cnst4b); \
- t1 = __lsx_vsra_b(filt, cnst3b); \
+ t1 = __lsx_vsra_b(t1, cnst3b); \
t2 = __lsx_vsadd_b(filt, cnst3b); \
- t2 = __lsx_vsra_b(filt, cnst3b); \
+ t2 = __lsx_vsra_b(t2, cnst3b); \
q0_m = __lsx_vssub_b(q0_m, t1); \
q0 = __lsx_vxori_b(q0_m, 0x80); \
p0_m = __lsx_vsadd_b(p0_m, t2); \
const uint8_t *b_limit1_ptr,
const uint8_t *limit1_ptr,
const uint8_t *thresh1_ptr) {
- uint8_t *temp_src;
int32_t pitch_x2 = pitch << 1;
int32_t pitch_x3 = pitch_x2 + pitch;
int32_t pitch_x4 = pitch << 2;
__m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
__m128i p3, p2, p1, p0, q3, q2, q1, q0;
- temp_src = src - pitch_x4;
- DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
- temp_src, pitch_x3, p3, p2, p1, p0);
- temp_src += pitch_x4;
- DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
- temp_src, pitch_x3, q0, q1, q2, q3);
+ DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src,
+ -pitch, p3, p2, p1, p0);
+ q0 = __lsx_vld(src, 0);
+ DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2);
+ q3 = __lsx_vldx(src, pitch_x3);
thresh0 = __lsx_vreplgr2vr_b(*thresh0_ptr);
thresh1 = __lsx_vreplgr2vr_b(*thresh1_ptr);
mask, flat);
VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
- __lsx_vstelm_d(p1, src_u, 0, 0);
- __lsx_vstelm_d(p0, src_u + pitch, 0, 0);
- __lsx_vstelm_d(q0, src_u + pitch_x2, 0, 0);
- __lsx_vstelm_d(q1, src_u + pitch_x3, 0, 0);
+ __lsx_vstelm_d(q1, src_u + pitch, 0, 0);
+ __lsx_vstelm_d(q0, src_u, 0, 0);
+ __lsx_vstelm_d(p0, src_u - pitch, 0, 0);
+ __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0);
- __lsx_vstelm_d(p1, src_v, 0, 1);
- __lsx_vstelm_d(p0, src_v + pitch, 0, 1);
- __lsx_vstelm_d(q0, src_v + pitch_x2, 0, 1);
- __lsx_vstelm_d(q1, src_v + pitch_x3, 0, 1);
+ __lsx_vstelm_d(q1, src_v + pitch, 0, 1);
+ __lsx_vstelm_d(q0, src_v, 0, 1);
+ __lsx_vstelm_d(p0, src_v - pitch, 0, 1);
+ __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1);
}
static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
tmp2 = __lsx_vilvl_h(tmp1, tmp0);
tmp3 = __lsx_vilvh_h(tmp1, tmp0);
- tmp0 = __lsx_vilvl_b(p0, q1);
- tmp1 = __lsx_vilvl_b(q1, q0);
+ tmp0 = __lsx_vilvh_b(p0, p1);
+ tmp1 = __lsx_vilvh_b(q1, q0);
tmp4 = __lsx_vilvl_h(tmp1, tmp0);
tmp5 = __lsx_vilvh_h(tmp1, tmp0);
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+/* Byte-index shuffle masks for the 2-tap horizontal filter: each entry pairs
+ * pixel i with pixel i+1, so the gathered operands feed HORIZ_2TAP_FILT_UH
+ * (the table is loaded into `mask` via __lsx_vld below).
+ * NOTE(review): indices >= 16 presumably select from the second source
+ * operand of the shuffle — confirm against the LSX vshuf semantics. */
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
src0 = __lsx_vpackev_b(src1, src0);
out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+ out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
out0 = __lsx_vxori_b(out0, 128);
out0 = __lsx_vavgr_bu(out0, src2);
__lsx_vstelm_w(out0, dst, 0, 0);
src2 = __lsx_vpackev_b(src10, src9);
src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
- DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
+ DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
+ FILTER_BITS, out0, out1);
DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
src5 = __lsx_vldrepl_d(dst_tmp, 0);
dst_tmp += dst_stride;
common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
filter_horiz, filter_vert, height);
- src += 8;
- dst += 8;
}
static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(
static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_vert) {
- uint8_t *dst_tmp1;
+ uint8_t *dst_tmp = dst;
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
__m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
__m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
- dst0 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
- dst1 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
- dst2 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
- dst3 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
+ dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
dst0 = __lsx_vilvl_w(dst1, dst0);
dst1 = __lsx_vilvl_w(dst3, dst2);
dst0 = __lsx_vilvl_d(dst1, dst0);
- dst1 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
- dst2 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
- dst3 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
- dst4 = __lsx_vldrepl_w(dst, 0);
- dst += dst_stride;
+ dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+ dst_tmp += dst_stride;
+ dst4 = __lsx_vldrepl_w(dst_tmp, 0);
dst1 = __lsx_vilvl_w(dst2, dst1);
dst2 = __lsx_vilvl_w(dst4, dst3);
dst1 = __lsx_vilvl_d(dst2, dst1);
DUP2_ARG2(__lsx_vpickev_b, tmp1, tmp0, tmp3, tmp2, res0, res1);
DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
- dst_tmp1 = dst;
- __lsx_vstelm_w(res0, dst_tmp1, 0, 0);
- dst_tmp1 += dst_stride;
- __lsx_vstelm_w(res0, dst_tmp1, 0, 1);
- dst_tmp1 += dst_stride;
- __lsx_vstelm_w(res0, dst_tmp1, 0, 2);
- dst_tmp1 += dst_stride;
- __lsx_vstelm_w(res0, dst_tmp1, 0, 3);
- dst_tmp1 += dst_stride;
-
- __lsx_vstelm_w(res1, dst_tmp1, 0, 0);
- dst_tmp1 += dst_stride;
- __lsx_vstelm_w(res1, dst_tmp1, 0, 1);
- dst_tmp1 += dst_stride;
- __lsx_vstelm_w(res1, dst_tmp1, 0, 2);
- dst_tmp1 += dst_stride;
- __lsx_vstelm_w(res1, dst_tmp1, 0, 3);
+ __lsx_vstelm_w(res0, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res0, dst, 0, 3);
+ dst += dst_stride;
+
+ __lsx_vstelm_w(res1, dst, 0, 0);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 1);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 2);
+ dst += dst_stride;
+ __lsx_vstelm_w(res1, dst, 0, 3);
}
static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
mask = __lsx_vld(mc_filt_mask_arr, 0);
/* rearranging filter */
filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
- filt_vt = __lsx_vldrepl_h(filtrt_ver, 0);
+ filt_vt = __lsx_vldrepl_h(filter_vert, 0);
src0 = __lsx_vld(src, 0);
DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
src, src_stride4, src1, src2, src3, src4);
- src += (src_stride4 + src_stride);
dst0 = __lsx_vldrepl_d(dst_tmp, 0);
dst_tmp += dst_stride;
dst2 = __lsx_vldrepl_d(dst_tmp, 0);
dst_tmp += dst_stride;
dst3 = __lsx_vldrepl_d(dst_tmp, 0);
- dst_tmp += dst_stride;
DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
- tmp3 = __lsx_vdp2_h_bu(vec1, filt_vt);
+ tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
DUP4_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp2,
FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1, tmp2, tmp3);
PCKEV_AVG_ST4_D(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride);
- dst -= dst_stride * 3;
}
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
for (; loop_cnt--;) {
src1 = __lsx_vld(src, 0);
DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
- src4 = __lsx_vlds(src, src_stride3);
+ src4 = __lsx_vldx(src, src_stride3);
src += src_stride4;
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
- tmp0 = __lsx_vavgr_bu(vec0, filt_vt);
+ tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
- tmp1 = __lsx_vavgr_bu(vec0, filt_vt);
+ tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
- tmp2 = __lsx_vavgr_bu(vec0, filt_vt);
+ tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
- tmp3 = __lsx_vavgr_bu(vec0, filt_vt);
+ tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
- DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vsrari_h, tmp2, FILTER_BITS, tmp3, FILTER_BITS, tmp2, tmp3);
dst0 = __lsx_vldrepl_d(dst_tmp, 0);
dst_tmp += dst_stride;
int32_t dst_stride2 = dst_stride << 1;
int32_t dst_stride3 = dst_stride2 + dst_stride;
- int32_t dst_stride4 = dst_stride2 << 1;
+ int32_t dst_stride4 = dst_stride << 2;
mask = __lsx_vld(mc_filt_mask_arr, 0);
/* rearranging filter */
src1 = __lsx_vld(src_tmp1, 0);
DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
src5);
- src5 = __lsx_vldx(src_tmp1, src_stride3);
+ src7 = __lsx_vldx(src_tmp1, src_stride3);
src += src_stride4;
dst0 = __lsx_vld(dst, 0);
DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
tmp3 = __lsx_vpickev_b(tmp1, tmp0);
tmp3 = __lsx_vavgr_bu(tmp3, dst0);
__lsx_vst(tmp3, dst, 0);
- dst += dst_stride;
hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
tmp3 = __lsx_vpickev_b(tmp1, tmp0);
tmp3 = __lsx_vavgr_bu(tmp3, dst1);
- __lsx_vst(tmp3, dst, 0);
- dst += dst_stride;
+ __lsx_vstx(tmp3, dst, dst_stride);
hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
tmp3 = __lsx_vpickev_b(tmp1, tmp0);
tmp3 = __lsx_vavgr_bu(tmp3, dst2);
- __lsx_vst(tmp3, dst, 0);
- dst += dst_stride;
+ __lsx_vstx(tmp3, dst, dst_stride2);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DUP2_ARG2(__lsx_vavgr_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
DUP2_ARG2(__lsx_vsrari_h, tmp0, FILTER_BITS, tmp1, FILTER_BITS, tmp0, tmp1);
tmp3 = __lsx_vpickev_b(tmp1, tmp0);
tmp3 = __lsx_vavgr_bu(tmp3, dst3);
- __lsx_vst(tmp3, dst, 0);
- dst += dst_stride;
+ __lsx_vstx(tmp3, dst, dst_stride3);
+ dst += dst_stride4;
}
}