2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h"
15 #include "vpx_ports/mem.h"
// Function type shared by the vp9_filter_block1d* kernels that are declared
// further down in this file. The parameters visible here are the source
// pointer and its pitch, the output pointer, and the output height.
// NOTE(review): the parameter list looks cut off in this view (no closing
// ");" is visible) - confirm the full signature against the kernel
// implementations before relying on it.
17 typedef void filter8_1dfunction (
18 const unsigned char *src_ptr,
19 const ptrdiff_t src_pitch,
20 unsigned char *output_ptr,
22 unsigned int output_height,
// FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)
// Expands to the definition of vp9_convolve8_<name>_<opt>().
//   name      - pasted into the generated function name and the _c fallback
//   step_q4   - the step argument that is compared against 16
//   filter    - the filter argument whose coefficients are inspected
//   dir       - letter pasted into the kernel names (h or v in this file)
//   src_start - source pointer handed to the "<dir>8" kernels
//   avg       - optional infix pasted into the kernel names (empty or avg_)
//   opt       - suffix pasted into the function and kernel names
// When step_q4 == 16 and filter[3] != 128, and any of filter[0..2] is
// non-zero, the expansion calls the vp9_filter_block1d{16,8,4}_<dir>8_
// kernels on src_start. The vp9_filter_block1d{16,8,4}_<dir>2_ kernels are
// passed src instead of src_start, presumably on the alternative branch, and
// vp9_convolve8_<name>_c() is presumably the fallback when the first test
// fails.
// NOTE(review): several lines of this macro (the remaining call arguments
// and the branch structure) are not visible in this view - verify the
// control flow against the complete definition.
26 #define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
27 void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
28 uint8_t *dst, ptrdiff_t dst_stride, \
29 const int16_t *filter_x, int x_step_q4, \
30 const int16_t *filter_y, int y_step_q4, \
32 if (step_q4 == 16 && filter[3] != 128) { \
33 if (filter[0] || filter[1] || filter[2]) { \
35 vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
46 vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
57 vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
69 vp9_filter_block1d16_##dir##2_##avg##opt(src, \
80 vp9_filter_block1d8_##dir##2_##avg##opt(src, \
91 vp9_filter_block1d4_##dir##2_##avg##opt(src, \
104 vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
105 filter_x, x_step_q4, filter_y, y_step_q4, \
// FUN_CONV_2D(avg, opt)
// Expands to the definition of vp9_convolve8_<avg><opt>(), built from the
// 1-D entry points: vp9_convolve8_horiz_<opt>() writes into a scratch array
// fdata2 (declared with DECLARE_ALIGNED_ARRAY(16, ...), pitch 64) and
// vp9_convolve8_<avg>vert_<opt>() then filters fdata2 into dst.
// The two-pass path is only entered when x_step_q4 == 16 and
// y_step_q4 == 16. Two scratch layouts are visible:
//   * 64 * 71 bytes, used when any of filter_x[0..2] / filter_y[0..2] is
//     non-zero or either filter[3] equals 128: the horizontal pass starts
//     3 rows above src and the vertical pass reads from fdata2 + 3 * 64.
//   * 64 * 65 bytes, presumably the else branch: the horizontal pass starts
//     at src and the vertical pass reads from fdata2.
// vp9_convolve8_<avg>c() receives the unmodified arguments, presumably when
// either step is not 16.
// NOTE(review): several lines of this macro (trailing call arguments and the
// else branches) are not visible in this view - verify against the complete
// definition.
110 #define FUN_CONV_2D(avg, opt) \
111 void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
112 uint8_t *dst, ptrdiff_t dst_stride, \
113 const int16_t *filter_x, int x_step_q4, \
114 const int16_t *filter_y, int y_step_q4, \
118 if (x_step_q4 == 16 && y_step_q4 == 16) { \
119 if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
120 filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
121 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
122 vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
123 filter_x, x_step_q4, filter_y, y_step_q4, \
125 vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
126 filter_x, x_step_q4, filter_y, \
129 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
130 vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
131 filter_x, x_step_q4, filter_y, y_step_q4, \
133 vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
134 filter_x, x_step_q4, filter_y, \
138 vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
139 filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
// AVX2 kernel set. Only the 16-wide "8" kernels (v8 and h8) are declared
// with an _avx2 suffix. The other ten kernel names that FUN_CONV_1D pastes
// together for opt == avx2 are mapped by the #defines below onto their
// _ssse3 counterparts, which are declared here so the aliases resolve.
143 filter8_1dfunction vp9_filter_block1d16_v8_avx2;
144 filter8_1dfunction vp9_filter_block1d16_h8_avx2;
145 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
146 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
147 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
148 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
149 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
150 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
151 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
152 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
153 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
154 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
// Aliases: _avx2 kernel names without a dedicated AVX2 declaration above.
155 #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
156 #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
157 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
158 #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
159 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
160 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
161 #define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
162 #define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
163 #define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
164 #define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
// Instantiate vp9_convolve8_horiz_avx2() and vp9_convolve8_vert_avx2() via
// FUN_CONV_1D. The horizontal variant uses src as src_start; the vertical
// variant uses src - src_stride * 3.
// NOTE(review): the prototypes quoted below stop after y_step_q4, so their
// trailing parameters are not shown here.
165 // void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
166 // uint8_t *dst, ptrdiff_t dst_stride,
167 // const int16_t *filter_x, int x_step_q4,
168 // const int16_t *filter_y, int y_step_q4,
170 // void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
171 // uint8_t *dst, ptrdiff_t dst_stride,
172 // const int16_t *filter_x, int x_step_q4,
173 // const int16_t *filter_y, int y_step_q4,
175 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
176 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
178 // void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
179 // uint8_t *dst, ptrdiff_t dst_stride,
180 // const int16_t *filter_x, int x_step_q4,
181 // const int16_t *filter_y, int y_step_q4,
// SSSE3 kernel set referenced by the FUN_CONV_1D expansions for
// opt == ssse3: block widths 16, 8 and 4, vertical (v) and horizontal (h).
// First the "8" kernels, plain then _avg_ variants.
186 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
187 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
188 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
189 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
190 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
191 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
192 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
193 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
194 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
195 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
196 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
197 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
// Then the "2" kernels, plain then _avg_ variants.
199 filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
200 filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
201 filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
202 filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
203 filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
204 filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
205 filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
206 filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
207 filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
208 filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
209 filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
210 filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
// Instantiate the SSSE3 1-D entry points quoted below via FUN_CONV_1D:
// horiz, vert, avg_horiz and avg_vert. The vertical variants pass
// src - src_stride * 3 as src_start, and a non-empty avg argument (avg_)
// selects the *_avg_* kernels.
// NOTE(review): the prototypes quoted below stop after y_step_q4, and the
// last FUN_CONV_1D invocation is cut off after "avg_," in this view -
// confirm its final argument against the complete file.
212 // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
213 // uint8_t *dst, ptrdiff_t dst_stride,
214 // const int16_t *filter_x, int x_step_q4,
215 // const int16_t *filter_y, int y_step_q4,
217 // void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
218 // uint8_t *dst, ptrdiff_t dst_stride,
219 // const int16_t *filter_x, int x_step_q4,
220 // const int16_t *filter_y, int y_step_q4,
222 // void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
223 // uint8_t *dst, ptrdiff_t dst_stride,
224 // const int16_t *filter_x, int x_step_q4,
225 // const int16_t *filter_y, int y_step_q4,
227 // void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
228 // uint8_t *dst, ptrdiff_t dst_stride,
229 // const int16_t *filter_x, int x_step_q4,
230 // const int16_t *filter_y, int y_step_q4,
232 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
233 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
234 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
235 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
// Instantiate the SSSE3 2-D entry points via FUN_CONV_2D:
// vp9_convolve8_ssse3() (empty avg argument) and vp9_convolve8_avg_ssse3().
// NOTE(review): the prototypes quoted below stop after y_step_q4, so their
// trailing parameters are not shown here.
238 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
239 // uint8_t *dst, ptrdiff_t dst_stride,
240 // const int16_t *filter_x, int x_step_q4,
241 // const int16_t *filter_y, int y_step_q4,
243 // void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
244 // uint8_t *dst, ptrdiff_t dst_stride,
245 // const int16_t *filter_x, int x_step_q4,
246 // const int16_t *filter_y, int y_step_q4,
248 FUN_CONV_2D(, ssse3);
249 FUN_CONV_2D(avg_ , ssse3);
// SSE2 kernel set referenced by the FUN_CONV_1D expansions for opt == sse2:
// block widths 16, 8 and 4, vertical (v) and horizontal (h).
// First the "8" kernels, plain then _avg_ variants.
253 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
254 filter8_1dfunction vp9_filter_block1d16_h8_sse2;
255 filter8_1dfunction vp9_filter_block1d8_v8_sse2;
256 filter8_1dfunction vp9_filter_block1d8_h8_sse2;
257 filter8_1dfunction vp9_filter_block1d4_v8_sse2;
258 filter8_1dfunction vp9_filter_block1d4_h8_sse2;
259 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
260 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
261 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
262 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
263 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
264 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
// Then the "2" kernels, plain then _avg_ variants.
266 filter8_1dfunction vp9_filter_block1d16_v2_sse2;
267 filter8_1dfunction vp9_filter_block1d16_h2_sse2;
268 filter8_1dfunction vp9_filter_block1d8_v2_sse2;
269 filter8_1dfunction vp9_filter_block1d8_h2_sse2;
270 filter8_1dfunction vp9_filter_block1d4_v2_sse2;
271 filter8_1dfunction vp9_filter_block1d4_h2_sse2;
272 filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
273 filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
274 filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
275 filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
276 filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
277 filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
// Instantiate the four SSE2 1-D entry points via FUN_CONV_1D:
// vp9_convolve8_horiz_sse2(), vp9_convolve8_vert_sse2(),
// vp9_convolve8_avg_horiz_sse2() and vp9_convolve8_avg_vert_sse2().
// The vertical variants pass src - src_stride * 3 as src_start, and the
// avg_ argument selects the *_avg_sse2 kernels.
// NOTE(review): the prototypes quoted below stop after y_step_q4, so their
// trailing parameters are not shown here.
279 // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
280 // uint8_t *dst, ptrdiff_t dst_stride,
281 // const int16_t *filter_x, int x_step_q4,
282 // const int16_t *filter_y, int y_step_q4,
284 // void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
285 // uint8_t *dst, ptrdiff_t dst_stride,
286 // const int16_t *filter_x, int x_step_q4,
287 // const int16_t *filter_y, int y_step_q4,
289 // void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
290 // uint8_t *dst, ptrdiff_t dst_stride,
291 // const int16_t *filter_x, int x_step_q4,
292 // const int16_t *filter_y, int y_step_q4,
294 // void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
295 // uint8_t *dst, ptrdiff_t dst_stride,
296 // const int16_t *filter_x, int x_step_q4,
297 // const int16_t *filter_y, int y_step_q4,
299 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
300 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
301 FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
302 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
// Instantiate the SSE2 2-D entry point vp9_convolve8_avg_sse2() via
// FUN_CONV_2D.
// NOTE(review): the comment below also quotes vp9_convolve8_sse2(), but only
// the avg_ instantiation is visible in this view - confirm that the plain
// FUN_CONV_2D(, sse2) instantiation exists in the complete file. The quoted
// prototypes also stop after y_step_q4.
304 // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
305 // uint8_t *dst, ptrdiff_t dst_stride,
306 // const int16_t *filter_x, int x_step_q4,
307 // const int16_t *filter_y, int y_step_q4,
309 // void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
310 // uint8_t *dst, ptrdiff_t dst_stride,
311 // const int16_t *filter_x, int x_step_q4,
312 // const int16_t *filter_y, int y_step_q4,
315 FUN_CONV_2D(avg_ , sse2);