2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
16 /****************************************************************************
18 ****************************************************************************/
/*
 * ROUNDUP32(X): converts X to unsigned long and rounds it up to the next
 * multiple of 32; a value that is already a multiple of 32 is unchanged.
 *
 * The argument is parenthesized so the cast applies to the whole expression
 * (e.g. ROUNDUP32(p + 1) converts the address p + 1, not p).  The mask is
 * written as ~31UL so it is as wide as unsigned long; the previous
 * 0xFFFFFFE0 constant cleared the upper bits wherever unsigned long is
 * wider than 32 bits.
 */
#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 ) & ~31UL )
22 /****************************************************************************
24 ****************************************************************************/
/*
 * Declared here, defined elsewhere. Takes three int pointers and returns
 * nothing, so any results come back through those pointers.
 * NOTE(review): going by the parameter names, these presumably report
 * MMX, SSE (xmm) and SSE2 ("wmt") availability -- confirm against the
 * definition.
 */
25 extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
27 /****************************************************************************
28 * Exported Global Variables
29 ****************************************************************************/
/*
 * Exported function pointer for the temporal filter. Its parameter list
 * (ppi, s, d, bytes, strength) matches the INPUTS documented for
 * temp_filter_wmt and temp_filter_mmx in this file.
 * NOTE(review): presumably assigned to one of those routines at start-up;
 * the assignment is not visible in this section -- confirm.
 */
30 void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
32 /****************************************************************************
34 * ROUTINE : temp_filter_wmt
36 * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
37 * unsigned char *s : Pointer to source frame.
38 * unsigned char *d : Pointer to destination frame.
39 * int bytes : Number of bytes to filter.
40 * int strength : Strength of filter to apply.
46 * FUNCTION : Performs a closeness-adjusted temporal blur
48 * SPECIAL NOTES : Destination frame can be same as source frame.
50 ****************************************************************************/
53 pre_proc_instance *ppi,
61 unsigned char *frameptr = ppi->frame_buffer;
63 __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
64 __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
75 for (i = 0; i < 8; i++)
77 *frameptr = s[byte+i];
83 while (frame < FRAMECOUNT);
85 for (i = 0; i < 8; i++)
86 d[byte+i] = s[byte+i];
96 int offset2 = (ppi->frame % FRAMECOUNT);
100 __declspec(align(16)) unsigned short counts[8];
101 __declspec(align(16)) unsigned short sums[8];
105 mov edi, s // source pixels
106 pxor xmm1, xmm1 // accumulator
110 mov esi, frameptr // accumulator
111 pxor xmm2, xmm2 // count
113 movq xmm3, QWORD PTR [edi]
115 movq QWORD PTR [esi+8*eax], xmm3
117 punpcklbw xmm3, xmm2 // xmm3 source pixels
121 movq xmm4, QWORD PTR [esi] // get frame buffer values
122 punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
123 movdqa xmm6, xmm4 // save the pixel values
124 psubsw xmm4, xmm3 // subtracted pixel values
125 pmullw xmm4, xmm4 // square xmm4
127 psrlw xmm4, xmm5 // should be strength
128 pmullw xmm4, threes // 3 * modifier
129 movdqa xmm5, sixteens // 16s
130 psubusw xmm5, xmm4 // 16 - modifiers
131 movdqa xmm4, xmm5 // save the modifiers
132 pmullw xmm4, xmm6 // multiplier values
133 paddusw xmm1, xmm4 // accumulator
134 paddusw xmm2, xmm5 // count
135 add esi, 8 // next frame
136 dec ecx // next set of eight pixels
140 psrlw xmm2, 1 // divide count by 2 for rounding
141 paddusw xmm1, xmm2 // rounding added in
148 for (i = 0; i < 8; i++)
150 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
159 while (byte < bytes);
166 /****************************************************************************
168 * ROUTINE : temp_filter_mmx
170 * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
171 * unsigned char *s : Pointer to source frame.
172 * unsigned char *d : Pointer to destination frame.
173 * int bytes : Number of bytes to filter.
174 * int strength : Strength of filter to apply.
180 * FUNCTION : Performs a closeness-adjusted temporal blur
182 * SPECIAL NOTES : Destination frame can be same as source frame.
184 ****************************************************************************/
187 pre_proc_instance *ppi,
195 unsigned char *frameptr = ppi->frame_buffer;
197 __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
198 __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
209 for (i = 0; i < 4; i++)
211 *frameptr = s[byte+i];
217 while (frame < FRAMECOUNT);
219 for (i = 0; i < 4; i++)
220 d[byte+i] = s[byte+i];
225 while (byte < bytes);
230 int offset2 = (ppi->frame % FRAMECOUNT);
234 __declspec(align(16)) unsigned short counts[8];
235 __declspec(align(16)) unsigned short sums[8];
240 mov edi, s // source pixels
241 pxor mm1, mm1 // accumulator
244 mov esi, frameptr // accumulator
245 pxor mm2, mm2 // count
247 movd mm3, DWORD PTR [edi]
248 movd DWORD PTR [esi+4*eax], mm3
250 punpcklbw mm3, mm2 // mm3 source pixels
254 movd mm4, DWORD PTR [esi] // get frame buffer values
255 punpcklbw mm4, mm7 // mm4 frame buffer pixels
256 movq mm6, mm4 // save the pixel values
257 psubsw mm4, mm3 // subtracted pixel values
258 pmullw mm4, mm4 // square mm4
260 psrlw mm4, mm5 // should be strength
261 pmullw mm4, threes // 3 * modifier
262 movq mm5, sixteens // 16s
263 psubusw mm5, mm4 // 16 - modifiers
264 movq mm4, mm5 // save the modifiers
265 pmullw mm4, mm6 // multiplier values
266 paddusw mm1, mm4 // accumulator
267 paddusw mm2, mm5 // count
268 add esi, 4 // next frame
269 dec ecx // next set of eight pixels
273 psrlw mm2, 1 // divide count by 2 for rounding
274 paddusw mm1, mm2 // rounding added in
282 for (i = 0; i < 4; i++)
284 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
293 while (byte < bytes);