2 * Copyright 2009 The Android Open Source Project
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
8 #include "SkBitmapFilter_opts_SSE2.h"
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSSE3.h"
11 #include "SkBitmapScaler.h"
12 #include "SkBlitMask.h"
13 #include "SkBlitRect_opts_SSE2.h"
14 #include "SkBlitRow.h"
15 #include "SkBlitRow_opts_SSE2.h"
16 #include "SkBlitRow_opts_SSE4.h"
17 #include "SkBlurImage_opts_SSE2.h"
18 #include "SkBlurImage_opts_SSE4.h"
19 #include "SkLazyPtr.h"
20 #include "SkMorphology_opts.h"
21 #include "SkMorphology_opts_SSE2.h"
24 #include "SkUtils_opts_SSE2.h"
25 #include "SkXfermode.h"
26 #include "SkXfermode_proccoeff.h"
28 #if defined(_MSC_VER) && defined(_WIN64)
32 /* This file must *not* be compiled with -msse or any other optional SIMD
33 extension; otherwise gcc may generate SIMD instructions even for scalar ops
34 (and thus trigger an invalid-instruction fault on a Pentium3 in the code below).
35 For example, only files named *_SSE2.cpp in this directory should be
36 compiled with -msse2 or higher. */
39 /* Function to get the CPU SSE-level at run-time, for different compilers. */
41 static inline void getcpuid(int info_type, int info[4]) {
43 __cpuid(info, info_type);
56 #elif defined(__x86_64__)
57 static inline void getcpuid(int info_type, int info[4]) {
60 : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
65 static inline void getcpuid(int info_type, int info[4]) {
66 // We save and restore ebx, so this code can be compatible with -fPIC
72 : "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
78 ////////////////////////////////////////////////////////////////////////////////
80 /* Fetch the SIMD level directly from the CPU, at run-time.
81 * Only checks the levels needed by the optimizations in this file.
83 namespace { // get_SIMD_level() technically must have external linkage, so no static.
84 int* get_SIMD_level() {
85 int cpu_info[4] = { 0, 0, 0, 0 };
86 getcpuid(1, cpu_info);
88 int* level = SkNEW(int);
90 if ((cpu_info[2] & (1<<20)) != 0) {
91 *level = SK_CPU_SSE_LEVEL_SSE42;
92 } else if ((cpu_info[2] & (1<<19)) != 0) {
93 *level = SK_CPU_SSE_LEVEL_SSE41;
94 } else if ((cpu_info[2] & (1<<9)) != 0) {
95 *level = SK_CPU_SSE_LEVEL_SSSE3;
96 } else if ((cpu_info[3] & (1<<26)) != 0) {
97 *level = SK_CPU_SSE_LEVEL_SSE2;
105 SK_DECLARE_STATIC_LAZY_PTR(int, gSIMDLevel, get_SIMD_level); // read via gSIMDLevel.get() in supports_simd(); NOTE(review): presumably get_SIMD_level() runs once on first get() - confirm in SkLazyPtr.h
107 /* Verify that the requested SIMD level is supported in the build.
108 * If not, check if the platform supports it.
110 static inline bool supports_simd(int minLevel) {
111 #if defined(SK_CPU_SSE_LEVEL)
112 if (minLevel <= SK_CPU_SSE_LEVEL) {
117 #if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
118 /* For the Android framework we should always know at compile time if the device
119 * we are building for supports SSSE3. The one exception to this rule is on the
120 * emulator where we are compiled without the -mssse3 option (so we have no
121 * SSSE3 procs) but can be run on a host machine that supports SSSE3
122 * instructions. So for that particular case we disable our SSSE3 options.
126 return minLevel <= *gSIMDLevel.get();
131 ////////////////////////////////////////////////////////////////////////////////
133 SK_CONF_DECLARE( bool, c_hqfilter_sse, "bitmap.filter.highQualitySSE", true, "Use SSE optimized version of high quality image filters"); // checked in SkBitmapProcState::platformProcs() before swapping highQualityFilter32 for highQualityFilter_SSE2
135 void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
136 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
137 procs->fExtraHorizontalReads = 3;
138 procs->fConvolveVertically = &convolveVertically_SSE2;
139 procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
140 procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
141 procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
145 ////////////////////////////////////////////////////////////////////////////////
147 void SkBitmapProcState::platformProcs() {
148 /* Every optimization in the function requires at least SSE2 */
149 if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
153 /* Check fSampleProc32 */
154 if (fSampleProc32 == S32_opaque_D32_filter_DX) {
155 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
156 fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
158 fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
160 } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
161 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
162 fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
164 } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
165 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
166 fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
168 fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
170 } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
171 if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) {
172 fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
176 /* Check fSampleProc16 */
177 if (fSampleProc16 == S32_D16_filter_DX) {
178 fSampleProc16 = S32_D16_filter_DX_SSE2;
181 /* Check fMatrixProc */
182 if (fMatrixProc == ClampX_ClampY_filter_scale) {
183 fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
184 } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
185 fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
186 } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
187 fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
188 } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
189 fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
192 /* Check fShaderProc32 */
193 if (c_hqfilter_sse) {
194 if (fShaderProc32 == highQualityFilter32) {
195 fShaderProc32 = highQualityFilter_SSE2;
200 ////////////////////////////////////////////////////////////////////////////////
202 static SkBlitRow::Proc platform_16_procs[] = {
203 S32_D565_Opaque_SSE2, // S32_D565_Opaque
204 NULL, // S32_D565_Blend
205 S32A_D565_Opaque_SSE2, // S32A_D565_Opaque
206 NULL, // S32A_D565_Blend
207 S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither
208 NULL, // S32_D565_Blend_Dither
209 S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither
210 NULL, // S32A_D565_Blend_Dither
213 SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
214 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
215 return platform_16_procs[flags];
221 static SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
223 S32_Blend_BlitRow32_SSE2, // S32_Blend,
224 S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque
225 S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
228 #if defined(SK_ATT_ASM_SUPPORTED)
229 static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
231 S32_Blend_BlitRow32_SSE2, // S32_Blend,
232 S32A_Opaque_BlitRow32_SSE4_asm, // S32A_Opaque
233 S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
237 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
238 #if defined(SK_ATT_ASM_SUPPORTED)
239 if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
240 return platform_32_procs_SSE4[flags];
243 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
244 return platform_32_procs_SSE2[flags];
250 SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
251 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
258 SkBlitRow::ColorRectProc PlatformColorRectProcFactory(); // suppress warning
260 SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
261 /* Return NULL for now, since the optimized path in ColorRect32_SSE2 is disabled.
262 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
263 return ColorRect32_SSE2;
271 ////////////////////////////////////////////////////////////////////////////////
273 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkColorType dstCT,
274 SkMask::Format maskFormat,
276 if (SkMask::kA8_Format != maskFormat) {
280 ColorProc proc = NULL;
281 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
283 case kN32_SkColorType:
284 // The SSE2 version is not (yet) faster for black, so we check
286 if (SK_ColorBLACK != color) {
287 proc = SkARGB32_A8_BlitMask_SSE2;
297 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
298 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
300 return SkBlitLCD16OpaqueRow_SSE2;
302 return SkBlitLCD16Row_SSE2;
310 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
314 ////////////////////////////////////////////////////////////////////////////////
316 SkMemset16Proc SkMemset16GetPlatformProc() {
317 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
318 return sk_memset16_SSE2;
324 SkMemset32Proc SkMemset32GetPlatformProc() {
325 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
326 return sk_memset32_SSE2;
332 SkMemcpy32Proc SkMemcpy32GetPlatformProc() {
333 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
334 return sk_memcpy32_SSE2;
340 ////////////////////////////////////////////////////////////////////////////////
342 SkMorphologyImageFilter::Proc SkMorphologyGetPlatformProc(SkMorphologyProcType type) {
343 if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
347 case kDilateX_SkMorphologyProcType:
348 return SkDilateX_SSE2;
349 case kDilateY_SkMorphologyProcType:
350 return SkDilateY_SSE2;
351 case kErodeX_SkMorphologyProcType:
352 return SkErodeX_SSE2;
353 case kErodeY_SkMorphologyProcType:
354 return SkErodeY_SSE2;
360 ////////////////////////////////////////////////////////////////////////////////
362 bool SkBoxBlurGetPlatformProcs(SkBoxBlurProc* boxBlurX,
363 SkBoxBlurProc* boxBlurY,
364 SkBoxBlurProc* boxBlurXY,
365 SkBoxBlurProc* boxBlurYX) {
366 #ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
369 if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
370 return SkBoxBlurGetPlatformProcs_SSE4(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
372 else if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
373 return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
379 ////////////////////////////////////////////////////////////////////////////////
381 extern SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
382 SkXfermode::Mode mode);
384 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
385 SkXfermode::Mode mode);
387 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl(const ProcCoeff& rec,
388 SkXfermode::Mode mode) {
392 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
393 SkXfermode::Mode mode);
395 SkProcCoeffXfermode* SkPlatformXfermodeFactory(const ProcCoeff& rec,
396 SkXfermode::Mode mode) {
397 if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
398 return SkPlatformXfermodeFactory_impl_SSE2(rec, mode);
400 return SkPlatformXfermodeFactory_impl(rec, mode);
404 SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode);
406 SkXfermodeProc SkPlatformXfermodeProcFactory(SkXfermode::Mode mode) {