2 Simple DirectMedia Layer
3 Copyright (C) 1997-2016 Sam Lantinga <slouken@libsdl.org>
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
21 #include "../SDL_internal.h"
23 #include "SDL_video.h"
26 /* Functions to perform alpha blended blitting */
28 /* N->1 blending with per-surface alpha */
/*
 * Blends source pixels into a destination that is addressed as one byte per
 * pixel and interpreted as a palette index, using the single alpha value read
 * from info->a.
 * NOTE(review): the embedded original line numbers are not contiguous
 * (e.g. 29, 31, 42-44 and 46-50 are absent), so the return type, the
 * Pixel/colour locals and the loop headers are not visible in this listing.
 */
30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
32 int width = info->dst_w;
33 int height = info->dst_h;
34 Uint8 *src = info->src;
35 int srcskip = info->src_skip;
36 Uint8 *dst = info->dst;
37 int dstskip = info->dst_skip;
/* Optional index remap table; NULL selects the direct write further below. */
38 Uint8 *palmap = info->table;
39 SDL_PixelFormat *srcfmt = info->src_fmt;
40 SDL_PixelFormat *dstfmt = info->dst_fmt;
41 int srcbpp = srcfmt->BytesPerPixel;
/* One alpha value for the whole blit. */
45 const unsigned A = info->a;
/* DISEMBLE_RGB is defined elsewhere; presumably it fills Pixel and sR/sG/sB
   from the pixel at src -- confirm against its definition. */
51 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* The current destination byte is a palette index: fetch its colour. */
52 dR = dstfmt->palette->colors[*dst].r;
53 dG = dstfmt->palette->colors[*dst].g;
54 dB = dstfmt->palette->colors[*dst].b;
55 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
59 /* Pack RGB into 8bit pixel */
/* Layout is RRRGGGBB (assuming 8-bit channel values): R>>5 goes to bits 7-5,
   G>>5 to bits 4-2 and B>>6 to bits 1-0.  With a remap table the packed
   value is used as an index into palmap instead of being stored directly. */
60 if ( palmap == NULL ) {
61 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
63 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
75 /* N->1 blending with pixel alpha */
/*
 * Same structure as the per-surface variant, except that the alpha value (sA)
 * is decoded from every source pixel together with its colour.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, some locals and the loop headers are not visible here.
 */
77 BlitNto1PixelAlpha(SDL_BlitInfo * info)
79 int width = info->dst_w;
80 int height = info->dst_h;
81 Uint8 *src = info->src;
82 int srcskip = info->src_skip;
83 Uint8 *dst = info->dst;
84 int dstskip = info->dst_skip;
/* Optional index remap table; NULL selects the direct write further below. */
85 Uint8 *palmap = info->table;
86 SDL_PixelFormat *srcfmt = info->src_fmt;
87 SDL_PixelFormat *dstfmt = info->dst_fmt;
88 int srcbpp = srcfmt->BytesPerPixel;
90 unsigned sR, sG, sB, sA;
/* DISEMBLE_RGBA is defined elsewhere; presumably it fills Pixel, the colour
   channels and sA from the pixel at src -- confirm against its definition. */
97 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
/* The current destination byte is a palette index: fetch its colour. */
98 dR = dstfmt->palette->colors[*dst].r;
99 dG = dstfmt->palette->colors[*dst].g;
100 dB = dstfmt->palette->colors[*dst].b;
/* Blend with the per-pixel alpha sA rather than a surface-wide value. */
101 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
105 /* Pack RGB into 8bit pixel */
/* Layout is RRRGGGBB (assuming 8-bit channel values); with a remap table the
   packed value indexes palmap instead of being stored directly. */
106 if ( palmap == NULL ) {
107 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
109 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
121 /* colorkeyed N->1 blending with per-surface alpha */
/*
 * Per-surface alpha blend into a palette-indexed destination that leaves the
 * destination untouched for source pixels equal to info->colorkey.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, some locals and the loop headers are not visible here.
 */
123 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
125 int width = info->dst_w;
126 int height = info->dst_h;
127 Uint8 *src = info->src;
128 int srcskip = info->src_skip;
129 Uint8 *dst = info->dst;
130 int dstskip = info->dst_skip;
/* Optional index remap table; NULL selects the direct write further below. */
131 Uint8 *palmap = info->table;
132 SDL_PixelFormat *srcfmt = info->src_fmt;
133 SDL_PixelFormat *dstfmt = info->dst_fmt;
134 int srcbpp = srcfmt->BytesPerPixel;
/* Raw source pixel value that marks a transparent pixel. */
135 Uint32 ckey = info->colorkey;
139 const unsigned A = info->a;
145 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
/* Only pixels that differ from the colour key are blended and written. */
146 if ( Pixel != ckey ) {
147 dR = dstfmt->palette->colors[*dst].r;
148 dG = dstfmt->palette->colors[*dst].g;
149 dB = dstfmt->palette->colors[*dst].b;
150 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
154 /* Pack RGB into 8bit pixel */
/* Layout is RRRGGGBB (assuming 8-bit channel values); with a remap table the
   packed value indexes palmap instead of being stored directly. */
155 if ( palmap == NULL ) {
156 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
158 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * MMX variant: averages source and destination (a 50% blend) and ORs the
 * destination format's alpha mask into every written pixel.
 * The average is computed without unpacking channels:
 *   ((s & 0xfefefe) + (d & 0xfefefe)) >> 1   halves each channel after the
 *                                             low bit of each was cleared, so
 *                                             no bit shifts into a neighbour;
 *   + (s & d & 0x010101)                      adds back 1 where both low bits
 *                                             were set.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the row loop and the declaration of n/s/d are not visible.
 */
175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
177 int width = info->dst_w;
178 int height = info->dst_h;
179 Uint32 *srcp = (Uint32 *) info->src;
/* Skips are divided by 4 because the pointers step in 32-bit pixels
   (presumably the info fields are byte counts -- confirm). */
180 int srcskip = info->src_skip >> 2;
181 Uint32 *dstp = (Uint32 *) info->dst;
182 int dstskip = info->dst_skip >> 2;
183 Uint32 dalpha = info->dst_fmt->Amask;
185 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
187 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
188 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
189 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
/* Scalar form of the same blend for a single pixel; the condition guarding
   it is not visible in this listing. */
196 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
197 + (s & d & 0x00010101)) | dalpha;
/* Each iteration handles two 32-bit pixels held in one __m64, hence n / 2. */
201 for (n >>= 1; n > 0; --n) {
202 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
203 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
205 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
206 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
208 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
209 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
210 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
211 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
213 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
214 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
215 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
216 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
218 *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */
229 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * MMX variant for an arbitrary per-surface alpha (info->a).  Each channel is
 * computed as dst + (((src - dst) * alpha) >> 8) on 16-bit lanes, and the
 * destination alpha mask is ORed into the result.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the else branch header, the declarations of amult/chanmask/n
 * and the row loop are not visible here.
 */
231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
233 SDL_PixelFormat *df = info->dst_fmt;
235 unsigned alpha = info->a;
237 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
238 /* only call a128 version when R,G,B occupy lower bits */
239 BlitRGBtoRGBSurfaceAlpha128MMX(info);
241 int width = info->dst_w;
242 int height = info->dst_h;
243 Uint32 *srcp = (Uint32 *) info->src;
244 int srcskip = info->src_skip >> 2;
245 Uint32 *dstp = (Uint32 *) info->dst;
246 int dstskip = info->dst_skip >> 2;
247 Uint32 dalpha = df->Amask;
250 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
252 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
253 /* form the alpha mult */
/* Replicate alpha into all four bytes of amult (assuming alpha <= 0xff). */
254 amult = alpha | (alpha << 8);
255 amult = amult | (amult << 16);
/* The next two lines are the tail of an expression whose left-hand side is
   not visible; it builds a mask with 0xff at the R, G and B byte positions. */
257 (0xff << df->Rshift) | (0xff << df->
258 Gshift) | (0xff << df->Bshift);
259 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
260 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
261 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
262 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
267 /* One Pixel Blend */
268 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
269 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
271 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
272 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
274 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
275 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
276 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
277 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
279 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
280 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
281 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
/* Each iteration handles two 32-bit pixels held in one __m64, hence n / 2. */
289 for (n >>= 1; n > 0; --n) {
290 /* Two Pixels Blend */
291 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
292 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
293 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
294 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
296 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
297 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
298 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
299 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
301 src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */
302 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
303 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
304 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
306 src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */
307 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
308 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
309 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
311 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
312 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
314 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MMX variant that blends one pixel at a time using the alpha stored in each
 * source pixel: result = (src * alpha >> 8) + (dst * (alpha ^ 0xff) >> 8) per
 * 16-bit lane.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the initial value of multmask, the loops and the bodies of the
 * first two alpha branches are not visible here.
 */
328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
330 int width = info->dst_w;
331 int height = info->dst_h;
332 Uint32 *srcp = (Uint32 *) info->src;
333 int srcskip = info->src_skip >> 2;
334 Uint32 *dstp = (Uint32 *) info->dst;
335 int dstskip = info->dst_skip >> 2;
336 SDL_PixelFormat *sf = info->src_fmt;
337 Uint32 amask = sf->Amask;
338 Uint32 ashift = sf->Ashift;
339 Uint64 multmask, multmask2;
341 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
343 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* multmask is moved to the alpha channel's 16-bit lane: after unpacking,
   every byte occupies 16 bits, hence the shift by ashift * 2. */
345 multmask <<= (ashift * 2);
/* 0x00FF in every lane; XORing an 8-bit value with it yields 255 - value. */
346 multmask2 = 0x00FF00FF00FF00FFULL;
/* alpha stays in its in-pixel bit position here (masked, not shifted). */
351 Uint32 alpha = *srcp & amask;
/* All alpha bits set; what this branch does is not visible in this listing. */
354 } else if (alpha == amask) {
357 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
358 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
360 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
361 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
363 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
364 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
365 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
366 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
367 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
368 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - alpha (per lane) -> mm_alpha2 */
/* Weighted sum: source scaled by mm_alpha, destination by mm_alpha2. */
371 src1 = _mm_mullo_pi16(src1, mm_alpha);
372 src1 = _mm_srli_pi16(src1, 8);
373 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
374 dst1 = _mm_srli_pi16(dst1, 8);
375 dst1 = _mm_add_pi16(src1, dst1);
376 dst1 = _mm_packs_pu16(dst1, mm_zero);
378 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
392 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * Plain C variant: writes the per-channel average of source and destination
 * and forces the top byte of every written pixel to 0xff.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the loops and the declarations of s and d are not visible.
 */
394 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
396 int width = info->dst_w;
397 int height = info->dst_h;
398 Uint32 *srcp = (Uint32 *) info->src;
/* Skips are divided by 4 because the pointers step in 32-bit pixels
   (presumably the info fields are byte counts -- confirm). */
399 int srcskip = info->src_skip >> 2;
400 Uint32 *dstp = (Uint32 *) info->dst;
401 int dstskip = info->dst_skip >> 2;
/* Halve each channel after clearing its low bit (so nothing shifts into the
   neighbouring channel), then add back 1 where both low bits were set. */
408 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
409 + (s & d & 0x00010101)) | 0xff000000;
417 /* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * Plain C variant for a per-surface alpha (info->a).  Channels are blended as
 * d + (((s - d) * alpha) >> 8); the written pixel always gets 0xff in its top
 * byte.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the condition that selects the alpha=128 routine, the loops,
 * the extraction of s1/d1/s/d and the end of the d1 statement are not
 * visible here.
 */
419 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
421 unsigned alpha = info->a;
/* Delegates the whole blit to the 50% special case. */
423 BlitRGBtoRGBSurfaceAlpha128(info);
425 int width = info->dst_w;
426 int height = info->dst_h;
427 Uint32 *srcp = (Uint32 *) info->src;
428 int srcskip = info->src_skip >> 2;
429 Uint32 *dstp = (Uint32 *) info->dst;
430 int dstskip = info->dst_skip >> 2;
/* This statement continues on a line that is not visible in this listing. */
443 d1 = (d1 + ((s1 - d1) * alpha >> 8))
/* The 0xff00 mask keeps only bits 8-15 of this blended value. */
447 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
448 *dstp = d1 | d | 0xff000000;
459 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Plain C variant using the alpha stored in the top byte of each source
 * pixel.  Visible steps: a special case for alpha == SDL_ALPHA_OPAQUE, a
 * combined blend masked with 0xff00ff, a separate blend masked with 0xff00,
 * and a destination alpha computed as
 *   dalpha = alpha + ((dalpha * (alpha ^ 0xFF)) >> 8)
 * which is stored in the top byte of the result.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the loops, the declarations of s/d/s1/d1/dalpha and the body
 * of the opaque branch are not visible.  The FIXME comment below also has no
 * visible closing delimiter in this listing.
 */
461 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
463 int width = info->dst_w;
464 int height = info->dst_h;
465 Uint32 *srcp = (Uint32 *) info->src;
466 int srcskip = info->src_skip >> 2;
467 Uint32 *dstp = (Uint32 *) info->dst;
468 int dstskip = info->dst_skip >> 2;
/* Source alpha is the top byte of the 32-bit source pixel s. */
478 Uint32 alpha = s >> 24;
479 /* FIXME: Here we special-case opaque alpha since the
480 compositioning used (>>8 instead of /255) doesn't handle
481 it correctly. Also special-case alpha=0 for speed?
484 if (alpha == SDL_ALPHA_OPAQUE) {
488 * take out the middle component (green), and process
489 * the other two in parallel. One multiply less.
495 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
498 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
499 dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
500 *dstp = d1 | d | (dalpha << 24);
513 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Same per-pixel blend as the plain MMX routine, with prefetch hints issued
 * for addresses 16 pixels ahead of the current source and destination.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the initial value of multmask, the loops, the declaration of
 * alpha and the bodies of the first two alpha branches are not visible here.
 */
515 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
517 int width = info->dst_w;
518 int height = info->dst_h;
519 Uint32 *srcp = (Uint32 *) info->src;
520 int srcskip = info->src_skip >> 2;
521 Uint32 *dstp = (Uint32 *) info->dst;
522 int dstskip = info->dst_skip >> 2;
523 SDL_PixelFormat *sf = info->src_fmt;
524 Uint32 amask = sf->Amask;
525 Uint32 ashift = sf->Ashift;
526 Uint64 multmask, multmask2;
528 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
530 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* multmask is moved to the alpha channel's 16-bit lane: after unpacking,
   every byte occupies 16 bits, hence the shift by ashift * 2. */
532 multmask <<= (ashift * 2);
/* 0x00FF in every lane; XORing an 8-bit value with it yields 255 - value. */
533 multmask2 = 0x00FF00FF00FF00FFULL;
/* Hint the cache about data 16 pixels ahead in both buffers. */
540 _m_prefetch(srcp + 16);
541 _m_prefetch(dstp + 16);
/* alpha stays in its in-pixel bit position here (masked, not shifted). */
543 alpha = *srcp & amask;
/* All alpha bits set; what this branch does is not visible in this listing. */
546 } else if (alpha == amask) {
549 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
550 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
552 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
553 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
555 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
556 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
557 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
558 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
559 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
560 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - alpha (per lane) -> mm_alpha2 */
/* Weighted sum: source scaled by mm_alpha, destination by mm_alpha2. */
564 src1 = _mm_mullo_pi16(src1, mm_alpha);
565 src1 = _mm_srli_pi16(src1, 8);
566 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
567 dst1 = _mm_srli_pi16(dst1, 8);
568 dst1 = _mm_add_pi16(src1, dst1);
569 dst1 = _mm_packs_pu16(dst1, mm_zero);
571 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
583 #endif /* __3dNOW__ */
585 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
/*
 * Blend a single 16-bit pixel at 50%.
 *
 * d, s  - destination and source pixel values.
 * mask  - every channel bit except the lowest bit of each channel
 *         (the callers in this file pass 0xf7de for 565 and 0xfbde for 555).
 *
 * The masked values are added and halved, which averages every channel
 * without letting a bit shift into the neighbouring channel; the final term
 * adds back 1 in each channel whose low bit was set in both inputs.
 *
 * Every argument is parenthesized so that expression arguments such as
 * `a | b` keep their meaning.  Arguments are still evaluated more than once,
 * so they must be free of side effects.
 */
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))
/*
 * Blend two 16-bit pixels, packed into one 32-bit word, at 50%.
 *
 * d, s  - 32-bit words each holding two destination / source pixels.
 * mask  - 16-bit per-pixel mask as used by BLEND16_50; it is replicated into
 *         both halves of the word here.
 *
 * The mask is widened to Uint32 before it is shifted: a Uint16 argument would
 * otherwise be promoted to (signed) int, and shifting a value such as 0xf7de
 * left by 16 does not fit in a 32-bit int, which is undefined behaviour.
 *
 * Every argument is parenthesized so that expression arguments keep their
 * meaning.  Arguments are still evaluated more than once, so they must be
 * free of side effects.
 */
#define BLEND2x16_50(d, s, mask) \
    ((((s) & ((mask) | ((Uint32)(mask) << 16))) >> 1) \
     + (((d) & ((mask) | ((Uint32)(mask) << 16))) >> 1) \
     + ((s) & (d) & (~((mask) | ((Uint32)(mask) << 16)))))
/*
 * 50% blend between two 16-bit surfaces.  mask is passed straight to
 * BLEND16_50 / BLEND2x16_50 (callers in this file pass 0xf7de or 0xfbde).
 * Pixels are processed two at a time through 32-bit loads and stores where
 * possible, with single-pixel handling at unaligned row edges.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the loops, several declarations (prev_sw, sw, dw, s, d) and
 * the pointer/width bookkeeping between the visible statements are missing
 * from this listing.
 */
597 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
599 int width = info->dst_w;
600 int height = info->dst_h;
601 Uint16 *srcp = (Uint16 *) info->src;
/* Skips are halved because the pointers step in 16-bit pixels
   (presumably the info fields are byte counts -- confirm). */
602 int srcskip = info->src_skip >> 1;
603 Uint16 *dstp = (Uint16 *) info->dst;
604 int dstskip = info->dst_skip >> 1;
/* Bit 1 of the XOR is set when exactly one of the two pointers sits on a
   4-byte boundary, i.e. they cannot both be 32-bit aligned at once. */
607 if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
609 * Source and destination not aligned, pipeline it.
610 * This is mostly a win for big blits but no loss for
616 /* handle odd destination */
617 if ((uintptr_t) dstp & 2) {
618 Uint16 d = *dstp, s = *srcp;
619 *dstp = BLEND16_50(d, s, mask);
624 srcp++; /* srcp is now 32-bit aligned */
626 /* bootstrap pipeline with first halfword */
/* Reads the aligned 32-bit word just before srcp; one half of it is the
   pixel that pairs with the first half of the next word. */
627 prev_sw = ((Uint32 *) srcp)[-1];
631 sw = *(Uint32 *) srcp;
632 dw = *(Uint32 *) dstp;
/* Build the two source pixels that line up with dw from the tail of the
   previous word and the head of the current one; which halves those are
   depends on byte order. */
633 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
634 s = (prev_sw << 16) + (sw >> 16);
636 s = (prev_sw >> 16) + (sw << 16);
639 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
645 /* final pixel if any */
648 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
649 s = (Uint16) prev_sw;
651 s = (Uint16) (prev_sw >> 16);
653 *dstp = BLEND16_50(d, s, mask);
660 /* source and destination are aligned */
663 /* first odd pixel? */
664 if ((uintptr_t) srcp & 2) {
665 Uint16 d = *dstp, s = *srcp;
666 *dstp = BLEND16_50(d, s, mask);
671 /* srcp and dstp are now 32-bit aligned */
674 Uint32 sw = *(Uint32 *) srcp;
675 Uint32 dw = *(Uint32 *) dstp;
676 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
682 /* last odd pixel? */
684 Uint16 d = *dstp, s = *srcp;
685 *dstp = BLEND16_50(d, s, mask);
697 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * MMX variant.  Four 16-bit pixels are blended per __m64; the scalar
 * statements that appear three times before the MMX section blend one pixel
 * each (presumably leftover pixels of a row -- the surrounding loop macro is
 * not visible).  Every channel is computed as d + (((s - d) * alpha) >> 5)
 * with a 5-bit alpha.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the condition that selects the alpha=128 routine, the loops,
 * the declarations of s and d and several "src2 = src1" / "dst2 = dst1"
 * style assignments are not visible here.
 */
699 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
701 unsigned alpha = info->a;
/* 0xf7de is the 565 mask handed to the 50% blend macros. */
703 Blit16to16SurfaceAlpha128(info, 0xf7de);
705 int width = info->dst_w;
706 int height = info->dst_h;
707 Uint16 *srcp = (Uint16 *) info->src;
708 int srcskip = info->src_skip >> 1;
709 Uint16 *dstp = (Uint16 *) info->dst;
710 int dstskip = info->dst_skip >> 1;
713 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
/* Clearing the low three bits makes the MMX multiplier carry the same
   information as the 5-bit alpha used by the scalar statements. */
715 alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
716 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
717 alpha >>= 3; /* downscale alpha to 5 bits */
719 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
720 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
721 /* position alpha to allow for mullo and mulhi on diff channels
722 to reduce the number of operations */
/* A 64-bit shift is used; assuming alpha fits in 8 bits, shifting by 3
   cannot carry from one 16-bit lane into the next. */
723 mm_alpha = _mm_slli_si64(mm_alpha, 3);
725 /* Setup the 565 color channel masks */
726 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
727 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
736 * shift out the middle component (green) to
737 * the high 16 bits, and process all three RGB
738 * components at the same time.
/* 0x07e0f81f keeps green (0x07e0) in the upper half and red/blue (0xf81f)
   in the lower half, leaving gaps between the channels; the final
   (d | d >> 16) folds green back into the 16-bit pixel. */
740 s = (s | s << 16) & 0x07e0f81f;
741 d = (d | d << 16) & 0x07e0f81f;
742 d += (s - d) * alpha >> 5;
744 *dstp++ = (Uint16)(d | d >> 16);
749 * shift out the middle component (green) to
750 * the high 16 bits, and process all three RGB
751 * components at the same time.
753 s = (s | s << 16) & 0x07e0f81f;
754 d = (d | d << 16) & 0x07e0f81f;
755 d += (s - d) * alpha >> 5;
757 *dstp++ = (Uint16)(d | d >> 16);
761 * shift out the middle component (green) to
762 * the high 16 bits, and process all three RGB
763 * components at the same time.
765 s = (s | s << 16) & 0x07e0f81f;
766 d = (d | d << 16) & 0x07e0f81f;
767 d += (s - d) * alpha >> 5;
769 *dstp++ = (Uint16)(d | d >> 16);
771 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
772 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
/* Red: moved down to the low bits, blended, then moved back up. */
776 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
779 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
782 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
783 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
784 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
785 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
786 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
788 mm_res = dst2; /* RED -> mm_res */
790 /* green -- process the bits in place */
792 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
795 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
798 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
799 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
800 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
801 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
803 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* Blue: already in the low bits, so only masking is needed. */
807 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
810 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
813 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
814 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
815 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
816 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
817 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
819 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
821 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
834 /* fast RGB555->RGB555 blending with surface alpha */
/*
 * MMX variant.  Four 16-bit pixels are blended per __m64; the scalar
 * statements that appear three times before the MMX section blend one pixel
 * each (presumably leftover pixels of a row -- the surrounding loop macro is
 * not visible).  Every channel is computed as d + (((s - d) * alpha) >> 5)
 * with a 5-bit alpha.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the condition that selects the alpha=128 routine, the loops,
 * the declarations of s and d and some "src2 = src1" / "dst2 = dst1"
 * assignments are not visible here.
 */
836 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
838 unsigned alpha = info->a;
/* 0xfbde is the 555 mask handed to the 50% blend macros. */
840 Blit16to16SurfaceAlpha128(info, 0xfbde);
842 int width = info->dst_w;
843 int height = info->dst_h;
844 Uint16 *srcp = (Uint16 *) info->src;
845 int srcskip = info->src_skip >> 1;
846 Uint16 *dstp = (Uint16 *) info->dst;
847 int dstskip = info->dst_skip >> 1;
850 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
/* Clearing the low three bits makes the MMX multiplier carry the same
   information as the 5-bit alpha used by the scalar statements. */
852 alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */
853 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
854 alpha >>= 3; /* downscale alpha to 5 bits */
856 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
857 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
858 /* position alpha to allow for mullo and mulhi on diff channels
859 to reduce the number of operations */
/* A 64-bit shift is used; assuming alpha fits in 8 bits, shifting by 3
   cannot carry from one 16-bit lane into the next. */
860 mm_alpha = _mm_slli_si64(mm_alpha, 3);
862 /* Setup the 555 color channel masks */
863 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
864 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
865 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
874 * shift out the middle component (green) to
875 * the high 16 bits, and process all three RGB
876 * components at the same time.
/* 0x03e07c1f keeps green (0x03e0) in the upper half and red/blue (0x7c1f)
   in the lower half, leaving gaps between the channels; the final
   (d | d >> 16) folds green back into the 16-bit pixel. */
878 s = (s | s << 16) & 0x03e07c1f;
879 d = (d | d << 16) & 0x03e07c1f;
880 d += (s - d) * alpha >> 5;
882 *dstp++ = (Uint16)(d | d >> 16);
887 * shift out the middle component (green) to
888 * the high 16 bits, and process all three RGB
889 * components at the same time.
891 s = (s | s << 16) & 0x03e07c1f;
892 d = (d | d << 16) & 0x03e07c1f;
893 d += (s - d) * alpha >> 5;
895 *dstp++ = (Uint16)(d | d >> 16);
899 * shift out the middle component (green) to
900 * the high 16 bits, and process all three RGB
901 * components at the same time.
903 s = (s | s << 16) & 0x03e07c1f;
904 d = (d | d << 16) & 0x03e07c1f;
905 d += (s - d) * alpha >> 5;
907 *dstp++ = (Uint16)(d | d >> 16);
909 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
910 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
912 /* red -- process the bits in place */
914 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
917 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
920 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
921 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
922 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
923 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
924 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
926 mm_res = dst2; /* RED -> mm_res */
928 /* green -- process the bits in place */
930 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
933 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
936 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
937 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
938 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
939 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
941 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
/* Blue: already in the low bits, so only masking is needed. */
944 src2 = src1; /* src -> src2 */
945 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
947 dst2 = dst1; /* dst -> dst2 */
948 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
951 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
952 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
953 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
954 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
955 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
957 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
959 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
974 /* fast RGB565->RGB565 blending with surface alpha */
/*
 * Plain C variant: blends all three channels of a 565 pixel with a single
 * multiply, d + (((s - d) * alpha) >> 5), using a 5-bit alpha.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the condition that selects the alpha=128 routine, the loops
 * and the declarations of s and d are not visible here.
 */
976 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
978 unsigned alpha = info->a;
/* 0xf7de is the 565 mask handed to the 50% blend macros. */
980 Blit16to16SurfaceAlpha128(info, 0xf7de);
982 int width = info->dst_w;
983 int height = info->dst_h;
984 Uint16 *srcp = (Uint16 *) info->src;
985 int srcskip = info->src_skip >> 1;
986 Uint16 *dstp = (Uint16 *) info->dst;
987 int dstskip = info->dst_skip >> 1;
988 alpha >>= 3; /* downscale alpha to 5 bits */
996 * shift out the middle component (green) to
997 * the high 16 bits, and process all three RGB
998 * components at the same time.
/* 0x07e0f81f keeps green (0x07e0) in the upper half and red/blue (0xf81f)
   in the lower half, leaving gaps between the channels; the final
   (d | d >> 16) folds green back into the 16-bit pixel. */
1000 s = (s | s << 16) & 0x07e0f81f;
1001 d = (d | d << 16) & 0x07e0f81f;
1002 d += (s - d) * alpha >> 5;
1004 *dstp++ = (Uint16)(d | d >> 16);
1013 /* fast RGB555->RGB555 blending with surface alpha */
/*
 * Plain C variant: blends all three channels of a 555 pixel with a single
 * multiply, d + (((s - d) * alpha) >> 5), using a 5-bit alpha.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the condition that selects the alpha=128 routine, the loops
 * and the declarations of s and d are not visible here.
 */
1015 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
1017 unsigned alpha = info->a; /* per-surface alpha; downscaled to 5 bits further below */
/* 0xfbde is the 555 mask handed to the 50% blend macros. */
1019 Blit16to16SurfaceAlpha128(info, 0xfbde);
1021 int width = info->dst_w;
1022 int height = info->dst_h;
1023 Uint16 *srcp = (Uint16 *) info->src;
1024 int srcskip = info->src_skip >> 1;
1025 Uint16 *dstp = (Uint16 *) info->dst;
1026 int dstskip = info->dst_skip >> 1;
1027 alpha >>= 3; /* downscale alpha to 5 bits */
1035 * shift out the middle component (green) to
1036 * the high 16 bits, and process all three RGB
1037 * components at the same time.
/* 0x03e07c1f keeps green (0x03e0) in the upper half and red/blue (0x7c1f)
   in the lower half, leaving gaps between the channels; the final
   (d | d >> 16) folds green back into the 16-bit pixel. */
1039 s = (s | s << 16) & 0x03e07c1f;
1040 d = (d | d << 16) & 0x03e07c1f;
1041 d += (s - d) * alpha >> 5;
1043 *dstp++ = (Uint16)(d | d >> 16);
1052 /* fast ARGB8888->RGB565 blending with pixel alpha */
/*
 * Blends 32-bit source pixels carrying their own alpha into a 16-bit 565
 * destination.  Visible steps: the alpha is reduced to 5 bits; a source pixel
 * whose 5-bit alpha equals SDL_ALPHA_OPAQUE >> 3 is converted to 565 and
 * stored without blending; otherwise both pixels are spread out with the
 * 0x07e0f81f layout and blended as d + (((s - d) * alpha) >> 5).
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the loops, the declarations of s and d and the second half of
 * the source conversion expression are not visible.  The FIXME comment below
 * also has no visible closing delimiter in this listing.
 */
1054 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
1056 int width = info->dst_w;
1057 int height = info->dst_h;
1058 Uint32 *srcp = (Uint32 *) info->src;
/* Source steps in 32-bit pixels (>> 2), destination in 16-bit pixels (>> 1). */
1059 int srcskip = info->src_skip >> 2;
1060 Uint16 *dstp = (Uint16 *) info->dst;
1061 int dstskip = info->dst_skip >> 1;
/* Top five bits of the 32-bit source pixel s. */
1067 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1068 /* FIXME: Here we special-case opaque alpha since the
1069 compositioning used (>>8 instead of /255) doesn't handle
1070 it correctly. Also special-case alpha=0 for speed?
1073 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1074 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
1078 * convert source and destination to G0RAB65565
1079 * and blend all components at the same time
1081 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1083 d = (d | d << 16) & 0x07e0f81f;
1084 d += (s - d) * alpha >> 5;
1086 *dstp = (Uint16)(d | d >> 16);
1098 /* fast ARGB8888->RGB555 blending with pixel alpha */
/*
 * Blends 32-bit source pixels carrying their own alpha into a 16-bit 555
 * destination.  Visible steps: the alpha is reduced to 5 bits; a source pixel
 * whose 5-bit alpha equals SDL_ALPHA_OPAQUE >> 3 is converted to 555 and
 * stored without blending; otherwise both pixels are spread out with the
 * 0x03e07c1f layout and blended as d + (((s - d) * alpha) >> 5).
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the loops, the declarations of alpha, s and d and the second
 * half of the source conversion expression are not visible.  The FIXME
 * comment below also has no visible closing delimiter in this listing, and
 * the "G0RAB65565" wording inside it looks copied from the 565 routine.
 */
1100 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
1102 int width = info->dst_w;
1103 int height = info->dst_h;
1104 Uint32 *srcp = (Uint32 *) info->src;
/* Source steps in 32-bit pixels (>> 2), destination in 16-bit pixels (>> 1). */
1105 int srcskip = info->src_skip >> 2;
1106 Uint16 *dstp = (Uint16 *) info->dst;
1107 int dstskip = info->dst_skip >> 1;
/* Top five bits of the 32-bit source pixel s. */
1114 alpha = s >> 27; /* downscale alpha to 5 bits */
1115 /* FIXME: Here we special-case opaque alpha since the
1116 compositioning used (>>8 instead of /255) doesn't handle
1117 it correctly. Also special-case alpha=0 for speed?
1120 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1121 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
1125 * convert source and destination to G0RAB65565
1126 * and blend all components at the same time
1128 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1130 d = (d | d << 16) & 0x03e07c1f;
1131 d += (s - d) * alpha >> 5;
1133 *dstp = (Uint16)(d | d >> 16);
1145 /* General (slow) N->N blending with per-surface alpha */
/*
 * Format-agnostic path: every pixel is decoded, blended with the single
 * alpha value from info->a and re-encoded through the pixel-format macros.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the declaration of Pixel and the loops are not visible; the
 * DISEMBLE_, ALPHA_BLEND_ and ASSEMBLE_ macros are defined elsewhere.
 */
1147 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
1149 int width = info->dst_w;
1150 int height = info->dst_h;
1151 Uint8 *src = info->src;
1152 int srcskip = info->src_skip;
1153 Uint8 *dst = info->dst;
1154 int dstskip = info->dst_skip;
1155 SDL_PixelFormat *srcfmt = info->src_fmt;
1156 SDL_PixelFormat *dstfmt = info->dst_fmt;
/* Bytes per pixel are read from each format independently. */
1157 int srcbpp = srcfmt->BytesPerPixel;
1158 int dstbpp = dstfmt->BytesPerPixel;
1160 unsigned sR, sG, sB;
1161 unsigned dR, dG, dB, dA;
/* The source alpha is the same for every pixel of the blit. */
1162 const unsigned sA = info->a;
/* Per pixel: decode source colour, decode destination colour and alpha,
   blend, then write the result back in the destination format. */
1169 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1170 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1171 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1172 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1184 /* General (slow) colorkeyed N->N blending with per-surface alpha */
/*
 * Format-agnostic path with a colour key: a pixel is blended only when the
 * surface alpha is non-zero and the raw source pixel differs from
 * info->colorkey.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the declaration of Pixel and the loops are not visible; the
 * pixel macros used here are defined elsewhere.
 */
1186 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
1188 int width = info->dst_w;
1189 int height = info->dst_h;
1190 Uint8 *src = info->src;
1191 int srcskip = info->src_skip;
1192 Uint8 *dst = info->dst;
1193 int dstskip = info->dst_skip;
1194 SDL_PixelFormat *srcfmt = info->src_fmt;
1195 SDL_PixelFormat *dstfmt = info->dst_fmt;
/* Raw source pixel value that marks a transparent pixel. */
1196 Uint32 ckey = info->colorkey;
1197 int srcbpp = srcfmt->BytesPerPixel;
1198 int dstbpp = dstfmt->BytesPerPixel;
1200 unsigned sR, sG, sB;
1201 unsigned dR, dG, dB, dA;
1202 const unsigned sA = info->a;
/* The raw pixel is fetched first so it can be compared with the key before
   any channel decoding is done. */
1208 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
/* Skip the pixel when alpha is zero or the pixel equals the colour key. */
1209 if(sA && Pixel != ckey) {
1210 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1211 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1212 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1213 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1225 /* General (slow) N->N blending with pixel alpha */
/*
 * Format-agnostic path in which the alpha (sA) is decoded from every source
 * pixel together with its colour.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the declarations of srcbpp/dstbpp/Pixel, the loops and
 * whatever sits between the two decode steps (original line 1252) are not
 * visible; the pixel macros used here are defined elsewhere.
 */
1227 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
1229 int width = info->dst_w;
1230 int height = info->dst_h;
1231 Uint8 *src = info->src;
1232 int srcskip = info->src_skip;
1233 Uint8 *dst = info->dst;
1234 int dstskip = info->dst_skip;
1235 SDL_PixelFormat *srcfmt = info->src_fmt;
1236 SDL_PixelFormat *dstfmt = info->dst_fmt;
1240 unsigned sR, sG, sB, sA;
1241 unsigned dR, dG, dB, dA;
1243 /* Set up some basic variables */
1244 srcbpp = srcfmt->BytesPerPixel;
1245 dstbpp = dstfmt->BytesPerPixel;
/* Per pixel: decode source colour and alpha, decode destination colour and
   alpha, blend, then write the result back in the destination format. */
1251 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1253 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1254 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1255 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
/*
 * Chooses the alpha-blit routine for a surface.  The decision is based on the
 * surface's copy flags (with the RLE bits masked off) and on the source and
 * destination pixel formats; the chosen function is returned to the caller.
 * NOTE(review): the embedded original line numbers are not contiguous, so the
 * return type, the "case" labels of the inner BytesPerPixel switches, several
 * preprocessor guards and the final fallback return are not visible here.
 */
1269 SDL_CalculateBlitA(SDL_Surface * surface)
/* sf = format of the surface itself, df = format of its mapped destination. */
1271 SDL_PixelFormat *sf = surface->format;
1272 SDL_PixelFormat *df = surface->map->dst->format;
1274 switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1275 case SDL_COPY_BLEND:
1276 /* Per-pixel alpha blits */
1277 switch (df->BytesPerPixel) {
1279 return BlitNto1PixelAlpha;
/* 16-bit destinations: the specialised routines need a 32-bit source with
   alpha in the top byte, green in bits 8-15, and the low 8-bit source
   channel (R or B) matching the low 5-bit destination channel. */
1282 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1283 && sf->Gmask == 0xff00
1284 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1285 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
/* The destination green mask tells 565 (0x7e0) from 555 (0x3e0). */
1286 if (df->Gmask == 0x7e0)
1287 return BlitARGBto565PixelAlpha;
1288 else if (df->Gmask == 0x3e0)
1289 return BlitARGBto555PixelAlpha;
1291 return BlitNtoNPixelAlpha;
/* Same RGB layout on both sides and a 32-bit source: eligible for the
   specialised 32-bit routines. */
1294 if (sf->Rmask == df->Rmask
1295 && sf->Gmask == df->Gmask
1296 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1297 #if defined(__MMX__) || defined(__3dNOW__)
/* The SIMD routines require byte-aligned channels and a full 8-bit alpha
   channel (Aloss == 0). */
1298 if (sf->Rshift % 8 == 0
1299 && sf->Gshift % 8 == 0
1300 && sf->Bshift % 8 == 0
1301 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1304 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1308 return BlitRGBtoRGBPixelAlphaMMX;
1311 #endif /* __MMX__ || __3dNOW__ */
/* The plain C routine reads alpha from the top byte only. */
1312 if (sf->Amask == 0xff000000) {
1313 return BlitRGBtoRGBPixelAlpha;
1316 return BlitNtoNPixelAlpha;
1320 return BlitNtoNPixelAlpha;
1324 case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
/* Only sources without an alpha channel are handled in this case. */
1325 if (sf->Amask == 0) {
1326 /* Per-surface alpha blits */
1327 switch (df->BytesPerPixel) {
1329 return BlitNto1SurfaceAlpha;
/* 16-bit destinations: the fast routines are used only when the blit map is
   marked as an identity mapping. */
1332 if (surface->map->identity) {
1333 if (df->Gmask == 0x7e0) {
1336 return Blit565to565SurfaceAlphaMMX;
1339 return Blit565to565SurfaceAlpha;
1340 } else if (df->Gmask == 0x3e0) {
1343 return Blit555to555SurfaceAlphaMMX;
1346 return Blit555to555SurfaceAlpha;
1349 return BlitNtoNSurfaceAlpha;
1352 if (sf->Rmask == df->Rmask
1353 && sf->Gmask == df->Gmask
1354 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
/* The MMX routine additionally needs byte-aligned channels and a CPU that
   reports MMX support at run time. */
1356 if (sf->Rshift % 8 == 0
1357 && sf->Gshift % 8 == 0
1358 && sf->Bshift % 8 == 0 && SDL_HasMMX())
1359 return BlitRGBtoRGBSurfaceAlphaMMX;
/* The plain C routine requires R, G and B to occupy the low 24 bits. */
1361 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1362 return BlitRGBtoRGBSurfaceAlpha;
1365 return BlitNtoNSurfaceAlpha;
1369 return BlitNtoNSurfaceAlpha;
1374 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
/* Colour-keyed per-surface alpha: only two generic routines exist, picked
   by whether the destination uses one byte per pixel. */
1375 if (sf->Amask == 0) {
1376 if (df->BytesPerPixel == 1) {
1377 return BlitNto1SurfaceAlphaKey;
1379 return BlitNtoNSurfaceAlphaKey;
1388 /* vi: set ts=4 sw=4 expandtab: */