change SDL 1.2 to SDL 2.0
[platform/upstream/SDL.git] / src / video / SDL_blit_A.c
1 /*
2   Simple DirectMedia Layer
3   Copyright (C) 1997-2016 Sam Lantinga <slouken@libsdl.org>
4
5   This software is provided 'as-is', without any express or implied
6   warranty.  In no event will the authors be held liable for any damages
7   arising from the use of this software.
8
9   Permission is granted to anyone to use this software for any purpose,
10   including commercial applications, and to alter it and redistribute it
11   freely, subject to the following restrictions:
12
13   1. The origin of this software must not be misrepresented; you must not
14      claim that you wrote the original software. If you use this software
15      in a product, an acknowledgment in the product documentation would be
16      appreciated but is not required.
17   2. Altered source versions must be plainly marked as such, and must not be
18      misrepresented as being the original software.
19   3. This notice may not be removed or altered from any source distribution.
20 */
21 #include "../SDL_internal.h"
22
23 #include "SDL_video.h"
24 #include "SDL_blit.h"
25
26 /* Functions to perform alpha blended blitting */
27
28 /* N->1 blending with per-surface alpha */
29 static void
30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
31 {
32     int width = info->dst_w;
33     int height = info->dst_h;
34     Uint8 *src = info->src;
35     int srcskip = info->src_skip;
36     Uint8 *dst = info->dst;
37     int dstskip = info->dst_skip;
38     Uint8 *palmap = info->table;
39     SDL_PixelFormat *srcfmt = info->src_fmt;
40     SDL_PixelFormat *dstfmt = info->dst_fmt;
41     int srcbpp = srcfmt->BytesPerPixel;
42     Uint32 Pixel;
43     unsigned sR, sG, sB;
44     unsigned dR, dG, dB;
45     const unsigned A = info->a;
46
47     while (height--) {
48             /* *INDENT-OFF* */
49             DUFFS_LOOP4(
50             {
51                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
52                 dR = dstfmt->palette->colors[*dst].r;
53                 dG = dstfmt->palette->colors[*dst].g;
54                 dB = dstfmt->palette->colors[*dst].b;
55                 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
56                 dR &= 0xff;
57                 dG &= 0xff;
58                 dB &= 0xff;
59                 /* Pack RGB into 8bit pixel */
60                 if ( palmap == NULL ) {
61                     *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
62                 } else {
63                     *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
64                 }
65                 dst++;
66                 src += srcbpp;
67             },
68             width);
69             /* *INDENT-ON* */
70         src += srcskip;
71         dst += dstskip;
72     }
73 }
74
75 /* N->1 blending with pixel alpha */
76 static void
77 BlitNto1PixelAlpha(SDL_BlitInfo * info)
78 {
79     int width = info->dst_w;
80     int height = info->dst_h;
81     Uint8 *src = info->src;
82     int srcskip = info->src_skip;
83     Uint8 *dst = info->dst;
84     int dstskip = info->dst_skip;
85     Uint8 *palmap = info->table;
86     SDL_PixelFormat *srcfmt = info->src_fmt;
87     SDL_PixelFormat *dstfmt = info->dst_fmt;
88     int srcbpp = srcfmt->BytesPerPixel;
89     Uint32 Pixel;
90     unsigned sR, sG, sB, sA;
91     unsigned dR, dG, dB;
92
93     while (height--) {
94             /* *INDENT-OFF* */
95             DUFFS_LOOP4(
96             {
97                 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
98                 dR = dstfmt->palette->colors[*dst].r;
99                 dG = dstfmt->palette->colors[*dst].g;
100                 dB = dstfmt->palette->colors[*dst].b;
101                 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
102                 dR &= 0xff;
103                 dG &= 0xff;
104                 dB &= 0xff;
105                 /* Pack RGB into 8bit pixel */
106                 if ( palmap == NULL ) {
107                     *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
108                 } else {
109                     *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
110                 }
111                 dst++;
112                 src += srcbpp;
113             },
114             width);
115             /* *INDENT-ON* */
116         src += srcskip;
117         dst += dstskip;
118     }
119 }
120
121 /* colorkeyed N->1 blending with per-surface alpha */
122 static void
123 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
124 {
125     int width = info->dst_w;
126     int height = info->dst_h;
127     Uint8 *src = info->src;
128     int srcskip = info->src_skip;
129     Uint8 *dst = info->dst;
130     int dstskip = info->dst_skip;
131     Uint8 *palmap = info->table;
132     SDL_PixelFormat *srcfmt = info->src_fmt;
133     SDL_PixelFormat *dstfmt = info->dst_fmt;
134     int srcbpp = srcfmt->BytesPerPixel;
135     Uint32 ckey = info->colorkey;
136     Uint32 Pixel;
137     unsigned sR, sG, sB;
138     unsigned dR, dG, dB;
139     const unsigned A = info->a;
140
141     while (height--) {
142             /* *INDENT-OFF* */
143             DUFFS_LOOP(
144             {
145                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
146                 if ( Pixel != ckey ) {
147                     dR = dstfmt->palette->colors[*dst].r;
148                     dG = dstfmt->palette->colors[*dst].g;
149                     dB = dstfmt->palette->colors[*dst].b;
150                     ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
151                     dR &= 0xff;
152                     dG &= 0xff;
153                     dB &= 0xff;
154                     /* Pack RGB into 8bit pixel */
155                     if ( palmap == NULL ) {
156                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
157                     } else {
158                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
159                     }
160                 }
161                 dst++;
162                 src += srcbpp;
163             },
164             width);
165             /* *INDENT-ON* */
166         src += srcskip;
167         dst += dstskip;
168     }
169 }
170
171 #ifdef __MMX__
172
173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
174 static void
175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
176 {
177     int width = info->dst_w;
178     int height = info->dst_h;
179     Uint32 *srcp = (Uint32 *) info->src;
180     int srcskip = info->src_skip >> 2;
181     Uint32 *dstp = (Uint32 *) info->dst;
182     int dstskip = info->dst_skip >> 2;
183     Uint32 dalpha = info->dst_fmt->Amask;
184
185     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
186
187     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
188     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
189     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
190
191     while (height--) {
192         int n = width;
193         if (n & 1) {
194             Uint32 s = *srcp++;
195             Uint32 d = *dstp;
196             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
197                        + (s & d & 0x00010101)) | dalpha;
198             n--;
199         }
200
201         for (n >>= 1; n > 0; --n) {
202             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
203             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
204
205             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
206             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
207
208             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
209             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
210             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
211             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
212
213             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
214             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
215             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
216             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
217
218             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
219             dstp += 2;
220             srcp += 2;
221         }
222
223         srcp += srcskip;
224         dstp += dstskip;
225     }
226     _mm_empty();
227 }
228
229 /* fast RGB888->(A)RGB888 blending with surface alpha */
230 static void
231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
232 {
233     SDL_PixelFormat *df = info->dst_fmt;
234     Uint32 chanmask;
235     unsigned alpha = info->a;
236
237     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
238         /* only call a128 version when R,G,B occupy lower bits */
239         BlitRGBtoRGBSurfaceAlpha128MMX(info);
240     } else {
241         int width = info->dst_w;
242         int height = info->dst_h;
243         Uint32 *srcp = (Uint32 *) info->src;
244         int srcskip = info->src_skip >> 2;
245         Uint32 *dstp = (Uint32 *) info->dst;
246         int dstskip = info->dst_skip >> 2;
247         Uint32 dalpha = df->Amask;
248         Uint32 amult;
249
250         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
251
252         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
253         /* form the alpha mult */
254         amult = alpha | (alpha << 8);
255         amult = amult | (amult << 16);
256         chanmask =
257             (0xff << df->Rshift) | (0xff << df->
258                                     Gshift) | (0xff << df->Bshift);
259         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
260         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
261         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
262         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
263
264         while (height--) {
265             int n = width;
266             if (n & 1) {
267                 /* One Pixel Blend */
268                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
269                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
270
271                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
272                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
273
274                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
275                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
276                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
277                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
278
279                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
280                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
281                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
282
283                 ++srcp;
284                 ++dstp;
285
286                 n--;
287             }
288
289             for (n >>= 1; n > 0; --n) {
290                 /* Two Pixels Blend */
291                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
292                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
293                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
294                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
295
296                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
297                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
298                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
299                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
300
301                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
302                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
303                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
304                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
305
306                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
307                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
308                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
309                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
310
311                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
312                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
313
314                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
315
316                 srcp += 2;
317                 dstp += 2;
318             }
319             srcp += srcskip;
320             dstp += dstskip;
321         }
322         _mm_empty();
323     }
324 }
325
326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
327 static void
328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
329 {
330     int width = info->dst_w;
331     int height = info->dst_h;
332     Uint32 *srcp = (Uint32 *) info->src;
333     int srcskip = info->src_skip >> 2;
334     Uint32 *dstp = (Uint32 *) info->dst;
335     int dstskip = info->dst_skip >> 2;
336     SDL_PixelFormat *sf = info->src_fmt;
337     Uint32 amask = sf->Amask;
338     Uint32 ashift = sf->Ashift;
339     Uint64 multmask, multmask2;
340
341     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
342
343     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
344     multmask = 0x00FF;
345         multmask <<= (ashift * 2);
346         multmask2 = 0x00FF00FF00FF00FFULL;
347
348     while (height--) {
349                 /* *INDENT-OFF* */
350                 DUFFS_LOOP4({
351                 Uint32 alpha = *srcp & amask;
352                 if (alpha == 0) {
353                         /* do nothing */
354                 } else if (alpha == amask) {
355                         *dstp = *srcp;
356                 } else {
357                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
358                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
359
360                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
361                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
362
363                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
364                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
365                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
366                         mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
367                         mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);       /* 0F0A0A0A -> mm_alpha */
368                         mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
369
370                         /* blend */                 
371                         src1 = _mm_mullo_pi16(src1, mm_alpha);
372                         src1 = _mm_srli_pi16(src1, 8);
373                         dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
374                         dst1 = _mm_srli_pi16(dst1, 8);
375                         dst1 = _mm_add_pi16(src1, dst1);
376                         dst1 = _mm_packs_pu16(dst1, mm_zero);
377                         
378                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
379                 }
380                 ++srcp;
381                 ++dstp;
382             }, width);
383                 /* *INDENT-ON* */
384         srcp += srcskip;
385         dstp += dstskip;
386     }
387     _mm_empty();
388 }
389
390 #endif /* __MMX__ */
391
392 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
393 static void
394 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
395 {
396     int width = info->dst_w;
397     int height = info->dst_h;
398     Uint32 *srcp = (Uint32 *) info->src;
399     int srcskip = info->src_skip >> 2;
400     Uint32 *dstp = (Uint32 *) info->dst;
401     int dstskip = info->dst_skip >> 2;
402
403     while (height--) {
404             /* *INDENT-OFF* */
405             DUFFS_LOOP4({
406                     Uint32 s = *srcp++;
407                     Uint32 d = *dstp;
408                     *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
409                                + (s & d & 0x00010101)) | 0xff000000;
410             }, width);
411             /* *INDENT-ON* */
412         srcp += srcskip;
413         dstp += dstskip;
414     }
415 }
416
417 /* fast RGB888->(A)RGB888 blending with surface alpha */
418 static void
419 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
420 {
421     unsigned alpha = info->a;
422     if (alpha == 128) {
423         BlitRGBtoRGBSurfaceAlpha128(info);
424     } else {
425         int width = info->dst_w;
426         int height = info->dst_h;
427         Uint32 *srcp = (Uint32 *) info->src;
428         int srcskip = info->src_skip >> 2;
429         Uint32 *dstp = (Uint32 *) info->dst;
430         int dstskip = info->dst_skip >> 2;
431         Uint32 s;
432         Uint32 d;
433         Uint32 s1;
434         Uint32 d1;
435
436         while (height--) {
437                         /* *INDENT-OFF* */
438                         DUFFS_LOOP4({
439                                 s = *srcp;
440                                 d = *dstp;
441                                 s1 = s & 0xff00ff;
442                                 d1 = d & 0xff00ff;
443                                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
444                                      & 0xff00ff;
445                                 s &= 0xff00;
446                                 d &= 0xff00;
447                                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
448                                 *dstp = d1 | d | 0xff000000;
449                                 ++srcp;
450                                 ++dstp;
451                         }, width);
452                         /* *INDENT-ON* */
453             srcp += srcskip;
454             dstp += dstskip;
455         }
456     }
457 }
458
459 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
460 static void
461 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
462 {
463     int width = info->dst_w;
464     int height = info->dst_h;
465     Uint32 *srcp = (Uint32 *) info->src;
466     int srcskip = info->src_skip >> 2;
467     Uint32 *dstp = (Uint32 *) info->dst;
468     int dstskip = info->dst_skip >> 2;
469
470     while (height--) {
471             /* *INDENT-OFF* */
472             DUFFS_LOOP4({
473                 Uint32 dalpha;
474                 Uint32 d;
475                 Uint32 s1;
476                 Uint32 d1;
477                 Uint32 s = *srcp;
478                 Uint32 alpha = s >> 24;
479                 /* FIXME: Here we special-case opaque alpha since the
480                    compositioning used (>>8 instead of /255) doesn't handle
481                    it correctly. Also special-case alpha=0 for speed?
482                    Benchmark this! */
483                 if (alpha) {
484                   if (alpha == SDL_ALPHA_OPAQUE) {
485                           *dstp = *srcp;
486                   } else {
487                     /*
488                      * take out the middle component (green), and process
489                      * the other two in parallel. One multiply less.
490                      */
491                     d = *dstp;
492                         dalpha = d >> 24;
493                     s1 = s & 0xff00ff;
494                     d1 = d & 0xff00ff;
495                     d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
496                     s &= 0xff00;
497                     d &= 0xff00;
498                     d = (d + ((s - d) * alpha >> 8)) & 0xff00;
499                         dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
500                     *dstp = d1 | d | (dalpha << 24);
501                   }
502                 }
503                 ++srcp;
504                 ++dstp;
505             }, width);
506             /* *INDENT-ON* */
507         srcp += srcskip;
508         dstp += dstskip;
509     }
510 }
511
512 #ifdef __3dNOW__
513 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
514 static void
515 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
516 {
517     int width = info->dst_w;
518     int height = info->dst_h;
519     Uint32 *srcp = (Uint32 *) info->src;
520     int srcskip = info->src_skip >> 2;
521     Uint32 *dstp = (Uint32 *) info->dst;
522     int dstskip = info->dst_skip >> 2;
523     SDL_PixelFormat *sf = info->src_fmt;
524     Uint32 amask = sf->Amask;
525     Uint32 ashift = sf->Ashift;
526     Uint64 multmask, multmask2;
527
528     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
529
530     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
531     multmask = 0x00FF;
532     multmask <<= (ashift * 2);
533     multmask2 = 0x00FF00FF00FF00FFULL;
534
535     while (height--) {
536             /* *INDENT-OFF* */
537             DUFFS_LOOP4({
538                 Uint32 alpha;
539
540                 _m_prefetch(srcp + 16);
541                 _m_prefetch(dstp + 16);
542
543                 alpha = *srcp & amask;
544                 if (alpha == 0) {
545                         /* do nothing */
546                 } else if (alpha == amask) {
547                         *dstp = *srcp;
548                 } else {
549                         src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
550                         src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
551
552                         dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
553                         dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
554
555                         mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
556                         mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
557                         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
558                         mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
559                         mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);       /* 0F0A0A0A -> mm_alpha */
560                         mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
561
562
563                         /* blend */                 
564                         src1 = _mm_mullo_pi16(src1, mm_alpha);
565                         src1 = _mm_srli_pi16(src1, 8);
566                         dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
567                         dst1 = _mm_srli_pi16(dst1, 8);
568                         dst1 = _mm_add_pi16(src1, dst1);
569                         dst1 = _mm_packs_pu16(dst1, mm_zero);
570                         
571                         *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
572                 }
573                 ++srcp;
574                 ++dstp;
575             }, width);
576             /* *INDENT-ON* */
577         srcp += srcskip;
578         dstp += dstskip;
579     }
580     _mm_empty();
581 }
582
583 #endif /* __3dNOW__ */
584
585 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
586
/*
 * blend a single 16 bit pixel at 50%
 *
 * d, s: destination and source pixel; mask: the pixel format's channel bits
 * with the lowest bit of every channel cleared (e.g. 0xf7de for RGB565).
 * The masked halves are averaged and a low bit set in both inputs is added
 * back.  Arguments are evaluated more than once.
 */
#define BLEND16_50(d, s, mask)                                               \
        (((((s) & (mask)) + ((d) & (mask))) >> 1) +                          \
         ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * d, s: two packed destination / source pixels in one 32-bit word; mask is
 * the same 16-bit mask as for BLEND16_50.  The mask is widened to Uint32
 * before being replicated into the upper half: shifting it as a promoted
 * signed int would overflow for masks with bit 15 set.  Arguments are
 * evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)                                             \
        ((((s) & ((Uint32) (mask) | (Uint32) (mask) << 16)) >> 1) +          \
         (((d) & ((Uint32) (mask) | (Uint32) (mask) << 16)) >> 1) +          \
         ((s) & (d) & ~((Uint32) (mask) | (Uint32) (mask) << 16)))
595
596 static void
597 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
598 {
599     int width = info->dst_w;
600     int height = info->dst_h;
601     Uint16 *srcp = (Uint16 *) info->src;
602     int srcskip = info->src_skip >> 1;
603     Uint16 *dstp = (Uint16 *) info->dst;
604     int dstskip = info->dst_skip >> 1;
605
606     while (height--) {
607         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
608             /*
609              * Source and destination not aligned, pipeline it.
610              * This is mostly a win for big blits but no loss for
611              * small ones
612              */
613             Uint32 prev_sw;
614             int w = width;
615
616             /* handle odd destination */
617             if ((uintptr_t) dstp & 2) {
618                 Uint16 d = *dstp, s = *srcp;
619                 *dstp = BLEND16_50(d, s, mask);
620                 dstp++;
621                 srcp++;
622                 w--;
623             }
624             srcp++;             /* srcp is now 32-bit aligned */
625
626             /* bootstrap pipeline with first halfword */
627             prev_sw = ((Uint32 *) srcp)[-1];
628
629             while (w > 1) {
630                 Uint32 sw, dw, s;
631                 sw = *(Uint32 *) srcp;
632                 dw = *(Uint32 *) dstp;
633 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
634                 s = (prev_sw << 16) + (sw >> 16);
635 #else
636                 s = (prev_sw >> 16) + (sw << 16);
637 #endif
638                 prev_sw = sw;
639                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
640                 dstp += 2;
641                 srcp += 2;
642                 w -= 2;
643             }
644
645             /* final pixel if any */
646             if (w) {
647                 Uint16 d = *dstp, s;
648 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
649                 s = (Uint16) prev_sw;
650 #else
651                 s = (Uint16) (prev_sw >> 16);
652 #endif
653                 *dstp = BLEND16_50(d, s, mask);
654                 srcp++;
655                 dstp++;
656             }
657             srcp += srcskip - 1;
658             dstp += dstskip;
659         } else {
660             /* source and destination are aligned */
661             int w = width;
662
663             /* first odd pixel? */
664             if ((uintptr_t) srcp & 2) {
665                 Uint16 d = *dstp, s = *srcp;
666                 *dstp = BLEND16_50(d, s, mask);
667                 srcp++;
668                 dstp++;
669                 w--;
670             }
671             /* srcp and dstp are now 32-bit aligned */
672
673             while (w > 1) {
674                 Uint32 sw = *(Uint32 *) srcp;
675                 Uint32 dw = *(Uint32 *) dstp;
676                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
677                 srcp += 2;
678                 dstp += 2;
679                 w -= 2;
680             }
681
682             /* last odd pixel? */
683             if (w) {
684                 Uint16 d = *dstp, s = *srcp;
685                 *dstp = BLEND16_50(d, s, mask);
686                 srcp++;
687                 dstp++;
688             }
689             srcp += srcskip;
690             dstp += dstskip;
691         }
692     }
693 }
694
695 #ifdef __MMX__
696
697 /* fast RGB565->RGB565 blending with surface alpha */
698 static void
699 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
700 {
701     unsigned alpha = info->a;
702     if (alpha == 128) {
703         Blit16to16SurfaceAlpha128(info, 0xf7de);
704     } else {
705         int width = info->dst_w;
706         int height = info->dst_h;
707         Uint16 *srcp = (Uint16 *) info->src;
708         int srcskip = info->src_skip >> 1;
709         Uint16 *dstp = (Uint16 *) info->dst;
710         int dstskip = info->dst_skip >> 1;
711         Uint32 s, d;
712
713         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
714
715         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
716         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
717         alpha >>= 3;            /* downscale alpha to 5 bits */
718
719         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
720         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
721         /* position alpha to allow for mullo and mulhi on diff channels
722            to reduce the number of operations */
723         mm_alpha = _mm_slli_si64(mm_alpha, 3);
724
725         /* Setup the 565 color channel masks */
726         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
727         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
728
729         while (height--) {
730                         /* *INDENT-OFF* */
731                         DUFFS_LOOP_124(
732                         {
733                                 s = *srcp++;
734                                 d = *dstp;
735                                 /*
736                                  * shift out the middle component (green) to
737                                  * the high 16 bits, and process all three RGB
738                                  * components at the same time.
739                                  */
740                                 s = (s | s << 16) & 0x07e0f81f;
741                                 d = (d | d << 16) & 0x07e0f81f;
742                                 d += (s - d) * alpha >> 5;
743                                 d &= 0x07e0f81f;
744                                 *dstp++ = (Uint16)(d | d >> 16);
745                         },{
746                                 s = *srcp++;
747                                 d = *dstp;
748                                 /*
749                                  * shift out the middle component (green) to
750                                  * the high 16 bits, and process all three RGB
751                                  * components at the same time.
752                                  */
753                                 s = (s | s << 16) & 0x07e0f81f;
754                                 d = (d | d << 16) & 0x07e0f81f;
755                                 d += (s - d) * alpha >> 5;
756                                 d &= 0x07e0f81f;
757                                 *dstp++ = (Uint16)(d | d >> 16);
758                                 s = *srcp++;
759                                 d = *dstp;
760                                 /*
761                                  * shift out the middle component (green) to
762                                  * the high 16 bits, and process all three RGB
763                                  * components at the same time.
764                                  */
765                                 s = (s | s << 16) & 0x07e0f81f;
766                                 d = (d | d << 16) & 0x07e0f81f;
767                                 d += (s - d) * alpha >> 5;
768                                 d &= 0x07e0f81f;
769                                 *dstp++ = (Uint16)(d | d >> 16);
770                         },{
771                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
772                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
773
774                                 /* red */
775                                 src2 = src1;
776                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
777
778                                 dst2 = dst1;
779                                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
780
781                                 /* blend */
782                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
783                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
784                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
785                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
786                                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
787
788                                 mm_res = dst2; /* RED -> mm_res */
789
790                                 /* green -- process the bits in place */
791                                 src2 = src1;
792                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
793
794                                 dst2 = dst1;
795                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
796
797                                 /* blend */
798                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
799                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
800                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
801                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
802
803                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
804
805                                 /* blue */
806                                 src2 = src1;
807                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
808
809                                 dst2 = dst1;
810                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
811
812                                 /* blend */
813                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
814                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
815                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
816                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
817                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
818
819                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
820
821                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
822
823                                 srcp += 4;
824                                 dstp += 4;
825                         }, width);
826                         /* *INDENT-ON* */
827             srcp += srcskip;
828             dstp += dstskip;
829         }
830         _mm_empty();
831     }
832 }
833
834 /* fast RGB555->RGB555 blending with surface alpha */
835 static void
836 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
837 {
838     unsigned alpha = info->a;
839     if (alpha == 128) {
840         Blit16to16SurfaceAlpha128(info, 0xfbde);
841     } else {
842         int width = info->dst_w;
843         int height = info->dst_h;
844         Uint16 *srcp = (Uint16 *) info->src;
845         int srcskip = info->src_skip >> 1;
846         Uint16 *dstp = (Uint16 *) info->dst;
847         int dstskip = info->dst_skip >> 1;
848         Uint32 s, d;
849
850         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
851
852         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
853         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
854         alpha >>= 3;            /* downscale alpha to 5 bits */
855
856         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
857         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
858         /* position alpha to allow for mullo and mulhi on diff channels
859            to reduce the number of operations */
860         mm_alpha = _mm_slli_si64(mm_alpha, 3);
861
862         /* Setup the 555 color channel masks */
863         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
864         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
865         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
866
867         while (height--) {
868                         /* *INDENT-OFF* */
869                         DUFFS_LOOP_124(
870                         {
871                                 s = *srcp++;
872                                 d = *dstp;
873                                 /*
874                                  * shift out the middle component (green) to
875                                  * the high 16 bits, and process all three RGB
876                                  * components at the same time.
877                                  */
878                                 s = (s | s << 16) & 0x03e07c1f;
879                                 d = (d | d << 16) & 0x03e07c1f;
880                                 d += (s - d) * alpha >> 5;
881                                 d &= 0x03e07c1f;
882                                 *dstp++ = (Uint16)(d | d >> 16);
883                         },{
884                                 s = *srcp++;
885                                 d = *dstp;
886                                 /*
887                                  * shift out the middle component (green) to
888                                  * the high 16 bits, and process all three RGB
889                                  * components at the same time.
890                                  */
891                                 s = (s | s << 16) & 0x03e07c1f;
892                                 d = (d | d << 16) & 0x03e07c1f;
893                                 d += (s - d) * alpha >> 5;
894                                 d &= 0x03e07c1f;
895                                 *dstp++ = (Uint16)(d | d >> 16);
896                                 s = *srcp++;
897                                 d = *dstp;
898                                 /*
899                                  * shift out the middle component (green) to
900                                  * the high 16 bits, and process all three RGB
901                                  * components at the same time.
902                                  */
903                                 s = (s | s << 16) & 0x03e07c1f;
904                                 d = (d | d << 16) & 0x03e07c1f;
905                                 d += (s - d) * alpha >> 5;
906                                 d &= 0x03e07c1f;
907                                 *dstp++ = (Uint16)(d | d >> 16);
908                         },{
909                                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
910                                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
911
912                                 /* red -- process the bits in place */
913                                 src2 = src1;
914                                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
915
916                                 dst2 = dst1;
917                                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
918
919                                 /* blend */
920                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
921                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
922                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
923                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
924                                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
925
926                                 mm_res = dst2; /* RED -> mm_res */
927                                 
928                                 /* green -- process the bits in place */
929                                 src2 = src1;
930                                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
931
932                                 dst2 = dst1;
933                                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
934
935                                 /* blend */
936                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
937                                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
938                                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
939                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
940
941                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
942
943                                 /* blue */
944                                 src2 = src1; /* src -> src2 */
945                                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
946
947                                 dst2 = dst1; /* dst -> dst2 */
948                                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
949
950                                 /* blend */
951                                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
952                                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
953                                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
954                                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
955                                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
956
957                                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
958
959                                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
960
961                                 srcp += 4;
962                                 dstp += 4;
963                         }, width);
964                         /* *INDENT-ON* */
965             srcp += srcskip;
966             dstp += dstskip;
967         }
968         _mm_empty();
969     }
970 }
971
972 #endif /* __MMX__ */
973
974 /* fast RGB565->RGB565 blending with surface alpha */
975 static void
976 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
977 {
978     unsigned alpha = info->a;
979     if (alpha == 128) {
980         Blit16to16SurfaceAlpha128(info, 0xf7de);
981     } else {
982         int width = info->dst_w;
983         int height = info->dst_h;
984         Uint16 *srcp = (Uint16 *) info->src;
985         int srcskip = info->src_skip >> 1;
986         Uint16 *dstp = (Uint16 *) info->dst;
987         int dstskip = info->dst_skip >> 1;
988         alpha >>= 3;            /* downscale alpha to 5 bits */
989
990         while (height--) {
991                         /* *INDENT-OFF* */
992                         DUFFS_LOOP4({
993                                 Uint32 s = *srcp++;
994                                 Uint32 d = *dstp;
995                                 /*
996                                  * shift out the middle component (green) to
997                                  * the high 16 bits, and process all three RGB
998                                  * components at the same time.
999                                  */
1000                                 s = (s | s << 16) & 0x07e0f81f;
1001                                 d = (d | d << 16) & 0x07e0f81f;
1002                                 d += (s - d) * alpha >> 5;
1003                                 d &= 0x07e0f81f;
1004                                 *dstp++ = (Uint16)(d | d >> 16);
1005                         }, width);
1006                         /* *INDENT-ON* */
1007             srcp += srcskip;
1008             dstp += dstskip;
1009         }
1010     }
1011 }
1012
1013 /* fast RGB555->RGB555 blending with surface alpha */
1014 static void
1015 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
1016 {
1017     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
1018     if (alpha == 128) {
1019         Blit16to16SurfaceAlpha128(info, 0xfbde);
1020     } else {
1021         int width = info->dst_w;
1022         int height = info->dst_h;
1023         Uint16 *srcp = (Uint16 *) info->src;
1024         int srcskip = info->src_skip >> 1;
1025         Uint16 *dstp = (Uint16 *) info->dst;
1026         int dstskip = info->dst_skip >> 1;
1027         alpha >>= 3;            /* downscale alpha to 5 bits */
1028
1029         while (height--) {
1030                         /* *INDENT-OFF* */
1031                         DUFFS_LOOP4({
1032                                 Uint32 s = *srcp++;
1033                                 Uint32 d = *dstp;
1034                                 /*
1035                                  * shift out the middle component (green) to
1036                                  * the high 16 bits, and process all three RGB
1037                                  * components at the same time.
1038                                  */
1039                                 s = (s | s << 16) & 0x03e07c1f;
1040                                 d = (d | d << 16) & 0x03e07c1f;
1041                                 d += (s - d) * alpha >> 5;
1042                                 d &= 0x03e07c1f;
1043                                 *dstp++ = (Uint16)(d | d >> 16);
1044                         }, width);
1045                         /* *INDENT-ON* */
1046             srcp += srcskip;
1047             dstp += dstskip;
1048         }
1049     }
1050 }
1051
1052 /* fast ARGB8888->RGB565 blending with pixel alpha */
1053 static void
1054 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
1055 {
1056     int width = info->dst_w;
1057     int height = info->dst_h;
1058     Uint32 *srcp = (Uint32 *) info->src;
1059     int srcskip = info->src_skip >> 2;
1060     Uint16 *dstp = (Uint16 *) info->dst;
1061     int dstskip = info->dst_skip >> 1;
1062
1063     while (height--) {
1064             /* *INDENT-OFF* */
1065             DUFFS_LOOP4({
1066                 Uint32 s = *srcp;
1067                 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1068                 /* FIXME: Here we special-case opaque alpha since the
1069                    compositioning used (>>8 instead of /255) doesn't handle
1070                    it correctly. Also special-case alpha=0 for speed?
1071                    Benchmark this! */
1072                 if(alpha) {   
1073                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1074                     *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
1075                   } else {
1076                     Uint32 d = *dstp;
1077                     /*
1078                      * convert source and destination to G0RAB65565
1079                      * and blend all components at the same time
1080                      */
1081                     s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1082                       + (s >> 3 & 0x1f);
1083                     d = (d | d << 16) & 0x07e0f81f;
1084                     d += (s - d) * alpha >> 5;
1085                     d &= 0x07e0f81f;
1086                     *dstp = (Uint16)(d | d >> 16);
1087                   }
1088                 }
1089                 srcp++;
1090                 dstp++;
1091             }, width);
1092             /* *INDENT-ON* */
1093         srcp += srcskip;
1094         dstp += dstskip;
1095     }
1096 }
1097
1098 /* fast ARGB8888->RGB555 blending with pixel alpha */
1099 static void
1100 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
1101 {
1102     int width = info->dst_w;
1103     int height = info->dst_h;
1104     Uint32 *srcp = (Uint32 *) info->src;
1105     int srcskip = info->src_skip >> 2;
1106     Uint16 *dstp = (Uint16 *) info->dst;
1107     int dstskip = info->dst_skip >> 1;
1108
1109     while (height--) {
1110             /* *INDENT-OFF* */
1111             DUFFS_LOOP4({
1112                 unsigned alpha;
1113                 Uint32 s = *srcp;
1114                 alpha = s >> 27; /* downscale alpha to 5 bits */
1115                 /* FIXME: Here we special-case opaque alpha since the
1116                    compositioning used (>>8 instead of /255) doesn't handle
1117                    it correctly. Also special-case alpha=0 for speed?
1118                    Benchmark this! */
1119                 if(alpha) {   
1120                   if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1121                     *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
1122                   } else {
1123                     Uint32 d = *dstp;
1124                     /*
1125                      * convert source and destination to G0RAB65565
1126                      * and blend all components at the same time
1127                      */
1128                     s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1129                       + (s >> 3 & 0x1f);
1130                     d = (d | d << 16) & 0x03e07c1f;
1131                     d += (s - d) * alpha >> 5;
1132                     d &= 0x03e07c1f;
1133                     *dstp = (Uint16)(d | d >> 16);
1134                   }
1135                 }
1136                 srcp++;
1137                 dstp++;
1138             }, width);
1139             /* *INDENT-ON* */
1140         srcp += srcskip;
1141         dstp += dstskip;
1142     }
1143 }
1144
1145 /* General (slow) N->N blending with per-surface alpha */
1146 static void
1147 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
1148 {
1149     int width = info->dst_w;
1150     int height = info->dst_h;
1151     Uint8 *src = info->src;
1152     int srcskip = info->src_skip;
1153     Uint8 *dst = info->dst;
1154     int dstskip = info->dst_skip;
1155     SDL_PixelFormat *srcfmt = info->src_fmt;
1156     SDL_PixelFormat *dstfmt = info->dst_fmt;
1157     int srcbpp = srcfmt->BytesPerPixel;
1158     int dstbpp = dstfmt->BytesPerPixel;
1159     Uint32 Pixel;
1160     unsigned sR, sG, sB;
1161     unsigned dR, dG, dB, dA;
1162     const unsigned sA = info->a;
1163
1164     if (sA) {
1165         while (height--) {
1166             /* *INDENT-OFF* */
1167             DUFFS_LOOP4(
1168             {
1169                 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1170                 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1171                 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1172                 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1173                 src += srcbpp;
1174                 dst += dstbpp;
1175             },
1176             width);
1177             /* *INDENT-ON* */
1178             src += srcskip;
1179             dst += dstskip;
1180         }
1181     }
1182 }
1183
1184 /* General (slow) colorkeyed N->N blending with per-surface alpha */
1185 static void
1186 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
1187 {
1188     int width = info->dst_w;
1189     int height = info->dst_h;
1190     Uint8 *src = info->src;
1191     int srcskip = info->src_skip;
1192     Uint8 *dst = info->dst;
1193     int dstskip = info->dst_skip;
1194     SDL_PixelFormat *srcfmt = info->src_fmt;
1195     SDL_PixelFormat *dstfmt = info->dst_fmt;
1196     Uint32 ckey = info->colorkey;
1197     int srcbpp = srcfmt->BytesPerPixel;
1198     int dstbpp = dstfmt->BytesPerPixel;
1199     Uint32 Pixel;
1200     unsigned sR, sG, sB;
1201     unsigned dR, dG, dB, dA;
1202     const unsigned sA = info->a;
1203
1204     while (height--) {
1205             /* *INDENT-OFF* */
1206             DUFFS_LOOP4(
1207             {
1208                 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1209                 if(sA && Pixel != ckey) {
1210                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1211                     DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1212                     ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1213                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1214                 }
1215                 src += srcbpp;
1216                 dst += dstbpp;
1217             },
1218             width);
1219             /* *INDENT-ON* */
1220         src += srcskip;
1221         dst += dstskip;
1222     }
1223 }
1224
1225 /* General (slow) N->N blending with pixel alpha */
1226 static void
1227 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
1228 {
1229     int width = info->dst_w;
1230     int height = info->dst_h;
1231     Uint8 *src = info->src;
1232     int srcskip = info->src_skip;
1233     Uint8 *dst = info->dst;
1234     int dstskip = info->dst_skip;
1235     SDL_PixelFormat *srcfmt = info->src_fmt;
1236     SDL_PixelFormat *dstfmt = info->dst_fmt;
1237     int srcbpp;
1238     int dstbpp;
1239     Uint32 Pixel;
1240     unsigned sR, sG, sB, sA;
1241     unsigned dR, dG, dB, dA;
1242
1243     /* Set up some basic variables */
1244     srcbpp = srcfmt->BytesPerPixel;
1245     dstbpp = dstfmt->BytesPerPixel;
1246
1247     while (height--) {
1248             /* *INDENT-OFF* */
1249             DUFFS_LOOP4(
1250             {
1251                 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1252                 if(sA) {
1253                     DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1254                     ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1255                     ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1256                 }
1257                 src += srcbpp;
1258                 dst += dstbpp;
1259             },
1260             width);
1261             /* *INDENT-ON* */
1262         src += srcskip;
1263         dst += dstskip;
1264     }
1265 }
1266
1267
1268 SDL_BlitFunc
1269 SDL_CalculateBlitA(SDL_Surface * surface)
1270 {
1271     SDL_PixelFormat *sf = surface->format;
1272     SDL_PixelFormat *df = surface->map->dst->format;
1273
1274     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1275     case SDL_COPY_BLEND:
1276         /* Per-pixel alpha blits */
1277         switch (df->BytesPerPixel) {
1278         case 1:
1279             return BlitNto1PixelAlpha;
1280
1281         case 2:
1282                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1283                     && sf->Gmask == 0xff00
1284                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1285                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1286                 if (df->Gmask == 0x7e0)
1287                     return BlitARGBto565PixelAlpha;
1288                 else if (df->Gmask == 0x3e0)
1289                     return BlitARGBto555PixelAlpha;
1290             }
1291             return BlitNtoNPixelAlpha;
1292
1293         case 4:
1294             if (sf->Rmask == df->Rmask
1295                 && sf->Gmask == df->Gmask
1296                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1297 #if defined(__MMX__) || defined(__3dNOW__)
1298                 if (sf->Rshift % 8 == 0
1299                     && sf->Gshift % 8 == 0
1300                     && sf->Bshift % 8 == 0
1301                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1302 #ifdef __3dNOW__
1303                     if (SDL_Has3DNow())
1304                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1305 #endif
1306 #ifdef __MMX__
1307                     if (SDL_HasMMX())
1308                         return BlitRGBtoRGBPixelAlphaMMX;
1309 #endif
1310                 }
1311 #endif /* __MMX__ || __3dNOW__ */
1312                 if (sf->Amask == 0xff000000) {
1313                     return BlitRGBtoRGBPixelAlpha;
1314                 }
1315             }
1316             return BlitNtoNPixelAlpha;
1317
1318         case 3:
1319         default:
1320             return BlitNtoNPixelAlpha;
1321         }
1322         break;
1323
1324     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1325         if (sf->Amask == 0) {
1326             /* Per-surface alpha blits */
1327             switch (df->BytesPerPixel) {
1328             case 1:
1329                 return BlitNto1SurfaceAlpha;
1330
1331             case 2:
1332                 if (surface->map->identity) {
1333                     if (df->Gmask == 0x7e0) {
1334 #ifdef __MMX__
1335                         if (SDL_HasMMX())
1336                             return Blit565to565SurfaceAlphaMMX;
1337                         else
1338 #endif
1339                             return Blit565to565SurfaceAlpha;
1340                     } else if (df->Gmask == 0x3e0) {
1341 #ifdef __MMX__
1342                         if (SDL_HasMMX())
1343                             return Blit555to555SurfaceAlphaMMX;
1344                         else
1345 #endif
1346                             return Blit555to555SurfaceAlpha;
1347                     }
1348                 }
1349                 return BlitNtoNSurfaceAlpha;
1350
1351             case 4:
1352                 if (sf->Rmask == df->Rmask
1353                     && sf->Gmask == df->Gmask
1354                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1355 #ifdef __MMX__
1356                     if (sf->Rshift % 8 == 0
1357                         && sf->Gshift % 8 == 0
1358                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
1359                         return BlitRGBtoRGBSurfaceAlphaMMX;
1360 #endif
1361                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1362                         return BlitRGBtoRGBSurfaceAlpha;
1363                     }
1364                 }
1365                 return BlitNtoNSurfaceAlpha;
1366
1367             case 3:
1368             default:
1369                 return BlitNtoNSurfaceAlpha;
1370             }
1371         }
1372         break;
1373
1374     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1375         if (sf->Amask == 0) {
1376             if (df->BytesPerPixel == 1) {
1377                 return BlitNto1SurfaceAlphaKey;
1378             } else {
1379                 return BlitNtoNSurfaceAlphaKey;
1380             }
1381         }
1382         break;
1383     }
1384
1385     return NULL;
1386 }
1387
1388 /* vi: set ts=4 sw=4 expandtab: */