Git init
author    Kibum Kim <kb0929.kim@samsung.com>
          Fri, 6 Jan 2012 15:50:43 +0000 (00:50 +0900)
committer Kibum Kim <kb0929.kim@samsung.com>
          Fri, 6 Jan 2012 15:50:43 +0000 (00:50 +0900)
116 files changed:
AUTHORS [new file with mode: 0644]
CODING_STYLE [new file with mode: 0644]
COPYING [new file with mode: 0644]
ChangeLog [new file with mode: 0644]
Makefile.am [new file with mode: 0644]
Makefile.win32 [new file with mode: 0644]
Makefile.win32.common [new file with mode: 0644]
NEWS [new file with mode: 0644]
README [new file with mode: 0644]
RELEASING [new file with mode: 0644]
TODO [new file with mode: 0644]
autogen.sh [new file with mode: 0755]
configure.ac [new file with mode: 0755]
debian/README.source [new file with mode: 0755]
debian/changelog [new file with mode: 0755]
debian/compat [new file with mode: 0755]
debian/control [new file with mode: 0755]
debian/copyright [new file with mode: 0755]
debian/libpixman-1-0-udeb.install [new file with mode: 0755]
debian/libpixman-1-0.install [new file with mode: 0755]
debian/libpixman-1-0.symbols [new file with mode: 0755]
debian/libpixman-1-dev.install [new file with mode: 0755]
debian/rules [new file with mode: 0755]
debian/watch [new file with mode: 0755]
demos/Makefile.am [new file with mode: 0644]
demos/alpha-test.c [new file with mode: 0644]
demos/clip-in.c [new file with mode: 0644]
demos/clip-test.c [new file with mode: 0644]
demos/composite-test.c [new file with mode: 0644]
demos/convolution-test.c [new file with mode: 0644]
demos/gradient-test.c [new file with mode: 0644]
demos/gtk-utils.c [new file with mode: 0644]
demos/gtk-utils.h [new file with mode: 0644]
demos/radial-test.c [new file with mode: 0644]
demos/screen-test.c [new file with mode: 0644]
demos/trap-test.c [new file with mode: 0644]
demos/tri-test.c [new file with mode: 0644]
packaging/pixman.spec [new file with mode: 0644]
pixman-1-uninstalled.pc.in [new file with mode: 0644]
pixman-1.pc.in [new file with mode: 0644]
pixman/Makefile.am [new file with mode: 0644]
pixman/Makefile.sources [new file with mode: 0644]
pixman/Makefile.win32 [new file with mode: 0644]
pixman/make-combine.pl [new file with mode: 0644]
pixman/pixman-access-accessors.c [new file with mode: 0644]
pixman/pixman-access.c [new file with mode: 0644]
pixman/pixman-accessor.h [new file with mode: 0644]
pixman/pixman-arm-common.h [new file with mode: 0644]
pixman/pixman-arm-detect-win32.asm [new file with mode: 0644]
pixman/pixman-arm-neon-asm-bilinear.S [new file with mode: 0644]
pixman/pixman-arm-neon-asm.S [new file with mode: 0644]
pixman/pixman-arm-neon-asm.h [new file with mode: 0644]
pixman/pixman-arm-neon.c [new file with mode: 0644]
pixman/pixman-arm-simd-asm.S [new file with mode: 0644]
pixman/pixman-arm-simd.c [new file with mode: 0644]
pixman/pixman-bits-image.c [new file with mode: 0644]
pixman/pixman-combine.c.template [new file with mode: 0644]
pixman/pixman-combine.h.template [new file with mode: 0644]
pixman/pixman-compiler.h [new file with mode: 0644]
pixman/pixman-conical-gradient.c [new file with mode: 0644]
pixman/pixman-cpu.c [new file with mode: 0644]
pixman/pixman-edge-accessors.c [new file with mode: 0644]
pixman/pixman-edge-imp.h [new file with mode: 0644]
pixman/pixman-edge.c [new file with mode: 0644]
pixman/pixman-fast-path.c [new file with mode: 0644]
pixman/pixman-general.c [new file with mode: 0644]
pixman/pixman-gradient-walker.c [new file with mode: 0644]
pixman/pixman-image.c [new file with mode: 0644]
pixman/pixman-implementation.c [new file with mode: 0644]
pixman/pixman-inlines.h [new file with mode: 0644]
pixman/pixman-linear-gradient.c [new file with mode: 0644]
pixman/pixman-matrix.c [new file with mode: 0644]
pixman/pixman-mmx.c [new file with mode: 0644]
pixman/pixman-noop.c [new file with mode: 0644]
pixman/pixman-private.h [new file with mode: 0644]
pixman/pixman-radial-gradient.c [new file with mode: 0644]
pixman/pixman-region.c [new file with mode: 0644]
pixman/pixman-region16.c [new file with mode: 0644]
pixman/pixman-region32.c [new file with mode: 0644]
pixman/pixman-solid-fill.c [new file with mode: 0644]
pixman/pixman-sse2.c [new file with mode: 0644]
pixman/pixman-timer.c [new file with mode: 0644]
pixman/pixman-trap.c [new file with mode: 0644]
pixman/pixman-utils.c [new file with mode: 0644]
pixman/pixman-version.h.in [new file with mode: 0644]
pixman/pixman-vmx.c [new file with mode: 0644]
pixman/pixman.c [new file with mode: 0644]
pixman/pixman.h [new file with mode: 0644]
pixman/refactor [new file with mode: 0644]
pixman/solaris-hwcap.mapfile [new file with mode: 0644]
test/Makefile.am [new file with mode: 0755]
test/Makefile.sources [new file with mode: 0644]
test/Makefile.win32 [new file with mode: 0755]
test/a1-trap-test.c [new file with mode: 0644]
test/affine-test.c [new file with mode: 0755]
test/alpha-loop.c [new file with mode: 0644]
test/alphamap.c [new file with mode: 0644]
test/blitters-test.c [new file with mode: 0755]
test/composite-traps-test.c [new file with mode: 0755]
test/composite.c [new file with mode: 0755]
test/fetch-test.c [new file with mode: 0755]
test/fuzzer-find-diff.pl [new file with mode: 0644]
test/gradient-crash-test.c [new file with mode: 0644]
test/lowlevel-blt-bench.c [new file with mode: 0644]
test/oob-test.c [new file with mode: 0644]
test/pdf-op-test.c [new file with mode: 0644]
test/region-contains-test.c [new file with mode: 0644]
test/region-test.c [new file with mode: 0644]
test/region-translate-test.c [new file with mode: 0644]
test/scaling-crash-test.c [new file with mode: 0644]
test/scaling-helpers-test.c [new file with mode: 0755]
test/scaling-test.c [new file with mode: 0755]
test/stress-test.c [new file with mode: 0755]
test/trap-crasher.c [new file with mode: 0755]
test/utils.c [new file with mode: 0755]
test/utils.h [new file with mode: 0755]

diff --git a/AUTHORS b/AUTHORS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/CODING_STYLE b/CODING_STYLE
new file mode 100644 (file)
index 0000000..9f5171d
--- /dev/null
+++ b/CODING_STYLE
@@ -0,0 +1,199 @@
+Pixman coding style.
+====================
+
+The pixman coding style is close to cairo's with one exception: braces
+go on their own line, rather than on the line of the if/while/for:
+
+       if (condition)
+       {
+           do_something();
+           do_something_else();
+       }
+
+not
+
+       if (condition) {
+           do_something();
+           do_something_else();
+       }
+
+
+
+Indentation
+===========
+
+Each new level is indented four spaces:
+
+       if (condition)
+           do_something();
+
+This may be achieved with space characters or with a combination of
+tab characters and space characters. Tab characters are interpreted as
+
+       Advance to the next column which is a multiple of 8.
+
+
+Names
+=====
+
+In all names, words are separated with underscores. Do not use
+CamelCase for any names.
+
+Macros have ALL_CAPITAL_NAMES.
+
+Type names are in lower case and end with "_t". For example,
+pixman_image_t.
+
+Labels, functions and variables have lower case names.
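+
+For example (these identifiers are illustrations only):
+
+       typedef struct my_thing my_thing_t;     /* type */
+
+       #define MAX_BUFFER_SIZE 1024            /* macro */
+
+       static void
+       compute_extents (int n_boxes);          /* function */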
+
+
+Braces
+======
+
+Braces always go on their own line:
+
+       if (condition)
+       {
+           do_this ();
+           do_that ();
+       }
+       else
+       {
+           do_the_other ();
+       }
+
+Rules for braces and substatements of if/while/for/do (an example
+follows the list):
+
+* If a substatement spans multiple lines, then there must be braces
+  around it.
+
+* If the condition of an if/while/for spans multiple lines, then 
+  braces must be used for the substatements.
+
+* If one substatement of an if statement has braces, then the other
+  must too.
+
+* Otherwise, don't add braces.
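+
+For example, this (hypothetical) snippet follows the rules above:
+
+       if (width > 0 &&
+           height > 0)
+       {
+           fill_rect (width, height);
+       }
+       else
+       {
+           do_nothing ();
+       }
+
+       while (count-- > 0)
+           step ();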
+
+
+Comments
+========
+
+Write comments either like this:
+
+        /* One line comment */
+
+or like this:
+
+       /* This is a multi-line comment
+        *
+        * It extends over multiple lines
+        */
+
+Generally comments should say things that aren't clear from the code
+itself. If too many comments say obvious things, then people will just
+stop reading all comments, including the good ones.
+
+
+Whitespace
+==========
+
+* Put a single space after commas
+
+* Put spaces around arithmetic operators such as +, -, *, /:
+
+        y * stride + x
+
+        x / unit_x
+
+* Do not put spaces after the address-of operator, the * when used as
+  a pointer dereference, or the ! and ~ operators:
+
+     &foo;
+
+     ~0x00000000
+
+     !condition
+
+     *result = 100
+
+* Break up long lines (> ~80 characters) and use whitespace to align
+  things nicely. This is one way:
+
+        some_very_long_function_name (
+               implementation, op, src, mask, dest, 
+               src_x, src_y, mask_x, mask_y, dest_x, dest_y,
+               width, height);
+
+  This is another:
+
+        some_very_long_function_name (implementation, op,
+                                      src, mask, dest,
+                                      src_x, src_y,
+                                      mask_x, mask_y,
+                                      dest_x, dest_y,
+                                      width, height);
+
+* Separate logically distinct chunks with a single newline. This
+  obviously applies between functions, but also applies within a
+  function or block or structure definition.
+
+* Use a newline after a block of variable declarations.
+
+* Use a single space before a left parenthesis, except where the
+  standard will not allow it (e.g., when defining a parameterized macro).
+
+* Don't eliminate newlines just because things would still fit on one
+  line. This breaks the expected visual structure of the code making
+  it much harder to read and understand:
+
+       if (condition) foo (); else bar ();     /* Yuck! */
+
+
+Function Definitions
+====================
+
+Function definitions should take the following form:
+
+       void
+       my_function (int argument)
+       {
+           do_my_things ();
+       }
+
+If all the parameters to a function fit naturally on one line, format
+them that way. Otherwise, put one argument on each line, adding
+whitespace so that the parameter names are aligned with each other.
+
+I.e., do either this:
+
+        void
+        short_arguments (const char *str, int x, int y, int z)
+        {
+        }
+
+or this:
+
+       void
+       long_arguments (const char *char_star_arg,
+                       int         int_arg,
+                       double     *double_star_arg,
+                       double      double_arg)
+       {
+       }
+
+
+Mode lines
+==========
+
+Given the rules above, what is the best way to simplify one's life as
+a code monkey? Get your editor to do most of the tedious work of
+beautifying your code!
+
+As a reward for reading this far, here are some mode lines for the more
+popular editors:
+/*
+ * vim:sw=4:sts=4:ts=8:tw=78:fo=tcroq:cindent:cino=\:0,(0
+ * vim:isk=a-z,A-Z,48-57,_,.,-,>
+ */
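+
+For Emacs, an equivalent set of file-local variables (an untested
+sketch) would be:
+
+/*
+ * Local Variables:
+ * indent-tabs-mode: t
+ * tab-width: 8
+ * c-basic-offset: 4
+ * End:
+ */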
+
diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..6168dea
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,42 @@
+The following is the MIT license, agreed upon by most contributors.
+Copyright holders of new code should use this license statement where
+possible. They may also add themselves to the list below.
+
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * Copyright 1987, 1988, 1989 Digital Equipment Corporation
+ * Copyright 1999, 2004, 2008 Keith Packard
+ * Copyright 2000 SuSE, Inc.
+ * Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright 2004, 2005, 2007, 2008, 2009, 2010 Red Hat, Inc.
+ * Copyright 2004 Nicholas Miell
+ * Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright 2005 Trolltech AS
+ * Copyright 2007 Luca Barbato
+ * Copyright 2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright 2008 Rodrigo Kumpera
+ * Copyright 2008 André Tupinambá
+ * Copyright 2008 Mozilla Corporation
+ * Copyright 2008 Frederic Plourde
+ * Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2009, 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
diff --git a/ChangeLog b/ChangeLog
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/Makefile.am b/Makefile.am
new file mode 100644 (file)
index 0000000..ff87e26
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,131 @@
+SUBDIRS = pixman demos test
+
+pkgconfigdir=$(libdir)/pkgconfig
+pkgconfig_DATA=pixman-1.pc
+
+$(pkgconfig_DATA): pixman-1.pc.in
+
+snapshot:
+       distdir="$(distdir)-`date '+%Y%m%d'`"; \
+       test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
+       $(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
+
+GPGKEY=6FF7C1A8
+USERNAME=$$USER
+RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
+RELEASE_CAIRO_HOST =   $(USERNAME)@cairographics.org
+RELEASE_CAIRO_DIR =    /srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_CAIRO_URL =    http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_XORG_URL =     http://xorg.freedesktop.org/archive/individual/lib
+RELEASE_XORG_HOST =    $(USERNAME)@xorg.freedesktop.org
+RELEASE_XORG_DIR =     /srv/xorg.freedesktop.org/archive/individual/lib
+RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
+
+tar_gz = $(PACKAGE)-$(VERSION).tar.gz
+tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
+
+sha1_tgz = $(tar_gz).sha1
+md5_tgz = $(tar_gz).md5
+
+sha1_tbz2 = $(tar_bz2).sha1
+md5_tbz2 = $(tar_bz2).md5
+
+gpg_file = $(sha1_tgz).asc
+
+$(sha1_tgz): $(tar_gz)
+       sha1sum $^ > $@
+
+$(md5_tgz): $(tar_gz)
+       md5sum $^ > $@
+
+$(sha1_tbz2): $(tar_bz2)
+       sha1sum $^ > $@
+
+$(md5_tbz2): $(tar_bz2)
+       md5sum $^ > $@
+
+$(gpg_file): $(sha1_tgz)
+       @echo "Please enter your GPG password to sign the checksum."
+       gpg --armor --sign $^ 
+
+HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
+
+release-verify-newer:
+       @echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
+       @ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
+               || (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
+               && echo "Refusing to try to generate a new release of the same name." \
+               && false)
+       @ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
+               || (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
+               && echo "Refusing to try to generate a new release of the same name." \
+               && false)
+       @echo "Good."
+
+release-remove-old:
+       $(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
+
+ensure-prev:
+       @if test "x$(PREV)" = "x"; then                                                 \
+               echo ""                                                           &&    \
+               echo "You must set the PREV variable on the make command line to" &&    \
+               echo "the last version."                                          &&    \
+               echo ""                                                           &&    \
+               echo "For example:"                                               &&    \
+               echo "      make PREV=0.7.3"                                      &&    \
+               echo ""                                                           &&    \
+               false;                                                                  \
+       fi
+
+release-check: ensure-prev release-verify-newer release-remove-old distcheck
+
+release-tag:
+       git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
+
+release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
+       scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
+       scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
+       ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
+
+release-publish-message: $(HASHFILES) ensure-prev
+       @echo "Please follow the instructions in RELEASING to push stuff out and"
+       @echo "send out the announcement mails.  Here is the excerpt you need:"
+       @echo ""
+       @echo "Lists:  $(RELEASE_ANNOUNCE_LIST)"
+       @echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
+       @echo "============================== CUT HERE =============================="
+       @echo "A new $(PACKAGE) release $(VERSION) is now available"
+       @echo ""
+       @echo "tar.gz:"
+       @echo " $(RELEASE_CAIRO_URL)/$(tar_gz)"
+       @echo " $(RELEASE_XORG_URL)/$(tar_gz)"
+       @echo ""
+       @echo "tar.bz2:"
+       @echo " $(RELEASE_XORG_URL)/$(tar_bz2)"
+       @echo ""
+       @echo "Hashes:"
+       @echo -n "      MD5:  "
+       @cat $(md5_tgz)
+       @echo -n "      MD5:  "
+       @cat $(md5_tbz2)
+       @echo -n "      SHA1: "
+       @cat $(sha1_tgz)
+       @echo -n "      SHA1: "
+       @cat $(sha1_tbz2)
+       @echo ""
+       @echo "GPG signature:"
+       @echo " $(RELEASE_CAIRO_URL)/$(gpg_file)"
+       @echo " (signed by `git config --get user.name` <`git config --get user.email`>)"
+       @echo ""
+       @echo "Git:"
+       @echo " git://git.freedesktop.org/git/pixman"
+       @echo " tag: $(PACKAGE)-$(VERSION)"
+       @echo ""
+       @echo "Log:"
+       @git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
+       @echo "============================== CUT HERE =============================="
+       @echo ""
+
+release-publish: release-upload release-tag release-publish-message
+
+.PHONY: release-upload release-publish release-publish-message release-tag
diff --git a/Makefile.win32 b/Makefile.win32
new file mode 100644 (file)
index 0000000..91cd12a
--- /dev/null
+++ b/Makefile.win32
@@ -0,0 +1,25 @@
+default: all
+
+top_srcdir = .
+include $(top_srcdir)/Makefile.win32.common
+
+# Recursive targets
+pixman_r:
+       @$(MAKE) -C pixman -f Makefile.win32
+
+test_r:
+       @$(MAKE) -C test -f Makefile.win32
+
+clean_r:
+       @$(MAKE) -C pixman -f Makefile.win32 clean
+       @$(MAKE) -C test   -f Makefile.win32 clean
+
+check_r:
+       @$(MAKE) -C test -f Makefile.win32 check
+
+# Base targets
+all: test_r
+
+clean: clean_r
+
+check: check_r
diff --git a/Makefile.win32.common b/Makefile.win32.common
new file mode 100644 (file)
index 0000000..56c3593
--- /dev/null
+++ b/Makefile.win32.common
@@ -0,0 +1,54 @@
+LIBRARY = pixman-1
+
+CC = cl
+LD = link
+AR = lib
+PERL = perl
+
+ifeq ($(top_builddir),)
+top_builddir = $(top_srcdir)
+endif
+
+CFG_VAR = $(CFG)
+ifeq ($(CFG_VAR),)
+CFG_VAR = release
+endif
+
+ifeq ($(CFG_VAR),debug)
+CFG_CFLAGS  = -MDd -Od -Zi
+CFG_LDFLAGS = -DEBUG
+else
+CFG_CFLAGS  = -MD -O2
+CFG_LDFLAGS =
+endif
+
+# Package definitions, to be used instead of those provided in config.h
+PKG_CFLAGS  = -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
+
+BASE_CFLAGS = -nologo -I. -I$(top_srcdir) -I$(top_srcdir)/pixman
+
+PIXMAN_CFLAGS  = $(BASE_CFLAGS) $(PKG_CFLAGS) $(CFG_CFLAGS) $(CFLAGS)
+PIXMAN_LDFLAGS = -nologo $(CFG_LDFLAGS) $(LDFLAGS)
+PIXMAN_ARFLAGS = -nologo $(LDFLAGS)
+
+
+inform:
+ifneq ($(CFG),release)
+ifneq ($(CFG),debug)
+ifneq ($(CFG),)
+       @echo "Invalid configuration option specified: "$(CFG)"."
+       @echo
+       @echo "Possible choices for configuration are 'release' and 'debug'"
+       @exit 1
+endif
+       @echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
+endif
+endif
+
+
+$(CFG_VAR)/%.obj: %.c $(BUILT_SOURCES)
+       @mkdir -p $(CFG_VAR)
+       @$(CC) -c $(PIXMAN_CFLAGS) -Fo"$@" $<
+
+clean: inform
+       @$(RM) $(CFG_VAR)/*.{exe,ilk,lib,obj,pdb} $(BUILT_SOURCES) || exit 0
diff --git a/NEWS b/NEWS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..3cfbc50
--- /dev/null
+++ b/README
@@ -0,0 +1,22 @@
+pixman is a library that provides low-level pixel manipulation
+features such as image compositing and trapezoid rasterization.
+
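+As a quick taste of the API (a minimal sketch, not part of this
+README originally), the following fills a small ARGB buffer with
+solid red using an OVER composite:
+
+        #include <stdint.h>
+        #include <string.h>
+        #include <pixman.h>
+
+        int
+        main (void)
+        {
+            uint32_t bits[16 * 16];
+            pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
+            pixman_image_t *src, *dest;
+
+            memset (bits, 0, sizeof bits);
+
+            src = pixman_image_create_solid_fill (&red);
+            dest = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                             16, 16, bits, 16 * 4);
+
+            pixman_image_composite (PIXMAN_OP_OVER, src, NULL, dest,
+                                    0, 0, 0, 0, 0, 0, 16, 16);
+
+            pixman_image_unref (src);
+            pixman_image_unref (dest);
+            return 0;
+        }
+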
+All questions regarding this software should be directed to the pixman
+mailing list:
+
+        http://lists.freedesktop.org/mailman/listinfo/pixman
+
+Please send patches and bug reports either to the mailing list above,
+or file them at the freedesktop bug tracker:
+
+        https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+
+The master development code repository can be found at:
+
+       git://anongit.freedesktop.org/git/pixman
+
+       http://gitweb.freedesktop.org/?p=pixman;a=summary
+
+For more information on the git code manager, see:
+
+       http://wiki.x.org/wiki/GitPage
diff --git a/RELEASING b/RELEASING
new file mode 100644 (file)
index 0000000..fbe1581
--- /dev/null
+++ b/RELEASING
@@ -0,0 +1,57 @@
+Here are the steps to follow to create a new pixman release:
+
+1) Ensure that there are no uncommitted changes or unpushed commits,
+   and that you are up to date with the latest commits in the central
+   repository. Here are a couple of useful commands:
+
+       git diff                        (no output)
+       
+       git status                      (should report "nothing to commit")
+
+       git log master...origin         (no output; note: *3* dots)
+
+2) Increment pixman_(major|minor|micro) in configure.ac according to
+   the directions in that file.
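+
+   For example, turning the in-git snapshot 0.23.7 into the 0.23.8
+   development release means changing
+
+       m4_define([pixman_micro], 7)
+
+   in configure.ac to
+
+       m4_define([pixman_micro], 8)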
+
+3) Make sure that the new version works, including
+
+       - make distcheck passes
+
+       - the X server still works with the new pixman version
+         installed
+
+       - the cairo test suite hasn't gained any new failures compared
+         to the last pixman version.
+
+4) Use "git commit" to record the changes made in steps 2 and 3.
+
+5) Generate and publish the tar files by running 
+
+       make PREV=<last version> GPGKEY=<your gpg key id> release-publish
+
+   If your freedesktop user name is different from your local one,
+   then also set the variable USER to your freedesktop user name.
+
+6) Run 
+
+       make release-publish-message
+
+   to generate a draft release announcement. Edit it as appropriate and
+   send it to 
+
+       cairo-announce@cairographics.org
+
+       pixman@lists.freedesktop.org
+
+       xorg-announce@lists.freedesktop.org
+
+7) Increment pixman_micro to the next larger (odd) number in
+   configure.ac. Commit this change, and push all commits created
+   during this process using
+
+       git push
+       git push --tags
+
+   You must use "--tags" here; otherwise the new tag will not
+   be pushed out.
+
diff --git a/TODO b/TODO
new file mode 100644 (file)
index 0000000..4434ec7
--- /dev/null
+++ b/TODO
@@ -0,0 +1,271 @@
+  - Testing
+    - Test implementations against each other
+    - Test both with and without the operator strength reduction.
+      They should be identical.
+
+  - SSE 2 issues:
+
+      - Use MM_HINT_NTA instead of MM_HINT_T0
+
+      - Use of fbCompositeOver_x888x8x8888sse2()
+
+  - Update the RELEASING file
+
+  - Things to keep in mind if breaking ABI:
+
+      - There should be a guard #ifndef I_AM_EITHER_CAIRO_OR_THE_X_SERVER
+
+      - X server will require 16.16 essentially forever. Can we get
+        the required precision by simply adding offset_x/y to the
+        relevant rendering API?
+
+      - Get rid of workaround for X server bug.
+
+      - pixman_image_set_indexed() should copy its argument, and X
+        should be ported over to use a pixman_image as the
+        representation of a Picture, rather than creating one on each
+        operation.
+
+      - We should get rid of pixman_set_static_pointers()
+
+      - We should get rid of the various trapezoid helper functions.
+        (They only exist because they are theoretically available to
+        drivers).
+
+      - 16 bit regions should be deleted
+
+      - There should only be one trap rasterization API.
+
+      - The PIXMAN_g8/c8/etc formats should use the A channel
+        to indicate the actual depth. That way PIXMAN_x4c4 and PIXMAN_c8
+       won't collide.
+
+  - Maybe bite the bullet and make configure.ac generate a pixman-types.h
+    file that can be included from pixman.h to avoid the #ifdef magic
+    in pixman.h
+
+  - Make pixman_region_point_in() survive a NULL box, then fix up
+    pixman-compose.c
+
+      - Possibly look into inlining the fetch functions
+
+  - There is a bug with source clipping demonstrated by clip-test in the
+    test directory. If we interpret source clipping as given in
+    destination coordinates, which is probably the only sane choice,
+    then the result should have two red bars down the sides.
+    
+  - Test suite
+
+  - Add a general way of dealing with architecture specific
+    fast-paths.  The current idea is to have each operation that can
+    be optimized be called through a function pointer that is
+    initially set to an initialization function responsible for
+    setting the function pointer to the appropriate fast path, as
+    sketched below.
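+
+    A minimal sketch of that idea (all names are invented here for
+    illustration):
+
+        static int have_sse2 (void) { return 0; }   /* CPU check stub */
+
+        static void composite_c    (int x) { (void)x; }  /* fallback  */
+        static void composite_sse2 (int x) { (void)x; }  /* fast path */
+
+        static void composite_init (int x);
+
+        /* Starts out pointing at the initializer... */
+        static void (* composite) (int x) = composite_init;
+
+        /* ...which installs the best implementation, then retries. */
+        static void
+        composite_init (int x)
+        {
+            composite = have_sse2 () ? composite_sse2 : composite_c;
+            composite (x);
+        }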
+
+  - Go through things marked FIXME
+
+  - Add calls to prepare and finish access where necessary.  grep for
+    ACCESS_MEM, and make sure they are correctly wrapped in prepare
+    and finish.
+
+  - restore READ/WRITE in the fbcompose combiners since they sometimes
+    store directly to destination drawables.
+
+  - It probably makes sense to move the more strange X region API
+    into pixman as well, but guarded with PIXMAN_XORG_COMPATIBILITY
+
+  - Reinstate the FbBits typedef? At the moment we don't
+    even have the FbBits type; we just use uint32_t everywhere.
+
+    Keith says in bug 2335:
+
+        The 64-bit code in fb (pixman) is probably broken; it hasn't been
+        used in quite some time as PCI (and AGP) is 32-bits wide, so
+        doing things 64-bits at a time is a net loss.  To quickly fix
+        this, I suggest just using 32-bit datatypes by setting
+        IC_SHIFT to 5 for all machines.
+
+  - Consider optimizing the 8/16 bit solid fills in pixman-util.c by
+    storing more than one value at a time.
+
+  - Add an image cache to prevent excessive malloc/free. Note that pixman
+    needs to be thread safe when used from cairo.
+
+  - Moving to 24.8 coordinates. This is tricky because X is still
+    defined as 16.16 and will be basically forever. It's possible we
+    could do this by adding extra offset_x/y parameters to the
+    trapezoid calls. The X server could then just call the API with
+    (0, 0). Cairo would have to make sure that the delta *within* a
+    batch of trapezoids does not exceed 16 bit.
+
+  - Consider adding actual backends. Brain dump:
+
+    A backend is something that knows how to
+
+      - Create images
+      - Composite three images
+      - Rasterize trapezoids
+      - Do solid fills and blits
+
+    These operations are provided by a vtable that the backend will
+    create when it is initialized. Initial backends:
+
+      - VMX
+      - SSE2
+      - MMX
+      - Plain Old C
+
+    When the SIMD backends are initialized, they will be passed a
+    pointer to the Plain Old C backend that they can use for fallback
+    purposes.
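+
+    A rough sketch of what such a backend vtable might look like (all
+    names invented here, not actual pixman API):
+
+        #include <stdint.h>
+
+        typedef struct backend backend_t;
+
+        struct backend
+        {
+            void (* composite)       (backend_t *self,
+                                      void      *src,
+                                      void      *mask,
+                                      void      *dest);
+            void (* rasterize_traps) (backend_t *self,
+                                      void      *dest);
+            void (* solid_fill)      (backend_t *self,
+                                      void      *dest,
+                                      uint32_t   color);
+
+            backend_t *fallback;  /* e.g. the Plain Old C backend */
+        };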
+
+    Images would gain a vtable as well that would contain things like
+
+      - Read scanline
+      - Write scanline
+
+    (Or even read_patch/write_patch as suggested by Keith a while
+    back).
+
+    This could simplify the compositing code considerably.
+
+  - Review the pixman_format_code_t enum to make sure it will support
+    future formats. Some formats we will probably need:
+
+          ARGB/ABGR with 16/32/64 bit integer/floating channels
+          YUV2,
+          YV12
+
+    Also we may need the ability to distinguish between PICT_c8 and
+    PICT_x4c4. (This could be done by interpreting the A channel as
+    the depth for TYPE_COLOR and TYPE_GRAY formats).
+
+    A possibility may be to reserve the two top bits and make them
+    encode the number of places to shift the given channel widths.
+    Since these bits are 00 at the moment, everything will continue
+    to work, but these additional widths will be allowed:
+
+            All even widths between 18-32
+            All multiples of four widths between 33 and 64
+            All multiples of eight between 64 and 128
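+
+    (Worked example: with the top bits set to 01, a stored channel
+    width of 11 would be interpreted as 11 << 1 = 22, so a format
+    such as r22g22b22 becomes expressible.)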
+
+    This means things like r21g22b21 won't work - is that worth
+    worrying about? I don't think so. And of course the bpp field
+    can't handle a depth of over 256, so > 64 bit channels aren't
+    really all that useful.
+
+    We could reserve one extra bit to indicate floating point, but
+    we may also just add 
+
+          PIXMAN_TYPE_ARGB_FLOAT
+          PIXMAN_TYPE_BGRA_FLOAT
+          PIXMAN_TYPE_A_FLOAT
+
+    image types. With five bits we can support up to 32 different
+    format types, which should be enough for everybody, even if we
+    decide to support all the various video formats here:
+
+               http://www.fourcc.org/yuv.php
+
+    It may make sense to have a PIXMAN_TYPE_YUV, and then use the
+    channel bits to specify the exact subtype.
+
+    Another possibility is to add 
+
+         PIXMAN_TYPE_ARGB_W
+         PIXMAN_TYPE_ARGB_WW
+    
+    where the channel widths would get 16 and 32 added to them,
+    respectively.
+
+    What about color spaces such as linear vs. sRGB, etc.?
+
+
+done:
+
+- Use pixmanFillsse2 and pixmanBltsse2
+
+- Be consistent about calling sse2 sse2
+
+- Rename "SSE" to "MMX_EXTENSIONS". (Deleted mmx extensions).
+
+- Commented-out uses of fbCompositeCopyAreasse2()
+
+- Consider whether calling regions region16 is really such a great
+  idea. Vlad wants 32 bit regions for Cairo. This will break X server
+  ABI, but should otherwise be mostly harmless, though a
+  pixman_region_get_boxes16() may be useful.
+
+- Altivec signal issue (Company has fix, there is also a patch by
+  dwmw2 in rawhide).
+
+- Behdad's MMX issue - see list
+
+- SSE2 issues:
+    - Crashes in Mozilla because of unaligned stack. Possible fixes
+        - Make use of gcc 4.2 feature to align the stack
+        - Write some sort of trampoline that aligns the stack
+          before calling SSE functions.
+
+- Get rid of the switch-of-doom; replace it with a big table
+  describing the various fast paths.
+
+- Make source clipping optional.
+    - done: source clipping happens through an indirection.
+        still needs to make the indirection settable. (And call it
+        from X)
+
+- Run cairo test suite; fix bugs
+       - one bug in source-scale-clip
+
+ - Remove the warning suppression in the ACCESS_MEM macro and fix the
+    warnings that are real
+       - irrelevant now.
+
+- make the wrapper functions global instead of image specific
+       - this won't work since pixman is linked to both fb and wfb
+
+- Add non-mmx solid fill
+
+- Make sure the endian-ness macros are defined correctly.
+
+- The rectangles in a region probably shouldn't be returned const as
+  the X server will be changing them.
+
+- Right now we _always_ have a clip region, which is empty by default.
+  Why does this work at all? It probably doesn't. The server
+  distinguishes two cases, one where nothing is clipped (CT_NONE), and
+  one where there is a clip region (CT_REGION).
+
+- Default clip region should be the full image
+
+  - Test if pseudo color still works. It does, but it also shows that
+    copying a pixman_indexed_t on every composite operation is not
+    going to fly. So, for now set_indexed() does not copy the 
+    indexed table. 
+
+    Also just the malloc() to allocate a pixman image shows up pretty
+    high.
+
+    Options include
+
+      - Make all the setters not copy their arguments
+
+      - Possibly combined with going back to the stack allocated 
+        approach that we already use for regions.
+
+      - Keep a cached pixman_image_t around for every picture. It would
+        have to be kept up to date every time something changes about
+        the picture.
+
+      - Break the X server ABI and simply have the relevant parameter
+        stored in the pixman image. This would have the additional benefits
+        that:
+
+          - We can get rid of the annoying repeat field which is duplicated
+            elsewhere.
+
+          - We can use pixman_color_t and pixman_gradient_stop_t
+            etc. instead of the types that are defined in
+            renderproto.h
+
diff --git a/autogen.sh b/autogen.sh
new file mode 100755 (executable)
index 0000000..354f254
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,12 @@
+#! /bin/sh
+
+srcdir=`dirname $0`
+test -z "$srcdir" && srcdir=.
+
+ORIGDIR=`pwd`
+cd $srcdir
+
+autoreconf -v --install || exit 1
+cd $ORIGDIR || exit $?
+
+$srcdir/configure "$@"
diff --git a/configure.ac b/configure.ac
new file mode 100755 (executable)
index 0000000..6c88c84
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,895 @@
+dnl  Copyright 2005 Red Hat, Inc.
+dnl 
+dnl  Permission to use, copy, modify, distribute, and sell this software and its
+dnl  documentation for any purpose is hereby granted without fee, provided that
+dnl  the above copyright notice appear in all copies and that both that
+dnl  copyright notice and this permission notice appear in supporting
+dnl  documentation, and that the name of Red Hat not be used in
+dnl  advertising or publicity pertaining to distribution of the software without
+dnl  specific, written prior permission.  Red Hat makes no
+dnl  representations about the suitability of this software for any purpose.  It
+dnl  is provided "as is" without express or implied warranty.
+dnl 
+dnl  RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+dnl  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+dnl  EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+dnl  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+dnl  DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+dnl  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+dnl  PERFORMANCE OF THIS SOFTWARE.
+dnl
+dnl Process this file with autoconf to create configure.
+
+AC_PREREQ([2.57])
+
+#   Pixman versioning scheme
+#
+#   - The version in git has an odd MICRO version number
+#
+#   - Released versions, both development and stable, have an
+#     even MICRO version number
+#
+#   - Released development versions have an odd MINOR number
+#
+#   - Released stable versions have an even MINOR number
+#
+#   - Versions that break ABI must have a new MAJOR number
+#
+#   - If you break the ABI, then at least this must be done:
+#
+#        - increment MAJOR
+#
+#        - In the first development release where you break ABI, find
+#          all instances of "pixman-n" and change them to pixman-(n+1)
+#
+#          This needs to be done at least in 
+#                    configure.ac
+#                    all Makefile.am's
+#                    pixman-n.pc.in
+#
+#      This ensures that binary incompatible versions can be installed
+#      in parallel.  See http://www106.pair.com/rhp/parallel.html for
+#      more information
+#
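+#   As a concrete reading of these rules: the 0.23.7 defined just
+#   below is an in-git snapshot (odd MICRO) on a development series
+#   (odd MINOR); the corresponding stable series would be 0.24.x.
+#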
+
+m4_define([pixman_major], 0)
+m4_define([pixman_minor], 23)
+m4_define([pixman_micro], 7)
+
+m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
+
+AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+
+# Suppress verbose compile lines
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AM_CONFIG_HEADER(config.h)
+
+AC_CANONICAL_HOST
+
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
+AC_PROG_CC
+AM_PROG_AS
+AC_PROG_LIBTOOL
+AC_CHECK_FUNCS([getisax])
+AC_C_BIGENDIAN
+AC_C_INLINE
+
+dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
+dnl
+dnl Compiles and links the given program in the environment set up by env-setup
+dnl and executes true-action on success and false-action on failure.
+AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
+       save_CFLAGS="$CFLAGS"
+       save_LDFLAGS="$LDFLAGS"
+       save_LIBS="$LIBS"
+       CFLAGS=""
+       LDFLAGS=""
+       LIBS=""
+       $1
+       AC_LINK_IFELSE(
+               [AC_LANG_SOURCE([$2])],
+               [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+                pixman_cc_flag=yes],
+               [pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+                pixman_cc_flag=no])
+
+       if test "x$pixman_cc_stderr" != "x"; then
+               pixman_cc_flag=no
+       fi
+
+       if test "x$pixman_cc_flag" = "xyes"; then
+               ifelse([$3], , :, [$3])
+       else
+               ifelse([$4], , :, [$4])
+       fi
+       CFLAGS="$save_CFLAGS"
+       LDFLAGS="$save_LDFLAGS"
+       LIBS="$save_LIBS"
+])
+
+dnl Find a -Werror for catching warnings.
+WERROR=
+for w in -Werror -errwarn; do
+    if test "z$WERROR" = "z"; then
+        AC_MSG_CHECKING([whether the compiler supports $w])
+        PIXMAN_LINK_WITH_ENV(
+               [CFLAGS=$w],
+               [int main(int c, char **v) { (void)c; (void)v; return 0; }],
+               [WERROR=$w; yesno=yes], [yesno=no])
+       AC_MSG_RESULT($yesno)
+    fi
+done
+
+dnl PIXMAN_CHECK_CFLAG(flag, [program])
+dnl  Adds flag to CFLAGS if the given program links without warnings or errors.
+AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
+       AC_MSG_CHECKING([whether the compiler supports $1])
+       PIXMAN_LINK_WITH_ENV(
+               [CFLAGS="$WERROR $1"],
+               [$2
+                int main(int c, char **v) { (void)c; (void)v; return 0; }
+               ],
+               [_yesno=yes],
+               [_yesno=no])
+       if test "x$_yesno" = xyes; then
+          CFLAGS="$CFLAGS $1"
+       fi
+       AC_MSG_RESULT($_yesno)
+])
+
+AC_CHECK_SIZEOF(long)
+
+# Checks for Sun Studio compilers
+AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
+AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
+
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS.
+if test $SUNCC = yes &&                        \
+   test "$test_CFLAGS" == "" &&                \
+   test "$CFLAGS" = "-g"
+then
+  CFLAGS="-O -g"
+fi
+
+# 
+# We ignore pixman_major in the version here because the major version should
+# always be encoded in the actual library name. I.e., the soname is:
+#
+#      pixman-$(pixman_major).0.minor.micro
+#
+m4_define([lt_current], [pixman_minor])
+m4_define([lt_revision], [pixman_micro])
+m4_define([lt_age], [pixman_minor])
+
+LT_VERSION_INFO="lt_current:lt_revision:lt_age"
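+
+# Worked example: for 0.23.7 the line above is 23:7:23; on ELF
+# platforms libtool then produces
+# libpixman-1.so.(current - age).(age).(revision), i.e.
+# libpixman-1.so.0.23.7 with soname libpixman-1.so.0.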
+
+PIXMAN_VERSION_MAJOR=pixman_major()
+AC_SUBST(PIXMAN_VERSION_MAJOR)
+PIXMAN_VERSION_MINOR=pixman_minor()
+AC_SUBST(PIXMAN_VERSION_MINOR)
+PIXMAN_VERSION_MICRO=pixman_micro()
+AC_SUBST(PIXMAN_VERSION_MICRO)
+
+AC_SUBST(LT_VERSION_INFO)
+
+# Check for dependencies
+
+PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
+
+AC_PATH_PROG(PERL, perl, no)
+if test "x$PERL" = xno; then
+    AC_MSG_ERROR([Perl is required to build pixman.])
+fi
+AC_SUBST(PERL)
+
+dnl =========================================================================
+dnl OpenMP for the test suite?
+dnl
+
+# Check for OpenMP support only when autoconf supports it (requires autoconf >= 2.62)
+OPENMP_CFLAGS=
+m4_ifdef([AC_OPENMP], [AC_OPENMP])
+
+if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
+  AC_MSG_WARN([OpenMP support requested but found unsupported])
+fi
+
+dnl The test may not fail to link unless -Wall -Werror is added,
+dnl so only try to link when OpenMP is supported
+dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
+if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
+  m4_define([openmp_test_program],[dnl
+  #include <stdio.h>
+
+  extern unsigned int lcg_seed;
+  #pragma omp threadprivate(lcg_seed)
+  unsigned int lcg_seed;
+
+  unsigned function(unsigned a, unsigned b)
+  {
+       lcg_seed ^= b;
+       return ((a + b) ^ a ) + lcg_seed;
+  }
+
+  int main(int argc, char **argv)
+  {
+       int i;
+       int n1 = 0, n2 = argc;
+       unsigned checksum = 0;
+       int verbose = argv != NULL;
+       unsigned (*test_function)(unsigned, unsigned);
+       test_function = function;
+       #pragma omp parallel for reduction(+:checksum) default(none) \
+                                       shared(n1, n2, test_function, verbose)
+       for (i = n1; i < n2; i++)
+       {
+               unsigned crc = test_function (i, 0);
+               if (verbose)
+                       printf ("%d: %08X\n", i, crc);
+               checksum += crc;
+       }
+       printf("%u\n", checksum);
+       return 0;
+  }
+  ])
+
+  PIXMAN_LINK_WITH_ENV(
+       [CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
+       [openmp_test_program],
+       [have_openmp=yes],
+       [have_openmp=no])
+  if test "x$have_openmp" = "xyes" ; then
+    AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
+  fi
+fi
+AC_SUBST(OPENMP_CFLAGS)
+
+dnl =========================================================================
+dnl -fvisibility stuff
+
+PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#ifdef _WIN32
+#error Have -fvisibility but it is ignored and generates a warning
+#endif
+#else
+error Need GCC 4.0 for visibility
+#endif
+])
+
+PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#else
+error Need Sun Studio 8 for visibility
+#endif
+])
+
+dnl ===========================================================================
+dnl Check for MMX
+
+if test "x$MMX_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
+      # but if we're building 64-bit, mmx & sse support is on by default and
+      # -xarch=sse throws an error instead
+      if test "$AMD64_ABI" = "no" ; then
+         MMX_CFLAGS="-xarch=sse"
+      fi
+   else
+      MMX_CFLAGS="-mmmx -Winline"
+   fi
+fi
+
+have_mmx_intrinsics=no
+AC_MSG_CHECKING(whether to use MMX intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$MMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for MMX intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+    __m64 v = _mm_cvtsi32_si64 (1);
+    return _mm_cvtsi64_si32 (v);
+}]])], have_mmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(mmx,
+   [AC_HELP_STRING([--disable-mmx],
+                   [disable x86 MMX fast paths])],
+   [enable_mmx=$enableval], [enable_mmx=auto])
+
+if test $enable_mmx = no ; then
+   have_mmx_intrinsics=disabled
+fi
+
+if test $have_mmx_intrinsics = yes ; then
+   AC_DEFINE(USE_X86_MMX, 1, [use x86 MMX compiler intrinsics])
+else
+   MMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_mmx_intrinsics)
+if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
+   AC_MSG_ERROR([x86 MMX intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_X86_MMX, test $have_mmx_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Check for SSE2
+
+if test "x$SSE2_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # SSE2 is enabled by default in the Sun Studio 64-bit environment
+      if test "$AMD64_ABI" = "no" ; then
+         SSE2_CFLAGS="-xarch=sse2"
+      fi
+   else
+      SSE2_CFLAGS="-msse2 -Winline"
+   fi
+fi
+
+have_sse2_intrinsics=no
+AC_MSG_CHECKING(whether to use SSE2 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSE2_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+#   if !defined(__amd64__) && !defined(__x86_64__)
+#      error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+#   endif
+#endif
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+       c = _mm_xor_si128 (a, b);
+    return 0;
+}]])], have_sse2_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(sse2,
+   [AC_HELP_STRING([--disable-sse2],
+                   [disable SSE2 fast paths])],
+   [enable_sse2=$enableval], [enable_sse2=auto])
+
+if test $enable_sse2 = no ; then
+   have_sse2_intrinsics=disabled
+fi
+
+if test $have_sse2_intrinsics = yes ; then
+   AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_sse2_intrinsics)
+if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
+   AC_MSG_ERROR([SSE2 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Other special flags needed when building code using MMX or SSE instructions
+case $host_os in
+   solaris*)
+      # When building 32-bit binaries, apply a mapfile to ensure that the
+      # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
+      # since they check at runtime before using those instructions.
+      # Not all linkers grok the mapfile format so we check for that first.
+      if test "$AMD64_ABI" = "no" ; then
+        use_hwcap_mapfile=no
+        AC_MSG_CHECKING(whether to use a hardware capability map file)
+        hwcap_save_LDFLAGS="$LDFLAGS"
+        HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+        LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+        AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
+                       use_hwcap_mapfile=yes,
+                       HWCAP_LDFLAGS="")
+        LDFLAGS="$hwcap_save_LDFLAGS"
+        AC_MSG_RESULT($use_hwcap_mapfile)
+      fi
+      if test "x$MMX_LDFLAGS" = "x" ; then
+         MMX_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      if test "x$SSE2_LDFLAGS" = "x" ; then
+        SSE2_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      ;;
+esac
+
+AC_SUBST(MMX_CFLAGS)
+AC_SUBST(MMX_LDFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE2_LDFLAGS)
+
+dnl ===========================================================================
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+    VMX_CFLAGS="-faltivec"
+else
+    VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$VMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned int v = vec_splat_u32 (1);
+    v = vec_sub (v, v);
+    return 0;
+}]])], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(vmx,
+   [AC_HELP_STRING([--disable-vmx],
+                   [disable VMX fast paths])],
+   [enable_vmx=$enableval], [enable_vmx=auto])
+
+if test $enable_vmx = no ; then
+   have_vmx_intrinsics=disabled
+fi
+
+if test $have_vmx_intrinsics = yes ; then
+   AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+   VMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_vmx_intrinsics)
+if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
+   AC_MSG_ERROR([VMX intrinsics not detected])
+fi
+
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM SIMD instructions
+have_arm_simd=no
+AC_MSG_CHECKING(whether to use ARM SIMD assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0]])], have_arm_simd=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-simd,
+   [AC_HELP_STRING([--disable-arm-simd],
+                   [disable ARM SIMD fast paths])],
+   [enable_arm_simd=$enableval], [enable_arm_simd=auto])
+
+if test $enable_arm_simd = no ; then
+   have_arm_simd=disabled
+fi
+
+if test $have_arm_simd = yes ; then
+   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
+AC_MSG_RESULT($have_arm_simd)
+if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
+   AC_MSG_ERROR([ARM SIMD intrinsics not detected])
+fi
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports NEON instructions
+have_arm_neon=no
+AC_MSG_CHECKING(whether to use ARM NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0]])], have_arm_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-neon,
+   [AC_HELP_STRING([--disable-arm-neon],
+                   [disable ARM NEON fast paths])],
+   [enable_arm_neon=$enableval], [enable_arm_neon=auto])
+
+if test $enable_arm_neon = no ; then
+   have_arm_neon=disabled
+fi
+
+if test $have_arm_neon = yes ; then
+   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
+
+AC_MSG_RESULT($have_arm_neon)
+if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+   AC_MSG_ERROR([ARM NEON intrinsics not detected])
+fi
+
+dnl ===========================================================================
+dnl Check for IWMMXT
+
+if test "x$IWMMXT_CFLAGS" = "x" ; then
+   IWMMXT_CFLAGS="-march=iwmmxt -flax-vector-conversions -Winline"
+fi
+
+have_iwmmxt_intrinsics=no
+AC_MSG_CHECKING(whether to use ARM IWMMXT intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$IWMMXT_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([
+#ifndef __arm__
+#error "IWMMXT is only available on ARM"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
+#error "Need GCC >= 4.6 for IWMMXT intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+       union {
+               __m64 v;
+               [char c[8];]
+       } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+       int b = 4;
+       __m64 c = _mm_srli_si64 (a.v, b);
+}], have_iwmmxt_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-iwmmxt,
+   [AC_HELP_STRING([--disable-arm-iwmmxt],
+                   [disable ARM IWMMXT fast paths])],
+   [enable_iwmmxt=$enableval], [enable_iwmmxt=auto])
+
+if test $enable_iwmmxt = no ; then
+   have_iwmmxt_intrinsics=disabled
+fi
+
+if test $have_iwmmxt_intrinsics = yes ; then
+   AC_DEFINE(USE_ARM_IWMMXT, 1, [use ARM IWMMXT compiler intrinsics])
+else
+   IWMMXT_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_iwmmxt_intrinsics)
+if test $enable_iwmmxt = yes && test $have_iwmmxt_intrinsics = no ; then
+   AC_MSG_ERROR([IWMMXT intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes)
+
+dnl =========================================================================================
+dnl Check for GNU-style inline assembly support
+
+have_gcc_inline_asm=no
+AC_MSG_CHECKING(whether to use GNU-style inline assembler)
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+int main () {
+    /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+       asm volatile ( "\tnop\n" : : : "cc", "memory" );
+    return 0;
+}]])], have_gcc_inline_asm=yes)
+
+AC_ARG_ENABLE(gcc-inline-asm,
+   [AC_HELP_STRING([--disable-gcc-inline-asm],
+                   [disable GNU-style inline assembler])],
+   [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
+
+if test $enable_gcc_inline_asm = no ; then
+   have_gcc_inline_asm=disabled
+fi
+
+if test $have_gcc_inline_asm = yes ; then
+   AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
+fi
+
+AC_MSG_RESULT($have_gcc_inline_asm)
+if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
+   AC_MSG_ERROR([GNU-style inline assembler not detected])
+fi
+
+AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
+
+dnl ==============================================
+dnl Static test programs
+
+AC_ARG_ENABLE(static-testprogs,
+   [AC_HELP_STRING([--enable-static-testprogs],
+                  [build test programs as static binaries [default=no]])],
+   [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
+
+TESTPROGS_EXTRA_LDFLAGS=
+if test "x$enable_static_testprogs" = "xyes" ; then
+   TESTPROGS_EXTRA_LDFLAGS="-all-static"
+fi
+AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
+
+dnl ==============================================
+dnl Timers
+
+AC_ARG_ENABLE(timers,
+   [AC_HELP_STRING([--enable-timers],
+                  [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
+   [enable_timers=$enableval], [enable_timers=no])
+
+if test $enable_timers = yes ; then 
+   AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
+fi
+AC_SUBST(PIXMAN_TIMERS)
+
+dnl ===================================
+dnl GTK+
+
+AC_ARG_ENABLE(gtk,
+   [AC_HELP_STRING([--enable-gtk],
+                   [enable tests using GTK+ [default=auto]])],
+   [enable_gtk=$enableval], [enable_gtk=auto])
+
+PKG_PROG_PKG_CONFIG
+
+if test $enable_gtk = yes ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string])
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1])
+fi
+
+if test $enable_gtk = auto ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
+fi
+
+if test $enable_gtk = auto ; then
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 pixman-1], [enable_gtk=yes], [enable_gtk=no])
+fi
+
+AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
+
+AC_SUBST(GTK_CFLAGS)
+AC_SUBST(GTK_LIBS)
+AC_SUBST(DEP_CFLAGS)
+AC_SUBST(DEP_LIBS)
+
+dnl =====================================
+dnl posix_memalign, sigaction, alarm, gettimeofday
+
+AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
+if test x$have_posix_memalign = xyes; then
+   AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
+fi
+
+AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
+if test x$have_sigaction = xyes; then
+   AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
+fi
+
+AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
+if test x$have_alarm = xyes; then
+   AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
+fi
+
+AC_CHECK_HEADER([sys/mman.h],
+   [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
+
+AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
+if test x$have_mmap = xyes; then
+   AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
+fi
+
+AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
+if test x$have_mprotect = xyes; then
+   AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
+fi
+
+AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
+if test x$have_getpagesize = xyes; then
+   AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
+fi
+
+AC_CHECK_HEADER([fenv.h],
+   [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
+
+AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
+if test x$have_feenableexcept = xyes; then
+   AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
+fi
+
+AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
+AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
+if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
+   AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
+fi
+
+dnl =====================================
+dnl Thread local storage
+
+support_for__thread=no
+
+AC_MSG_CHECKING(for __thread)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#error This MinGW version has broken __thread support
+#endif
+#ifdef __OpenBSD__
+#error OpenBSD has broken __thread support
+#endif
+static __thread int x ;
+int main () { x = 123; return x; }
+]])], support_for__thread=yes)
+
+if test $support_for__thread = yes; then 
+   AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
+fi
+
+AC_MSG_RESULT($support_for__thread)
+
+dnl
+dnl posix tls
+dnl
+
+m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+       value = NULL;
+    }
+    else
+    {
+       value = pthread_getspecific (key);
+       if (!value)
+       {
+           value = malloc (100);
+           pthread_setspecific (key, value);
+       }
+    }
+    return 0;
+}
+]]))
+
+AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
+    if test "z$support_for_pthread_setspecific" != "zyes"; then
+       PIXMAN_LINK_WITH_ENV(
+               [$1], [pthread_test_program],
+               [PTHREAD_CFLAGS="$CFLAGS"
+                PTHREAD_LIBS="$LIBS"
+                PTHREAD_LDFLAGS="$LDFLAGS"
+                support_for_pthread_setspecific=yes])
+    fi
+])
+
+if test $support_for__thread = no; then
+    support_for_pthread_setspecific=no
+
+    AC_MSG_CHECKING(for pthread_setspecific)
+
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+    
+    if test $support_for_pthread_setspecific = yes; then
+       CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+       AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+    fi
+
+    AC_MSG_RESULT($support_for_pthread_setspecific)
+fi
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(PTHREAD_LDFLAGS)
+AC_SUBST(PTHREAD_LIBS)
+
+dnl =====================================
+dnl __attribute__((constructor))
+
+support_for_attribute_constructor=no
+
+AC_MSG_CHECKING(for __attribute__((constructor)))
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* attribute 'constructor' is supported since gcc 2.7, but some compilers
+ * may only pretend to be gcc, so let's try to actually use it
+ */
+static int x = 1;
+static void __attribute__((constructor)) constructor_function () { x = 0; }
+int main (void) { return x; }
+#else
+#error not gcc or gcc version is older than 2.7
+#endif
+]])], support_for_attribute_constructor=yes)
+
+if test x$support_for_attribute_constructor = xyes; then
+   AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
+             [],[Whether the tool chain supports __attribute__((constructor))])
+fi
+
+AC_MSG_RESULT($support_for_attribute_constructor)
+AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
+
+dnl ==================
+dnl libpng
+
+PKG_CHECK_MODULES(PNG, [libpng], have_libpng=yes, have_libpng=no)
+
+if test x$have_libpng = xyes; then
+    AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
+fi
+
+AC_SUBST(HAVE_LIBPNG)
+
+AC_OUTPUT([pixman-1.pc
+           pixman-1-uninstalled.pc
+           Makefile
+          pixman/Makefile
+          pixman/pixman-version.h
+          demos/Makefile
+          test/Makefile])
+
+m4_if(m4_eval(pixman_minor % 2), [1], [
+   echo
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+   echo "      Thanks for testing this development snapshot of pixman. Please"
+   echo "      report any problems you find, either by sending email to "
+   echo
+   echo "          pixman@lists.freedesktop.org"
+   echo
+   echo "      or by filing a bug at "
+   echo
+   echo "          https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
+   echo
+   echo "      If you are looking for a stable release of pixman, please note "
+   echo "      that stable releases have _even_ minor version numbers. I.e., "
+   echo "      pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
+   echo "      development snapshot that may contain bugs and experimental "
+   echo "      features. "
+   echo 
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+])
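
The thread-local-storage checks above end up as TOOLCHAIN_SUPPORTS__THREAD
and HAVE_PTHREAD_SETSPECIFIC in config.h: compiler TLS is preferred, and the
POSIX-keys fallback is only probed when __thread is unusable. A minimal
sketch of the pattern these two results enable (tls_get/tls_set are
illustrative names only, not pixman API; the library's real macros live in
its own headers):

    #ifdef TOOLCHAIN_SUPPORTS__THREAD

    /* The compiler and linker handle per-thread storage directly. */
    static __thread void *tls_value;

    static void *tls_get (void)    { return tls_value; }
    static void  tls_set (void *v) { tls_value = v; }

    #elif defined (HAVE_PTHREAD_SETSPECIFIC)

    #include <pthread.h>

    /* Fall back to a POSIX key, created once per process. */
    static pthread_once_t tls_once = PTHREAD_ONCE_INIT;
    static pthread_key_t  tls_key;

    static void
    tls_make_key (void)
    {
        pthread_key_create (&tls_key, NULL);
    }

    static void *
    tls_get (void)
    {
        pthread_once (&tls_once, tls_make_key);
        return pthread_getspecific (tls_key);
    }

    static void
    tls_set (void *v)
    {
        pthread_once (&tls_once, tls_make_key);
        pthread_setspecific (tls_key, v);
    }

    #endif
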
diff --git a/debian/README.source b/debian/README.source
new file mode 100755 (executable)
index 0000000..e9078df
--- /dev/null
@@ -0,0 +1,3 @@
+This package uses quilt to manage modifications to the upstream source.  See
+/usr/share/doc/quilt/README.source for details.
+
diff --git a/debian/changelog b/debian/changelog
new file mode 100755 (executable)
index 0000000..7593eeb
--- /dev/null
@@ -0,0 +1,7 @@
+pixman (0.23.7-1slp2+1) unstable; urgency=low
+
+  * Initial version
+  * Git: pkgs/p/pixman
+  * Tag: pixman_0.23.7-1slp2+1
+
+ -- Seongwon Cho <seongwon1.cho@samsung.com>  Thu, 08 Dec 2011 13:45:27 +0900
diff --git a/debian/compat b/debian/compat
new file mode 100755 (executable)
index 0000000..7ed6ff8
--- /dev/null
@@ -0,0 +1 @@
+5
diff --git a/debian/control b/debian/control
new file mode 100755 (executable)
index 0000000..5169912
--- /dev/null
@@ -0,0 +1,48 @@
+Source: pixman
+Section: devel
+Priority: optional
+Maintainer: Debian X Strike Force <debian-x@lists.debian.org>, Seongwon Cho <seongwon1.cho@samsung.com> 
+Uploaders: Julien Cristau <jcristau@debian.org>, David Nusinow <dnusinow@debian.org>, Seongwon Cho <seongwon1.cho@samsung.com> 
+Build-Depends: debhelper (>= 5), automake, autoconf, libtool, pkg-config, quilt, libpng12-dev
+Standards-Version: 3.8.3
+Vcs-Git: git://git.debian.org/git/pkg-xorg/lib/pixman
+Vcs-Browser: http://git.debian.org/?p=pkg-xorg/lib/pixman.git
+
+Package: libpixman-1-0
+Section: libs
+Architecture: any
+Depends:  ${shlibs:Depends}, ${misc:Depends}, libpng12-0
+Description: pixel-manipulation library for X and cairo
+ A library for manipulating pixel regions -- a set of Y-X banded
+ rectangles -- for image compositing using the Porter/Duff model,
+ and for implicit mask generation for geometric primitives including
+ trapezoids, triangles, and rectangles.
+
+#Package: libpixman-1-0-udeb
+#Section: debian-installer
+#XC-Package-Type: udeb
+#Architecture: any
+#Depends:
+# ${shlibs:Depends},
+# ${misc:Depends},
+#Description: pixel-manipulation library for X and cairo
+# This package contains a minimal set of libraries needed for the Debian
+# installer.  Do not install it on a normal system.
+
+Package: libpixman-1-0-dbg
+Section: debug
+Priority: extra
+Architecture: any
+Depends: libpixman-1-0 (= ${binary:Version}), ${misc:Depends}
+Description: pixel-manipulation library for X and cairo (debugging symbols)
+ Debugging symbols for the Cairo/X pixel manipulation library.  This is
+ needed to debug programs linked against libpixman-1-0.
+
+Package: libpixman-1-dev
+Section: libdevel
+Architecture: any
+Depends: libpixman-1-0 (= ${binary:Version}), ${misc:Depends}, libpng12-dev
+Conflicts: libpixman1-dev
+Description: pixel-manipulation library for X and cairo (development files)
+ Development libraries, header files and documentation needed by
+ programs that want to compile with the Cairo/X pixman library.
diff --git a/debian/copyright b/debian/copyright
new file mode 100755 (executable)
index 0000000..93ed0b7
--- /dev/null
@@ -0,0 +1,114 @@
+This package was downloaded from
+http://xorg.freedesktop.org/releases/individual/lib/
+
+Debian packaging by Julien Cristau <jcristau@debian.org>, 18 May 2007.
+
+The following is the 'standard copyright' agreed upon by most contributors,
+and is currently the canonical license, though a modification is currently
+under discussion.  Copyright holders of new code should use this license
+statement where possible, and append their name to this list.  
+
+Copyright 1987, 1988, 1989, 1998  The Open Group
+Copyright 1987, 1988, 1989 Digital Equipment Corporation
+Copyright 1999, 2004, 2008 Keith Packard
+Copyright 2000 SuSE, Inc.
+Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
+Copyright 2004, 2005, 2007, 2008 Red Hat, Inc.
+Copyright 2004 Nicholas Miell
+Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
+Copyright 2005 Trolltech AS
+Copyright 2007 Luca Barbato
+Copyright 2008 Aaron Plattner, NVIDIA Corporation
+Copyright 2008 Rodrigo Kumpera
+Copyright 2008 André Tupinambá
+Copyright 2008 Mozilla Corporation
+Copyright 2008 Frederic Plourde
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+Other licenses:
+
+Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+            2005 Lars Knoll & Zack Rusin, Trolltech
+Copyright © 2000 SuSE, Inc.
+Copyright © 2007 Red Hat, Inc.
+Copyright © 1998 Keith Packard
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation, and that the name of the copyright holders not be used in
+advertising or publicity pertaining to distribution of the software without
+specific, written prior permission.  The copyright holders make no
+representations about the suitability of this software for any purpose.  It
+is provided "as is" without express or implied warranty.
+
+THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+
+Copyright 1987, 1988, 1989, 1998  The Open Group
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation.
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of The Open Group shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from The Open Group.
+
+Copyright 1987, 1988, 1989 by
+Digital Equipment Corporation, Maynard, Massachusetts.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Digital not be
+used in advertising or publicity pertaining to distribution of the
+software without specific, written prior permission.
+
+DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
diff --git a/debian/libpixman-1-0-udeb.install b/debian/libpixman-1-0-udeb.install
new file mode 100755 (executable)
index 0000000..44c3b82
--- /dev/null
@@ -0,0 +1 @@
+usr/lib/libpixman-1.so.*
diff --git a/debian/libpixman-1-0.install b/debian/libpixman-1-0.install
new file mode 100755 (executable)
index 0000000..44c3b82
--- /dev/null
@@ -0,0 +1 @@
+usr/lib/libpixman-1.so.*
diff --git a/debian/libpixman-1-0.symbols b/debian/libpixman-1-0.symbols
new file mode 100755 (executable)
index 0000000..db16c97
--- /dev/null
@@ -0,0 +1,117 @@
+libpixman-1.so.0 libpixman-1-0 #MINVER#
+ pixman_add_trapezoids@Base 0
+ pixman_add_traps@Base 0
+ pixman_blt@Base 0
+ pixman_compute_composite_region@Base 0
+ pixman_disable_out_of_bounds_workaround@Base 0.15.16
+ pixman_edge_init@Base 0
+ pixman_edge_step@Base 0
+ pixman_f_transform_bounds@Base 0.13.2
+ pixman_f_transform_from_pixman_transform@Base 0.13.2
+ pixman_f_transform_init_identity@Base 0.13.2
+ pixman_f_transform_init_rotate@Base 0.13.2
+ pixman_f_transform_init_scale@Base 0.13.2
+ pixman_f_transform_init_translate@Base 0.13.2
+ pixman_f_transform_invert@Base 0.13.2
+ pixman_f_transform_multiply@Base 0.13.2
+ pixman_f_transform_point@Base 0.13.2
+ pixman_f_transform_point_3d@Base 0.13.2
+ pixman_f_transform_rotate@Base 0.13.2
+ pixman_f_transform_scale@Base 0.13.2
+ pixman_f_transform_translate@Base 0.13.2
+ pixman_fill@Base 0
+ pixman_image_composite@Base 0.15.14
+ pixman_image_create_bits@Base 0.15.12
+ pixman_image_create_conical_gradient@Base 0
+ pixman_image_create_linear_gradient@Base 0
+ pixman_image_create_radial_gradient@Base 0
+ pixman_image_create_solid_fill@Base 0
+ pixman_image_fill_rectangles@Base 0.15.14
+ pixman_image_get_data@Base 0
+ pixman_image_get_depth@Base 0
+ pixman_image_get_height@Base 0
+ pixman_image_get_stride@Base 0
+ pixman_image_get_width@Base 0
+ pixman_image_ref@Base 0
+ pixman_image_set_accessors@Base 0
+ pixman_image_set_alpha_map@Base 0
+ pixman_image_set_clip_region32@Base 0.11.2
+ pixman_image_set_clip_region@Base 0
+ pixman_image_set_component_alpha@Base 0
+ pixman_image_set_destroy_function@Base 0.15.12
+ pixman_image_set_filter@Base 0
+ pixman_image_set_has_client_clip@Base 0
+ pixman_image_set_indexed@Base 0
+ pixman_image_set_repeat@Base 0
+ pixman_image_set_source_clipping@Base 0.9.4-2~
+ pixman_image_set_transform@Base 0
+ pixman_image_unref@Base 0
+ pixman_line_fixed_edge_init@Base 0
+ pixman_rasterize_edges@Base 0
+ pixman_rasterize_trapezoid@Base 0
+ pixman_region32_contains_point@Base 0.11.2
+ pixman_region32_contains_rectangle@Base 0.11.2
+ pixman_region32_copy@Base 0.11.2
+ pixman_region32_equal@Base 0.11.2
+ pixman_region32_extents@Base 0.11.2
+ pixman_region32_fini@Base 0.11.2
+ pixman_region32_init@Base 0.11.2
+ pixman_region32_init_rect@Base 0.11.2
+ pixman_region32_init_rects@Base 0.11.2
+ pixman_region32_init_with_extents@Base 0.11.2
+ pixman_region32_intersect@Base 0.11.2
+ pixman_region32_inverse@Base 0.11.2
+ pixman_region32_n_rects@Base 0.11.2
+ pixman_region32_not_empty@Base 0.11.2
+ pixman_region32_rectangles@Base 0.11.2
+ pixman_region32_reset@Base 0.11.2
+ pixman_region32_selfcheck@Base 0.11.2
+ pixman_region32_subtract@Base 0.11.2
+ pixman_region32_translate@Base 0.11.2
+ pixman_region32_union@Base 0.11.2
+ pixman_region32_union_rect@Base 0.11.2
+ pixman_region_contains_point@Base 0
+ pixman_region_contains_rectangle@Base 0
+ pixman_region_copy@Base 0
+ pixman_region_equal@Base 0
+ pixman_region_extents@Base 0
+ pixman_region_fini@Base 0
+ pixman_region_init@Base 0
+ pixman_region_init_rect@Base 0
+ pixman_region_init_rects@Base 0
+ pixman_region_init_with_extents@Base 0
+ pixman_region_intersect@Base 0
+ pixman_region_inverse@Base 0
+ pixman_region_n_rects@Base 0
+ pixman_region_not_empty@Base 0
+ pixman_region_rectangles@Base 0
+ pixman_region_reset@Base 0
+ pixman_region_selfcheck@Base 0
+ pixman_region_set_static_pointers@Base 0
+ pixman_region_subtract@Base 0
+ pixman_region_translate@Base 0
+ pixman_region_union@Base 0
+ pixman_region_union_rect@Base 0
+ pixman_sample_ceil_y@Base 0
+ pixman_sample_floor_y@Base 0
+ pixman_transform_bounds@Base 0.13.2
+ pixman_transform_from_pixman_f_transform@Base 0.13.2
+ pixman_transform_init_identity@Base 0.13.2
+ pixman_transform_init_rotate@Base 0.13.2
+ pixman_transform_init_scale@Base 0.13.2
+ pixman_transform_init_translate@Base 0.13.2
+ pixman_transform_invert@Base 0.13.2
+ pixman_transform_is_identity@Base 0.13.2
+ pixman_transform_is_int_translate@Base 0.13.2
+ pixman_transform_is_inverse@Base 0.13.2
+ pixman_transform_is_scale@Base 0.13.2
+ pixman_transform_multiply@Base 0.13.2
+ pixman_transform_point@Base 0.13.2
+ pixman_transform_rotate@Base 0.13.2
+ pixman_transform_scale@Base 0.13.2
+ pixman_transform_translate@Base 0.13.2
+ pixman_transform_point_3d@Base 0
+ pixman_version@Base 0.10.0
+ pixman_version_string@Base 0.10.0
+ pixman_format_supported_destination@Base 0.15.16
+ pixman_format_supported_source@Base 0.15.16
diff --git a/debian/libpixman-1-dev.install b/debian/libpixman-1-dev.install
new file mode 100755 (executable)
index 0000000..7f75e79
--- /dev/null
@@ -0,0 +1,5 @@
+usr/lib/libpixman-1.la
+usr/lib/libpixman-1.so
+usr/lib/libpixman-1.a
+usr/lib/pkgconfig
+usr/include/pixman-1
diff --git a/debian/rules b/debian/rules
new file mode 100755 (executable)
index 0000000..11a0b83
--- /dev/null
@@ -0,0 +1,110 @@
+#!/usr/bin/make -f
+
+#include /usr/share/quilt/quilt.make
+
+PACKAGE = libpixman-1-0
+SHLIBS_VERSION = 0.15.16
+
+CFLAGS = -Wall -g
+ifneq (,$(filter noopt,$(DEB_BUILD_OPTIONS)))
+       CFLAGS += -O0
+else
+       CFLAGS += -O2
+endif
+ifneq (,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))
+       NUMJOBS = $(patsubst parallel=%,%,$(filter parallel=%,$(DEB_BUILD_OPTIONS)))
+       MAKEFLAGS += -j$(NUMJOBS)
+endif
+
+DEB_HOST_ARCH      ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
+DEB_HOST_GNU_TYPE  ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+ifeq ($(DEB_BUILD_GNU_TYPE), $(DEB_HOST_GNU_TYPE))
+       confflags += --build=$(DEB_HOST_GNU_TYPE)
+else
+       confflags += --build=$(DEB_HOST_GNU_TYPE) --host=$(DEB_HOST_GNU_TYPE)
+#      confflags += --build=$(DEB_BUILD_GNU_TYPE) --host=$(DEB_HOST_GNU_TYPE)
+endif
+
+ifeq (armel, $(DEB_HOST_ARCH))
+       CFLAGS += -mfpu=vfp -mfloat-abi=softfp
+endif
+
+autogen: autogen-stamp
+autogen-stamp: $(QUILT_STAMPFN)
+       dh_testdir
+       autoreconf -vfi
+       touch $@
+
+config: config-stamp
+config-stamp: autogen-stamp
+       dh_testdir
+       test -d obj-$(DEB_BUILD_GNU_TYPE) || mkdir obj-$(DEB_BUILD_GNU_TYPE)
+       cd obj-$(DEB_BUILD_GNU_TYPE) && \
+       ../configure \
+         --prefix=/usr \
+         --mandir=\$${prefix}/share/man \
+         --infodir=\$${prefix}/share/info \
+         $(confflags) \
+         CFLAGS="$(CFLAGS)"
+       touch $@
+
+
+build: build-stamp
+build-stamp: config-stamp
+       dh_testdir
+       cd obj-$(DEB_BUILD_GNU_TYPE) && $(MAKE)
+       
+       touch $@
+
+clean: 
+       #unpatch
+       dh_testdir
+       dh_testroot
+       rm -f autogen-stamp config-stamp build-stamp install-stamp
+       
+       rm -f config.cache config.log config.status
+       rm -f */config.cache */config.log */config.status
+       rm -f conftest* */conftest*
+       rm -rf autom4te.cache */autom4te.cache
+       rm -rf obj-*
+       rm -f $$(find -name Makefile.in)
+       rm -f compile config.guess config.sub configure depcomp install-sh
+       rm -f ltmain.sh missing INSTALL aclocal.m4 config.h.in
+       
+       dh_clean
+
+install: install-stamp
+install-stamp: build-stamp
+       dh_testdir
+       dh_testroot
+       dh_clean -k
+       dh_installdirs
+
+       cd obj-$(DEB_BUILD_GNU_TYPE) && $(MAKE) DESTDIR=$(CURDIR)/debian/tmp install
+       touch $@
+
+# Install architecture-dependent files here.
+binary-arch: install
+       dh_testdir
+       dh_testroot
+
+       dh_installdocs
+       dh_install --sourcedir=debian/tmp --list-missing
+       dh_installchangelogs ChangeLog
+       dh_link
+       dh_strip --dbg-package=$(PACKAGE)-dbg
+       dh_compress
+       dh_fixperms
+       dh_makeshlibs -p$(PACKAGE) --add-udeb $(PACKAGE)-udeb -V"$(PACKAGE) (>= $(SHLIBS_VERSION))"
+       dh_installdeb
+       dh_shlibdeps
+       dh_gencontrol
+       dh_md5sums
+       dh_builddeb
+
+binary-indep: install
+# Nothing to do
+
+binary: binary-indep binary-arch
+.PHONY: autogen config build clean binary-indep binary-arch binary install
diff --git a/debian/watch b/debian/watch
new file mode 100755 (executable)
index 0000000..b83209f
--- /dev/null
@@ -0,0 +1,2 @@
+version=3
+http://xorg.freedesktop.org/releases/individual/lib/ pixman-(.*)\.tar\.gz
diff --git a/demos/Makefile.am b/demos/Makefile.am
new file mode 100644 (file)
index 0000000..070c2d7
--- /dev/null
@@ -0,0 +1,36 @@
+if HAVE_GTK
+
+AM_CFLAGS = $(OPENMP_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS)
+
+LDADD = $(top_builddir)/pixman/libpixman-1.la -lm $(GTK_LIBS) $(PNG_LIBS)
+INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) $(PNG_CFLAGS)
+
+GTK_UTILS = gtk-utils.c gtk-utils.h
+
+DEMOS =                                \
+       clip-test               \
+       clip-in                 \
+       composite-test          \
+       gradient-test           \
+       radial-test             \
+       alpha-test              \
+       screen-test             \
+       convolution-test        \
+       trap-test               \
+       tri-test
+
+gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
+alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
+composite_test_SOURCES = composite-test.c $(GTK_UTILS)
+clip_test_SOURCES = clip-test.c $(GTK_UTILS)
+clip_in_SOURCES = clip-in.c $(GTK_UTILS)
+trap_test_SOURCES = trap-test.c $(GTK_UTILS)
+screen_test_SOURCES = screen-test.c $(GTK_UTILS)
+convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
+radial_test_SOURCES = radial-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS)
+tri_test_SOURCES = tri-test.c ../test/utils.c ../test/utils.h $(GTK_UTILS)
+
+noinst_PROGRAMS = $(DEMOS)
+
+endif
diff --git a/demos/alpha-test.c b/demos/alpha-test.c
new file mode 100644 (file)
index 0000000..54e30fa
--- /dev/null
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *alpha = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *grad_img;
+    pixman_image_t *alpha_img;
+    pixman_image_t *dest_img;
+    pixman_image_t *src_img;
+    int i;
+    pixman_gradient_stop_t stops[2] =
+       {
+           { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } },
+           { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } }
+       };
+    pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
+    pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH),
+                               pixman_int_to_fixed (0) };
+#if 0
+    pixman_transform_t trans = {
+       { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+         { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+         { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+       }
+    };
+#else
+    pixman_transform_t trans = {
+       { { pixman_fixed_1, 0, 0 },
+         { 0, pixman_fixed_1, 0 },
+         { 0, 0, pixman_fixed_1 } }
+    };
+#endif
+
+#if 0
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+#endif
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       alpha[i] = 0x4f00004f; /* pale blue */
+    
+    alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                        WIDTH, HEIGHT, 
+                                         alpha,
+                                        WIDTH * 4);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       dest[i] = 0xffffff00;           /* yellow */
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                        WIDTH, HEIGHT, 
+                                        dest,
+                                        WIDTH * 4);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       src[i] = 0xffff0000;
+
+    src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                       WIDTH, HEIGHT,
+                                       src,
+                                       WIDTH * 4);
+    
+#if 0
+    c_inner.x = pixman_double_to_fixed (50.0);
+    c_inner.y = pixman_double_to_fixed (50.0);
+    c_outer.x = pixman_double_to_fixed (50.0);
+    c_outer.y = pixman_double_to_fixed (50.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+                                                   stops, 2);
+#endif
+#if 0
+    grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+                                                   stops, 2);
+    grad_img = pixman_image_create_linear_gradient (&c_inner, &c_outer,
+                                                  r_inner, r_outer,
+                                                  stops, 2);
+#endif
+    
+    grad_img = pixman_image_create_linear_gradient  (&p1, &p2,
+                                                   stops, 2);
+
+    pixman_image_set_transform (grad_img, &trans);
+    pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img,
+                           0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+
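+    /* From here on, src_img takes its alpha channel from alpha_img
+     * (positioned at offset 10, 10) instead of from its own pixels. */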
+    pixman_image_set_alpha_map (src_img, alpha_img, 10, 10);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+                           0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+    
+    printf ("0, 0: %x\n", dest[0]);
+    printf ("10, 10: %x\n", dest[10 * WIDTH + 10]);
+    printf ("w, h: %x\n", dest[(HEIGHT - 1) * WIDTH + (WIDTH - 1)]);
+    
+    show_image (dest_img);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (grad_img);
+    pixman_image_unref (alpha_img);
+    pixman_image_unref (dest_img);
+    free (alpha);
+    free (src);
+    free (dest);
+    
+    return 0;
+}
diff --git a/demos/clip-in.c b/demos/clip-in.c
new file mode 100644 (file)
index 0000000..5157981
--- /dev/null
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+/* This test demonstrates that clipping is done completely differently
+ * depending on whether the source is transformed or not.
+ */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define SMALL 25
+    
+    uint32_t *sbits = malloc (SMALL * SMALL * 4);
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    pixman_transform_t trans = {
+    {
+       { pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), },
+       { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), },
+       { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) }
+    } };
+         
+    pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL);
+    pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH);
+
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    memset (sbits, 0x00, SMALL * SMALL * 4);
+
+    pixman_image_composite (PIXMAN_OP_IN,
+                           src_img, NULL, dest_img,
+                           0, 0, 0, 0, SMALL, SMALL, 200, 200);
+    
+    pixman_image_set_transform (src_img, &trans);
+    
+    pixman_image_composite (PIXMAN_OP_IN,
+                           src_img, NULL, dest_img,
+                           0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    free (sbits);
+    
+    return 0;
+}
diff --git a/demos/clip-test.c b/demos/clip-test.c
new file mode 100644 (file)
index 0000000..aa0df44
--- /dev/null
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+#define WIDTH 200
+#define HEIGHT 200
+    
+static pixman_image_t *
+create_solid_bits (uint32_t pixel)
+{
+    uint32_t *pixels = malloc (WIDTH * HEIGHT * 4);
+    int i;
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       pixels[i] = pixel;
+
+    return pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                    WIDTH, HEIGHT, 
+                                    pixels,
+                                    WIDTH * 4);
+}
+
+int
+main (int argc, char **argv)
+{
+    pixman_image_t *gradient_img;
+    pixman_image_t *src_img, *dst_img;
+    pixman_gradient_stop_t stops[2] =
+       {
+           { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } },
+           { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } }
+       };
+#if 0
+    pixman_point_fixed_t p1 = { 0, 0 };
+    pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH),
+                               pixman_int_to_fixed (HEIGHT) };
+#endif
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+    pixman_region32_t clip_region;
+    pixman_transform_t trans = {
+       { { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), },
+         { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), },
+         { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } 
+       }
+    };
+    
+    src_img = create_solid_bits (0xff0000ff);
+    
+    c_inner.x = pixman_double_to_fixed (100.0);
+    c_inner.y = pixman_double_to_fixed (100.0);
+    c_outer.x = pixman_double_to_fixed (100.0);
+    c_outer.y = pixman_double_to_fixed (100.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (100.0);
+    
+    gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer,
+                                                       r_inner, r_outer,
+                                                       stops, 2);
+
+#if 0
+    gradient_img = pixman_image_create_linear_gradient  (&p1, &p2,
+                                                        stops, 2);
+    
+#endif
+
+    pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img,
+                           0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+    pixman_region32_init_rect (&clip_region, 50, 0, 100, 200);
+    pixman_image_set_clip_region32 (src_img, &clip_region);
+    pixman_image_set_source_clipping (src_img, TRUE);
+    pixman_image_set_has_client_clip (src_img, TRUE);
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    
+    dst_img = create_solid_bits (0xffff0000);
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img,
+                           0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+
+#if 0
+    printf ("0, 0: %x\n", src[0]);
+    printf ("10, 10: %x\n", src[10 * 10 + 10]);
+    printf ("w, h: %x\n", src[(HEIGHT - 1) * 100 + (WIDTH - 1)]);
+#endif
+    
+    show_image (dst_img);
+    
+    pixman_image_unref (gradient_img);
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    
+    return 0;
+}
diff --git a/demos/composite-test.c b/demos/composite-test.c
new file mode 100644 (file)
index 0000000..f5f352f
--- /dev/null
@@ -0,0 +1,191 @@
+#include <gtk/gtk.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+#define WIDTH  60
+#define HEIGHT 60
+
+typedef struct {
+    const char *name;
+    pixman_op_t op;
+} operator_t;
+
+static const operator_t operators[] = {
+    { "CLEAR",         PIXMAN_OP_CLEAR },
+    { "SRC",           PIXMAN_OP_SRC },
+    { "DST",           PIXMAN_OP_DST },
+    { "OVER",          PIXMAN_OP_OVER },
+    { "OVER_REVERSE",  PIXMAN_OP_OVER_REVERSE },
+    { "IN",            PIXMAN_OP_IN },
+    { "IN_REVERSE",    PIXMAN_OP_IN_REVERSE },
+    { "OUT",           PIXMAN_OP_OUT },
+    { "OUT_REVERSE",   PIXMAN_OP_OUT_REVERSE },
+    { "ATOP",          PIXMAN_OP_ATOP },
+    { "ATOP_REVERSE",  PIXMAN_OP_ATOP_REVERSE },
+    { "XOR",           PIXMAN_OP_XOR },
+    { "ADD",           PIXMAN_OP_ADD },
+    { "SATURATE",      PIXMAN_OP_SATURATE },
+
+    { "MULTIPLY",      PIXMAN_OP_MULTIPLY },
+    { "SCREEN",                PIXMAN_OP_SCREEN },
+    { "OVERLAY",       PIXMAN_OP_OVERLAY },
+    { "DARKEN",                PIXMAN_OP_DARKEN },
+    { "LIGHTEN",       PIXMAN_OP_LIGHTEN },
+    { "COLOR_DODGE",   PIXMAN_OP_COLOR_DODGE },
+    { "COLOR_BURN",    PIXMAN_OP_COLOR_BURN },
+    { "HARD_LIGHT",    PIXMAN_OP_HARD_LIGHT },
+    { "SOFT_LIGHT",    PIXMAN_OP_SOFT_LIGHT },
+    { "DIFFERENCE",    PIXMAN_OP_DIFFERENCE },
+    { "EXCLUSION",     PIXMAN_OP_EXCLUSION },
+    { "HSL_HUE",       PIXMAN_OP_HSL_HUE },
+    { "HSL_SATURATION",        PIXMAN_OP_HSL_SATURATION },
+    { "HSL_COLOR",     PIXMAN_OP_HSL_COLOR },
+    { "HSL_LUMINOSITY",        PIXMAN_OP_HSL_LUMINOSITY },
+};
+
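+/* Custom accessor callbacks, installed on the destination image below
+ * with pixman_image_set_accessors (); pixman then performs every read
+ * and write of that image's pixels through these functions. */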
+static uint32_t
+reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+       return *(uint8_t *)src;
+    case 2:
+       return *(uint16_t *)src;
+    case 4:
+       return *(uint32_t *)src;
+    default:
+       g_assert_not_reached ();
+       return 0; /* not reached; keeps compilers quiet about a missing return */
+    }
+}
+
+static void
+writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+       *(uint8_t *)src = value;
+       break;
+
+    case 2:
+       *(uint16_t *)src = value;
+       break;
+
+    case 4:
+       *(uint32_t *)src = value;
+       break;
+
+    default:
+        break;
+    }
+}
+
+int
+main (int argc, char **argv)
+{
+#define d2f pixman_double_to_fixed
+    
+    GtkWidget *window, *swindow;
+    GtkWidget *table;
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    pixman_point_fixed_t p1 = { -10 << 16, 0 }; /* 16.16 fixed point, matching p2 */
+    pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 };
+    uint16_t full = 0xcfff;
+    uint16_t low  = 0x5000;
+    uint16_t alpha = 0xffff;
+    pixman_gradient_stop_t stops[6] =
+    {
+       { d2f (0.0), { full, low, low, alpha } },
+       { d2f (0.25), { full, full, low, alpha } },
+       { d2f (0.4), { low, full, low, alpha } },
+       { d2f (0.6), { low, full, full, alpha } },
+       { d2f (0.8), { low, low, full, alpha } },
+       { d2f (1.0), { full, low, full, alpha } },
+    };
+
+    int i;
+
+    gtk_init (&argc, &argv);
+
+    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+
+    gtk_window_set_default_size (GTK_WINDOW (window), 800, 600);
+    
+    g_signal_connect (window, "delete-event",
+                     G_CALLBACK (gtk_main_quit),
+                     NULL);
+    table = gtk_table_new (G_N_ELEMENTS (operators) / 6, 6, TRUE);
+
+    src_img = pixman_image_create_linear_gradient (&p1, &p2, stops,
+                                                  sizeof (stops) / sizeof (stops[0]));
+
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD);
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                        WIDTH, HEIGHT,
+                                        dest,
+                                        WIDTH * 4);
+    pixman_image_set_accessors (dest_img, reader, writer);
+
+    for (i = 0; i < G_N_ELEMENTS (operators); ++i)
+    {
+       GtkWidget *image;
+       GdkPixbuf *pixbuf;
+       GtkWidget *vbox;
+       GtkWidget *label;
+       int j, k;
+
+       vbox = gtk_vbox_new (FALSE, 0);
+
+       label = gtk_label_new (operators[i].name);
+       gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6);
+       gtk_widget_show (label);
+
+       for (j = 0; j < HEIGHT; ++j)
+       {
+           for (k = 0; k < WIDTH; ++k)
+               dest[j * WIDTH + k] = 0x7f6f6f00;
+       }
+       pixman_image_composite (operators[i].op, src_img, NULL, dest_img,
+                               0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+       pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE,
+                                    WIDTH, HEIGHT, WIDTH * 4);
+       image = gtk_image_new_from_pixbuf (pixbuf);
+       gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0);
+       gtk_widget_show (image);
+
+       gtk_table_attach_defaults (GTK_TABLE (table), vbox,
+                                  i % 6, (i % 6) + 1, i / 6, (i / 6) + 1);
+       gtk_widget_show (vbox);
+
+       g_object_unref (pixbuf);
+    }
+
+    pixman_image_unref (src_img);
+    free (src);
+    pixman_image_unref (dest_img);
+    free (dest);
+
+    swindow = gtk_scrolled_window_new (NULL, NULL);
+    gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow),
+                                   GTK_POLICY_AUTOMATIC,
+                                   GTK_POLICY_AUTOMATIC);
+    
+    gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table);
+    gtk_widget_show (table);
+
+    gtk_container_add (GTK_CONTAINER (window), swindow);
+    gtk_widget_show (swindow);
+
+    gtk_widget_show (window);
+
+    gtk_main ();
+
+    return 0;
+}
diff --git a/demos/convolution-test.c b/demos/convolution-test.c
new file mode 100644 (file)
index 0000000..da284af
--- /dev/null
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define d2f pixman_double_to_fixed
+    
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mask = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_fixed_t convolution[] =
+    {
+       d2f (3), d2f (3),
+       d2f (0.5), d2f (0.5), d2f (0.5),
+       d2f (0.5), d2f (0.5), d2f (0.5),
+       d2f (0.5), d2f (0.5), d2f (0.5),
+    };
+    pixman_image_t *simg, *mimg, *dimg;
+
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+    {
+       src[i] = 0x7f007f00;
+       mask[i] = (i % 256) * 0x01000000;
+       dest[i] = 0;
+    }
+
+    simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4);
+    mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4);
+    dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+
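+    /* For PIXMAN_FILTER_CONVOLUTION, the parameter array holds the
+     * kernel width and height as fixed-point values, followed by
+     * width * height kernel entries: 11 = 2 + 3 * 3 values here. */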
+    pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION,
+                            convolution, 11);
+
+    pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    show_image (dimg);
+    
+    return 0;
+}
diff --git a/demos/gradient-test.c b/demos/gradient-test.c
new file mode 100644 (file)
index 0000000..20f78a6
--- /dev/null
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i;
+    pixman_gradient_stop_t stops[2] =
+       {
+           { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+           { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } }
+       };
+    pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
+    pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.),
+                               pixman_int_to_fixed (0) };
+#if 0
+    pixman_transform_t trans = {
+       { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+         { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+         { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+       }
+    };
+#else
+    pixman_transform_t trans = {
+       { { pixman_fixed_1, 0, 0 },
+         { 0, pixman_fixed_1, 0 },
+         { 0, 0, pixman_fixed_1 } }
+    };
+#endif
+
+#if 0
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+#endif
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       dest[i] = 0x4f00004f; /* pale blue */
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                        WIDTH, HEIGHT, 
+                                        dest,
+                                        WIDTH * 4);
+
+#if 0
+    c_inner.x = pixman_double_to_fixed (50.0);
+    c_inner.y = pixman_double_to_fixed (50.0);
+    c_outer.x = pixman_double_to_fixed (50.0);
+    c_outer.y = pixman_double_to_fixed (50.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    src_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+                                                   stops, 2);
+#endif
+#if 0
+    src_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+                                                   stops, 2);
+    src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer,
+                                                  r_inner, r_outer,
+                                                  stops, 2);
+#endif
+    
+    src_img = pixman_image_create_linear_gradient  (&p1, &p2,
+                                                   stops, 2);
+    
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+                           0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+    
+    printf ("0, 0: %x\n", dest[0]);
+    printf ("10, 10: %x\n", dest[10 * WIDTH + 10]);
+    printf ("w, h: %x\n", dest[(HEIGHT - 1) * WIDTH + (WIDTH - 1)]);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/demos/gtk-utils.c b/demos/gtk-utils.c
new file mode 100644 (file)
index 0000000..0e7cb5c
--- /dev/null
@@ -0,0 +1,115 @@
+#include <gtk/gtk.h>
+#include <config.h>
+#include "pixman-private.h"    /* For image->bits.format
+                                * FIXME: there should probably be public API for this
+                                */
+#include "gtk-utils.h"
+
+GdkPixbuf *
+pixbuf_from_argb32 (uint32_t *bits,
+                   gboolean has_alpha,
+                   int width,
+                   int height,
+                   int stride)
+{
+    GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE,
+                                       8, width, height);
+    int p_stride = gdk_pixbuf_get_rowstride (pixbuf);
+    guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf);
+    int w, h;
+    
+    for (h = 0; h < height; ++h)
+    {
+       for (w = 0; w < width; ++w)
+       {
+           uint32_t argb = bits[h * (stride / 4) + w];
+           guint r, g, b, a;
+           char *pb = (char *)p_bits;
+
+           pb += h * p_stride + w * 4;
+
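+           /* pixman's a8r8g8b8 stores premultiplied alpha, while
+            * GdkPixbuf expects straight (non-premultiplied) RGBA,
+            * hence the division by alpha below. */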
+           r = (argb & 0x00ff0000) >> 16;
+           g = (argb & 0x0000ff00) >> 8;
+           b = (argb & 0x000000ff) >> 0;
+           a = has_alpha? (argb & 0xff000000) >> 24 : 0xff;
+
+           if (a)
+           {
+               r = (r * 255) / a;
+               g = (g * 255) / a;
+               b = (b * 255) / a;
+           }
+
+           if (r > 255) r = 255;
+           if (g > 255) g = 255;
+           if (b > 255) b = 255;
+           
+           pb[0] = r;
+           pb[1] = g;
+           pb[2] = b;
+           pb[3] = a;
+       }
+    }
+    
+    return pixbuf;
+}
+
+
+static gboolean
+on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+{
+    GdkPixbuf *pixbuf = data;
+    
+    gdk_draw_pixbuf (widget->window, NULL,
+                    pixbuf, 0, 0, 0, 0,
+                    gdk_pixbuf_get_width (pixbuf),
+                    gdk_pixbuf_get_height (pixbuf),
+                    GDK_RGB_DITHER_NONE,
+                    0, 0);
+    
+    return TRUE;
+}
+
+void
+show_image (pixman_image_t *image)
+{
+    GtkWidget *window;
+    GdkPixbuf *pixbuf;
+    int width, height, stride;
+    int argc;
+    char **argv;
+    char *arg0 = g_strdup ("pixman-test-program");
+    gboolean has_alpha;
+    pixman_format_code_t format;
+
+    argc = 1;
+    argv = (char **)&arg0;
+
+    gtk_init (&argc, &argv);
+    
+    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image);
+
+    gtk_window_set_default_size (GTK_WINDOW (window), width, height);
+    
+    format = image->bits.format;
+    
+    if (format == PIXMAN_a8r8g8b8)
+       has_alpha = TRUE;
+    else if (format == PIXMAN_x8r8g8b8)
+       has_alpha = FALSE;
+    else
+       g_error ("Can't deal with this format: %x\n", format);
+    
+    pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha,
+                                width, height, stride);
+    
+    g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf);
+    g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+    
+    gtk_widget_show (window);
+    
+    gtk_main ();
+}
diff --git a/demos/gtk-utils.h b/demos/gtk-utils.h
new file mode 100644 (file)
index 0000000..2cb13bc
--- /dev/null
@@ -0,0 +1,13 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <glib.h>
+#include <gtk/gtk.h>
+#include "pixman.h"
+
+void show_image (pixman_image_t *image);
+
+GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits,
+                              gboolean has_alpha,
+                               int width,
+                               int height,
+                               int stride);
diff --git a/demos/radial-test.c b/demos/radial-test.c
new file mode 100644 (file)
index 0000000..35e90d7
--- /dev/null
@@ -0,0 +1,198 @@
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+#define NUM_GRADIENTS 7
+#define NUM_STOPS 3
+#define NUM_REPEAT 4
+#define SIZE 128
+#define WIDTH (SIZE * NUM_GRADIENTS)
+#define HEIGHT (SIZE * NUM_REPEAT)
+
+/*
+ * We want to test all the possible relative positions of the start
+ * and end circle:
+ *
+ *  - The start circle can be smaller/equal/bigger than the end
+ *    circle. A radial gradient can be classified in one of these
+ *    three cases depending on the sign of dr.
+ *
+ *  - The smaller circle can be completely inside/internally
+ *    tangent/outside (at least in part) of the bigger circle. This
+ *    classification is the same as the one which can be computed by
+ *    examining the sign of a = (dx^2 + dy^2 - dr^2).
+ *
+ *  - If the two circles have the same size, neither can be inside or
+ *    internally tangent
+ *
+ * This test draws radial gradients whose circles always have the same
+ * centers (0, 0) and (1, 0), but with different radiuses. From left
+ * to right:
+ *
+ * - Small start circle completely inside the end circle
+ *     0.25 -> 1.75; dr =  1.5 > 0; a = 1 - 1.50^2 < 0
+ *
+ * - Small start circle internally tangent to the end circle
+ *     0.50 -> 1.50; dr =  1.0 > 0; a = 1 - 1.00^2 = 0
+ *
+ * - Small start circle outside of the end circle
+ *     0.50 -> 1.00; dr =  0.5 > 0; a = 1 - 0.50^2 > 0
+ *
+ * - Start circle with the same size as the end circle
+ *     1.00 -> 1.00; dr =  0.0 = 0; a = 1 - 0.00^2 > 0
+ *
+ * - Small end circle outside of the start circle
+ *     1.00 -> 0.50; dr = -0.5 < 0; a = 1 - 0.50^2 > 0
+ *
+ * - Small end circle internally tangent to the start circle
+ *     1.50 -> 0.50; dr = -1.0 < 0; a = 1 - 1.00^2 = 0
+ *
+ * - Small end circle completely inside the start circle
+ *     1.75 -> 0.25; dr = -1.5 < 0; a = 1 - 1.50^2 < 0
+ *
+ */
+
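+/* Where a = dx^2 + dy^2 - dr^2 comes from: with start circle
+ * c1 = (x1, y1, r1) and end circle c2 = (x2, y2, r2), the gradient
+ * value at a point p is the parameter t for which p lies on the
+ * circle interpolated between them,
+ *
+ *     |p - (c1 + t * (c2 - c1))| = r1 + t * (r2 - r1)
+ *
+ * Squaring both sides gives a quadratic in t whose leading coefficient
+ * is a = dx^2 + dy^2 - dr^2, with (dx, dy) = c2 - c1 and dr = r2 - r1,
+ * which is why the sign of a (together with the sign of dr) classifies
+ * the configurations listed above.
+ */
+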
+static const double radiuses[NUM_GRADIENTS] = {
+    0.25,
+    0.50,
+    0.50,
+    1.00,
+    1.00,
+    1.50,
+    1.75
+};
+
+#define double_to_color(x)                                     \
+    (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16))
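+
+/* Maps a double in [0, 1] to a 16-bit channel value: the subtracted
+ * high word makes double_to_color (1.0) evaluate to 65536 - 1 = 0xffff
+ * instead of overflowing the 16-bit range. */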
+
+#define PIXMAN_STOP(offset,r,g,b,a)            \
+    { pixman_double_to_fixed (offset),         \
+       {                                       \
+       double_to_color (r),                    \
+       double_to_color (g),                    \
+       double_to_color (b),                    \
+       double_to_color (a)                     \
+       }                                       \
+    }
+
+static const pixman_gradient_stop_t stops[NUM_STOPS] = {
+    PIXMAN_STOP (0.0,        1, 0, 0, 0.75),
+    PIXMAN_STOP (0.70710678, 0, 1, 0, 0),
+    PIXMAN_STOP (1.0,        0, 0, 1, 1)
+};
+
+static pixman_image_t *
+create_radial (int index)
+{
+    pixman_point_fixed_t p0, p1;
+    pixman_fixed_t r0, r1;
+    double x0, x1, radius0, radius1, left, right, center;
+
+    x0 = 0;
+    x1 = 1;
+    radius0 = radiuses[index];
+    radius1 = radiuses[NUM_GRADIENTS - index - 1];
+
+    /* center the gradient */
+    left = MIN (x0 - radius0, x1 - radius1);
+    right = MAX (x0 + radius0, x1 + radius1);
+    center = (left + right) * 0.5;
+    x0 -= center;
+    x1 -= center;
+
+    /* scale to make it fit within a 1x1 rect centered in (0,0) */
+    x0 *= 0.25;
+    x1 *= 0.25;
+    radius0 *= 0.25;
+    radius1 *= 0.25;
+
+    p0.x = pixman_double_to_fixed (x0);
+    p0.y = pixman_double_to_fixed (0);
+
+    p1.x = pixman_double_to_fixed (x1);
+    p1.y = pixman_double_to_fixed (0);
+
+    r0 = pixman_double_to_fixed (radius0);
+    r1 = pixman_double_to_fixed (radius1);
+
+    return pixman_image_create_radial_gradient (&p0, &p1,
+                                               r0, r1,
+                                               stops, NUM_STOPS);
+}
+
+static const pixman_repeat_t repeat[NUM_REPEAT] = {
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+
+int
+main (int argc, char **argv)
+{
+    pixman_transform_t transform;
+    pixman_image_t *src_img, *dest_img;
+    int i, j;
+
+    enable_fp_exceptions ();
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                        WIDTH, HEIGHT,
+                                        NULL, 0);
+
+    pixman_transform_init_identity (&transform);
+
+    /*
+     * The create_radial() function returns gradients centered in the
+     * origin and whose interesting part fits a 1x1 square. We want to
+     * paint these gradients on a SIZExSIZE square and to make things
+     * easier we want the origin in the top-left corner of the square
+     * we want to see.
+     */
+    pixman_transform_translate (NULL, &transform,
+                               pixman_double_to_fixed (0.5),
+                               pixman_double_to_fixed (0.5));
+
+    pixman_transform_scale (NULL, &transform,
+                           pixman_double_to_fixed (SIZE),
+                           pixman_double_to_fixed (SIZE));
+
+    /*
+     * Gradients are evaluated at the center of each pixel, so we need
+     * to translate by half a pixel to trigger some interesting
+     * cornercases. In particular, the original implementation of PDF
+     * radial gradients tried to divide by 0 when using this transform
+     * on the "tangent circles" cases.
+     */
+    pixman_transform_translate (NULL, &transform,
+                               pixman_double_to_fixed (0.5),
+                               pixman_double_to_fixed (0.5));
+
+    for (i = 0; i < NUM_GRADIENTS; i++)
+    {
+       src_img = create_radial (i);
+       pixman_image_set_transform (src_img, &transform);
+
+       for (j = 0; j < NUM_REPEAT; j++)
+       {
+           pixman_image_set_repeat (src_img, repeat[j]);
+
+           pixman_image_composite32 (PIXMAN_OP_OVER,
+                                     src_img,
+                                     NULL,
+                                     dest_img,
+                                     0, 0,
+                                     0, 0,
+                                     i * SIZE, j * SIZE,
+                                     SIZE, SIZE);
+
+       }
+
+       pixman_image_unref (src_img);
+    }
+
+    show_image (dest_img);
+
+    pixman_image_unref (dest_img);
+
+    return 0;
+}
diff --git a/demos/screen-test.c b/demos/screen-test.c
new file mode 100644 (file)
index 0000000..e69dba3
--- /dev/null
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 40
+#define HEIGHT 40
+    
+    uint32_t *src1 = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src2 = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src3 = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4);
+    pixman_image_t *simg1, *simg2, *simg3, *dimg;
+
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+    {
+       src1[i] = 0x7ff00000;
+       src2[i] = 0x7f00ff00;
+       src3[i] = 0x7f0000ff;
+    }
+
+    for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i)
+    {
+       dest[i] = 0x0;
+    }
+
+    simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4);
+    simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4);
+    simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4);
+    dimg  = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 3 * WIDTH * 4);
+
+    pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT);
+    pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT);
+    pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT);
+
+    show_image (dimg);
+    
+    return 0;
+}
diff --git a/demos/trap-test.c b/demos/trap-test.c
new file mode 100644 (file)
index 0000000..19295e7
--- /dev/null
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+    pixman_image_t *src_img;
+    pixman_image_t *mask_img;
+    pixman_image_t *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t green = { 0x0000, 0xffff, 0x0000, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);
+
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    
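+    /* pixman_fixed_t is 16.16 fixed point, so the 0x8000 added below
+     * is 0.5: it puts the trapezoid edges on pixel centers. */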
+    trap.top.l = pixman_int_to_fixed (50) + 0x8000;
+    trap.top.r = pixman_int_to_fixed (150) + 0x8000;
+    trap.top.y = pixman_int_to_fixed (30);
+
+    trap.bot.l = pixman_int_to_fixed (50) + 0x8000;
+    trap.bot.r = pixman_int_to_fixed (150) + 0x8000;
+    trap.bot.y = pixman_int_to_fixed (150);
+
+    mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&green);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+
+    pixman_image_composite (PIXMAN_OP_OVER,
+                           src_img, mask_img, dest_img,
+                           0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (mask_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    free (mbits);
+
+    return 0;
+}
diff --git a/demos/tri-test.c b/demos/tri-test.c
new file mode 100644 (file)
index 0000000..a71869a
--- /dev/null
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define POINT(x,y)                                                     \
+    { pixman_double_to_fixed ((x)), pixman_double_to_fixed ((y)) }
+    
+    pixman_image_t *src_img, *dest_img;
+    pixman_triangle_t tris[4] =
+    {
+       { POINT (100, 100), POINT (10, 50), POINT (110, 10) },
+       { POINT (100, 100), POINT (150, 10), POINT (200, 50) },
+       { POINT (100, 100), POINT (10, 170), POINT (90, 175) },
+       { POINT (100, 100), POINT (170, 150), POINT (120, 190) },
+    };
+    pixman_color_t color = { 0x4444, 0x4444, 0xffff, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       bits[i] = (i / HEIGHT) * 0x01010000;
+    
+    src_img = pixman_image_create_solid_fill (&color);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
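+    /* The arguments after the mask format are x_src, y_src, x_dst, y_dst,
+     * followed by the triangle count and array.
+     */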
+    pixman_composite_triangles (PIXMAN_OP_ATOP_REVERSE,
+                               src_img,
+                               dest_img,
+                               PIXMAN_a8,
+                               200, 200,
+                               -5, 5,
+                               ARRAY_LENGTH (tris), tris);
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    
+    return 0;
+}
diff --git a/packaging/pixman.spec b/packaging/pixman.spec
new file mode 100644 (file)
index 0000000..b10c4dd
--- /dev/null
@@ -0,0 +1,62 @@
+
+Name:       pixman
+Summary:    Pixel manipulation library
+Version:    0.21.6
+Release:    1
+Group:      System/Libraries
+License:    MIT
+URL:        http://www.x.org/
+Source0:    http://xorg.freedesktop.org/archive/individual/lib/%{name}-%{version}.tar.gz
+Requires(post): /sbin/ldconfig
+Requires(postun): /sbin/ldconfig
+
+
+%description
+Pixman is a low-level pixel-manipulation library providing features such as
+image compositing and trapezoid rasterization.
+
+
+%package devel
+Summary:    Development components for the pixman library
+Group:      Development/Libraries
+Requires:   %{name} = %{version}-%{release}
+
+%description devel
+Headers and development libraries for the pixman library.
+
+
+%prep
+%setup -q -n %{name}-%{version}
+
+%build
+
+%reconfigure
+make %{?jobs:-j%jobs}
+
+%install
+rm -rf %{buildroot}
+%make_install
+
+%post -p /sbin/ldconfig
+
+%postun -p /sbin/ldconfig
+
+%files
+%defattr(-,root,root,-)
+%{_libdir}/libpixman-1*.so.*
+
+%files devel
+%defattr(-,root,root,-)
+%dir %{_includedir}/pixman-1
+%{_includedir}/pixman-1/pixman.h
+%{_includedir}/pixman-1/pixman-version.h
+%{_libdir}/libpixman-1*.so
+%{_libdir}/pkgconfig/pixman-1.pc
+
diff --git a/pixman-1-uninstalled.pc.in b/pixman-1-uninstalled.pc.in
new file mode 100644 (file)
index 0000000..e0347d0
--- /dev/null
@@ -0,0 +1,5 @@
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
+Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
diff --git a/pixman-1.pc.in b/pixman-1.pc.in
new file mode 100644 (file)
index 0000000..936d95d
--- /dev/null
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/pixman-1 @DEP_CFLAGS@
+Libs: -L${libdir} -lpixman-1 @DEP_LIBS@
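+# Typical client usage: cc demo.c $(pkg-config --cflags --libs pixman-1)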
+
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
new file mode 100644 (file)
index 0000000..286b7cf
--- /dev/null
@@ -0,0 +1,106 @@
+include $(top_srcdir)/pixman/Makefile.sources
+
+lib_LTLIBRARIES = libpixman-1.la
+
+libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
+libpixman_1_la_LIBADD = @PTHREAD_LIBS@ @DEP_LIBS@ -lm
+libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers)
+
+libpixmanincludedir = $(includedir)/pixman-1
+libpixmaninclude_HEADERS = pixman.h pixman-version.h
+noinst_LTLIBRARIES = 
+
+EXTRA_DIST =                           \
+       Makefile.win32                  \
+       make-combine.pl                 \
+       pixman-combine.c.template       \
+       pixman-combine.h.template       \
+       pixman-region.c                 \
+       solaris-hwcap.mapfile           \
+       $(NULL)
+
+DISTCLEANFILES = $(BUILT_SOURCES)
+
+# mmx code
+if USE_X86_MMX
+noinst_LTLIBRARIES += libpixman-mmx.la
+libpixman_mmx_la_SOURCES = \
+       pixman-mmx.c
+libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
+libpixman_mmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-mmx.la
+
+ASM_CFLAGS_mmx=$(MMX_CFLAGS)
+endif
+
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES += libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+       pixman-vmx.c \
+       pixman-combine32.h
+libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
+libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-vmx.la
+
+ASM_CFLAGS_vmx=$(VMX_CFLAGS)
+endif
+
+# sse2 code
+if USE_SSE2
+noinst_LTLIBRARIES += libpixman-sse2.la
+libpixman_sse2_la_SOURCES = \
+       pixman-sse2.c
+libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
+libpixman_sse2_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-sse2.la
+
+ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
+endif
+
+# arm simd code
+if USE_ARM_SIMD
+noinst_LTLIBRARIES += libpixman-arm-simd.la
+libpixman_arm_simd_la_SOURCES = \
+       pixman-arm-simd.c       \
+       pixman-arm-common.h     \
+       pixman-arm-simd-asm.S
+libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm-simd.la
+
+ASM_CFLAGS_arm_simd=
+endif
+
+# arm neon code
+if USE_ARM_NEON
+noinst_LTLIBRARIES += libpixman-arm-neon.la
+libpixman_arm_neon_la_SOURCES = \
+       pixman-arm-neon.c               \
+       pixman-arm-common.h             \
+       pixman-arm-neon-asm.S           \
+       pixman-arm-neon-asm-bilinear.S  \
+       pixman-arm-neon-asm.h
+libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
+libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LIBADD += libpixman-arm-neon.la
+
+ASM_CFLAGS_arm_neon=
+endif
+
+# iwmmxt code
+if USE_ARM_IWMMXT
+noinst_LTLIBRARIES += libpixman-iwmmxt.la
+libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
+libpixman_iwmmxt_la_CFLAGS = $(DEP_CFLAGS) $(IWMMXT_CFLAGS)
+libpixman_iwmmxt_la_LIBADD = $(DEP_LIBS)
+libpixman_1_la_LDFLAGS += $(IWMMXT_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-iwmmxt.la
+
+ASM_CFLAGS_IWMMXT=$(IWMMXT_CFLAGS)
+endif
+
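+# Emit an assembly listing for any source file, reusing the
+# per-implementation ASM_CFLAGS selected by the target name.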
+.c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
+       $(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/pixman/Makefile.sources b/pixman/Makefile.sources
new file mode 100644 (file)
index 0000000..ca3f001
--- /dev/null
@@ -0,0 +1,55 @@
+libpixman_sources =                    \
+       pixman.c                        \
+       pixman-access.c                 \
+       pixman-access-accessors.c       \
+       pixman-bits-image.c             \
+       pixman-combine32.c              \
+       pixman-combine64.c              \
+       pixman-conical-gradient.c       \
+       pixman-cpu.c                    \
+       pixman-edge.c                   \
+       pixman-edge-accessors.c         \
+       pixman-fast-path.c              \
+       pixman-general.c                \
+       pixman-gradient-walker.c        \
+       pixman-image.c                  \
+       pixman-implementation.c         \
+       pixman-linear-gradient.c        \
+       pixman-matrix.c                 \
+       pixman-noop.c                   \
+       pixman-radial-gradient.c        \
+       pixman-region16.c               \
+       pixman-region32.c               \
+       pixman-solid-fill.c             \
+       pixman-timer.c                  \
+       pixman-trap.c                   \
+       pixman-utils.c                  \
+       $(NULL)
+
+libpixman_headers =                    \
+       pixman.h                        \
+       pixman-accessor.h               \
+       pixman-combine32.h              \
+       pixman-combine64.h              \
+       pixman-compiler.h               \
+       pixman-edge-imp.h               \
+       pixman-inlines.h                \
+       pixman-private.h                \
+       $(NULL)
+
+BUILT_SOURCES =                                \
+       pixman-combine32.c              \
+       pixman-combine32.h              \
+       pixman-combine64.c              \
+       pixman-combine64.h              \
+       $(NULL)
+
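+# The combiner sources are generated from the shared templates; the numeric
+# argument is the per-component bit depth (8 gives the 32 bpp versions,
+# 16 the 64 bpp ones).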
+pixman-combine32.c: pixman-combine.c.template make-combine.pl
+       $(PERL) $(lastword $+) 8 < $< > $@ || ($(RM) $@; exit 1)
+pixman-combine32.h: pixman-combine.h.template make-combine.pl
+       $(PERL) $(lastword $+) 8 < $< > $@ || ($(RM) $@; exit 1)
+
+pixman-combine64.c: pixman-combine.c.template make-combine.pl
+       $(PERL) $(lastword $+) 16 < $< > $@ || ($(RM) $@; exit 1)
+pixman-combine64.h: pixman-combine.h.template make-combine.pl
+       $(PERL) $(lastword $+) 16 < $< > $@ || ($(RM) $@; exit 1)
diff --git a/pixman/Makefile.win32 b/pixman/Makefile.win32
new file mode 100644 (file)
index 0000000..381f2cd
--- /dev/null
@@ -0,0 +1,66 @@
+default: all
+
+top_srcdir = ..
+include $(top_srcdir)/pixman/Makefile.sources
+include $(top_srcdir)/Makefile.win32.common
+
+MMX_VAR = $(MMX)
+ifeq ($(MMX_VAR),)
+MMX_VAR=on
+endif
+
+SSE2_VAR = $(SSE2)
+ifeq ($(SSE2_VAR),)
+SSE2_VAR=on
+endif
+
+MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714
+SSE2_CFLAGS = -DUSE_SSE2
+
+# MMX compilation flags
+ifeq ($(MMX_VAR),on)
+PIXMAN_CFLAGS += $(MMX_CFLAGS)
+libpixman_sources += pixman-mmx.c
+endif
+
+# SSE2 compilation flags
+ifeq ($(SSE2_VAR),on)
+PIXMAN_CFLAGS += $(SSE2_CFLAGS)
+libpixman_sources += pixman-sse2.c
+endif
+
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources))
+
+# targets
+all: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib
+
+informMMX:
+ifneq ($(MMX),off)
+ifneq ($(MMX),on)
+ifneq ($(MMX),)
+       @echo "Invalid specified MMX option : "$(MMX_VAR)"."
+       @echo
+       @echo "Possible choices for MMX are 'on' or 'off'"
+       @exit 1
+endif
+       @echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
+endif
+endif
+
+informSSE2:
+ifneq ($(SSE2),off)
+ifneq ($(SSE2),on)
+ifneq ($(SSE2),)
+       @echo "Invalid specified SSE option : "$(SSE2)"."
+       @echo
+       @echo "Possible choices for SSE2 are 'on' or 'off'"
+       @exit 1
+endif
+       @echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
+endif
+endif
+
+
+# pixman linking
+$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
+       @$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
diff --git a/pixman/make-combine.pl b/pixman/make-combine.pl
new file mode 100644 (file)
index 0000000..210a5da
--- /dev/null
@@ -0,0 +1,86 @@
+$usage = "Usage: combine.pl { 8 | 16 } < pixman-combine.c.template";
+
+$#ARGV == 0 or die $usage;
+
+# Get the component size.
+$size = int($ARGV[0]);
+$size == 8 or $size == 16 or die $usage;
+
+$pixel_size = $size * 4;
+$half_pixel_size = $size * 2;
+
+sub mask {
+    my $str = shift;
+    my $suffix;
+    $suffix = "ULL" if $size > 8;
+
+    return "0x" . $str . $suffix;
+}
+
+# Generate mask strings.
+$nibbles = $size / 4;
+$mask = "f" x $nibbles;
+$zero_mask = "0" x $nibbles;
+$one_half = "8" . "0" x ($nibbles - 1);
+
+print "/* WARNING: This file is generated by combine.pl from combine.inc.\n";
+print "   Please edit one of those files rather than this one. */\n";
+print "\n";
+
+print "#line 1 \"pixman-combine.c.template\"\n";
+
+$mask_ = mask($mask);
+$one_half_ = mask($one_half);
+$g_mask = mask($mask . $zero_mask);
+$r_mask = mask($mask . $zero_mask x 2);
+$a_mask = mask($mask . $zero_mask x 3);
+$rb_mask = mask($mask . $zero_mask . $mask);
+$ag_mask = mask($mask . $zero_mask . $mask . $zero_mask);
+$rb_one_half = mask($one_half . $zero_mask . $one_half);
+$rb_mask_plus_one = mask("1" . $zero_mask x 2 . "1" .  $zero_mask);
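+# For size 8 these evaluate to e.g. MASK 0xff, RB_MASK 0xff00ff,
+# AG_MASK 0xff00ff00, RB_ONE_HALF 0x800080 and RB_MASK_PLUS_ONE 0x1000100.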
+
+while (<STDIN>) {
+    # Mask and 1/2 value for a single component.
+    s/#define COMPONENT_SIZE\b/$& $size/;
+    s/#define MASK\b/$& $mask_/;
+    s/#define ONE_HALF\b/$& $one_half_/;
+
+    # Shifts and masks for green, blue, and alpha.
+    s/#define G_SHIFT\b/$& $size/;
+    s/#define R_SHIFT\b/$& $size * 2/;
+    s/#define A_SHIFT\b/$& $size * 3/;
+    s/#define G_MASK\b/$& $g_mask/;
+    s/#define R_MASK\b/$& $r_mask/;
+    s/#define A_MASK\b/$& $a_mask/;
+
+    # Special values for dealing with red + blue at the same time.
+    s/#define RB_MASK\b/$& $rb_mask/;
+    s/#define AG_MASK\b/$& $ag_mask/;
+    s/#define RB_ONE_HALF\b/$& $rb_one_half/;
+    s/#define RB_MASK_PLUS_ONE\b/$& $rb_mask_plus_one/;
+
+    # Add 32/64 suffix to combining function types.
+    s/\bCombineFunc\b/CombineFunc$pixel_size/;
+    s/\bFbComposeFunctions\b/FbComposeFunctions$pixel_size/;
+    s/combine_width/combine_$pixel_size/;
+    s/_pixman_setup_combiner_functions_width/_pixman_setup_combiner_functions_$pixel_size/;
+    s/UNc/UN$size/g;
+    s/ALPHA_c/ALPHA_$size/g;
+    s/RED_c/RED_$size/g;
+    s/GREEN_c/GREEN_$size/g;
+    s/BLUE_c/BLUE_$size/g;
+
+    # Convert comp*_t values into the appropriate real types.
+    s/comp1_t/uint${size}_t/g;
+    s/comp2_t/uint${half_pixel_size}_t/g;
+    s/comp4_t/uint${pixel_size}_t/g;
+
+    # Change the function table name for the 64-bit version.
+    s/pixman_composeFunctions/pixman_composeFunctions64/ if $size == 16;
+
+    # Change the header for the 64-bit version
+    s/pixman-combine.h/pixman-combine64.h/ if $size == 16;
+    s/pixman-combine.h/pixman-combine32.h/ if $size == 8;
+
+    print;
+}
diff --git a/pixman/pixman-access-accessors.c b/pixman/pixman-access-accessors.c
new file mode 100644 (file)
index 0000000..3263582
--- /dev/null
@@ -0,0 +1,3 @@
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-access.c"
diff --git a/pixman/pixman-access.c b/pixman/pixman-access.c
new file mode 100644 (file)
index 0000000..189b191
--- /dev/null
@@ -0,0 +1,1226 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
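+/* Approximate luminance: Y = (153 R + 301 G + 58 B) / 512, i.e. roughly
+ * 0.299 R + 0.587 G + 0.114 B, scaled to 15 bits.
+ */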
+#define CONVERT_RGB24_TO_Y15(s)                                                \
+    (((((s) >> 16) & 0xff) * 153 +                                     \
+      (((s) >>  8) & 0xff) * 301 +                                     \
+      (((s)      ) & 0xff) * 58) >> 2)
+
+#define CONVERT_RGB24_TO_RGB15(s)                                       \
+    ((((s) >> 3) & 0x001f) |                                            \
+     (((s) >> 6) & 0x03e0) |                                            \
+     (((s) >> 9) & 0x7c00))
+
+#define RGB15_TO_ENTRY(mif,rgb15)                                      \
+    ((mif)->ent[rgb15])
+
+#define RGB24_TO_ENTRY(mif,rgb24)                                      \
+    RGB15_TO_ENTRY (mif,CONVERT_RGB24_TO_RGB15 (rgb24))
+
+#define RGB24_TO_ENTRY_Y(mif,rgb24)                                    \
+    ((mif)->ent[CONVERT_RGB24_TO_Y15 (rgb24)])
+
+/* Fetch macros */
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_1(img,l,o)                                               \
+    (((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> (0x1f - ((o) & 0x1f))) & 0x1)
+#else
+#define FETCH_1(img,l,o)                                               \
+    ((((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> ((o) & 0x1f))) & 0x1)
+#endif
+
+#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3))))
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_4(img,l,o)                                               \
+    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
+#else
+#define FETCH_4(img,l,o)                                               \
+    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_24(img,l,o)                                              \
+    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16)    |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
+#else
+#define FETCH_24(img,l,o)                                              \
+    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0)     |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
+#endif
+
+/* Store macros */
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_1(img,l,o,v)                                             \
+    do                                                                 \
+    {                                                                  \
+       uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);                \
+       uint32_t __m, __v;                                              \
+                                                                       \
+       __m = 1 << (0x1f - ((o) & 0x1f));                               \
+       __v = (v)? __m : 0;                                             \
+                                                                       \
+       WRITE((img), __d, (READ((img), __d) & ~__m) | __v);             \
+    }                                                                  \
+    while (0)
+#else
+#define STORE_1(img,l,o,v)                                             \
+    do                                                                 \
+    {                                                                  \
+       uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);                \
+       uint32_t __m, __v;                                              \
+                                                                       \
+       __m = 1 << ((o) & 0x1f);                                        \
+       __v = (v)? __m : 0;                                             \
+                                                                       \
+       WRITE((img), __d, (READ((img), __d) & ~__m) | __v);             \
+    }                                                                  \
+    while (0)
+#endif
+
+#define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_4(img,l,o,v)                                             \
+    do                                                                 \
+    {                                                                  \
+       int bo = 4 * (o);                                               \
+       int v4 = (v) & 0x0f;                                            \
+                                                                       \
+       STORE_8 (img, l, bo, (                                          \
+                    bo & 4 ?                                           \
+                    (FETCH_8 (img, l, bo) & 0xf0) | (v4) :             \
+                    (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));       \
+    } while (0)
+#else
+#define STORE_4(img,l,o,v)                                             \
+    do                                                                 \
+    {                                                                  \
+       int bo = 4 * (o);                                               \
+       int v4 = (v) & 0x0f;                                            \
+                                                                       \
+       STORE_8 (img, l, bo, (                                          \
+                    bo & 4 ?                                           \
+                    (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :        \
+                    (FETCH_8 (img, l, bo) & 0xf0) | (v4)));            \
+    } while (0)
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_24(img,l,o,v)                                            \
+    do                                                                 \
+    {                                                                  \
+       uint8_t *__tmp = (l) + 3 * (o);                                \
+                                                                      \
+       WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);              \
+       WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);              \
+       WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);              \
+    }                                                                  \
+    while (0)
+#else
+#define STORE_24(img,l,o,v)                                            \
+    do                                                                 \
+    {                                                                  \
+       uint8_t *__tmp = (l) + 3 * (o);                                \
+                                                                      \
+       WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);              \
+       WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);              \
+       WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);              \
+    }                                                                 \
+    while (0)
+#endif
+
+/*
+ * YV12 setup and access macros
+ */
+
+#define YV12_SETUP(image)                                               \
+    bits_image_t *__bits_image = (bits_image_t *)image;                 \
+    uint32_t *bits = __bits_image->bits;                                \
+    int stride = __bits_image->rowstride;                               \
+    int offset0 = stride < 0 ?                                          \
+    ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride :    \
+    stride * __bits_image->height;                                     \
+    int offset1 = stride < 0 ?                                          \
+    offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) :       \
+       offset0 + (offset0 >> 2)
+
+/* Note that there is no trailing semicolon on the macro above.  If there
+ * were, the typical usage YV12_SETUP (image); would end in an extra ;
+ * that some compilers parse as a null statement, and any variable
+ * declarations that follow would then be an error.
+ */
+
+#define YV12_Y(line)                                                    \
+    ((uint8_t *) ((bits) + (stride) * (line)))
+
+#define YV12_U(line)                                                    \
+    ((uint8_t *) ((bits) + offset1 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+#define YV12_V(line)                                                    \
+    ((uint8_t *) ((bits) + offset0 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+/* Misc. helpers */
+
+static force_inline void
+get_shifts (pixman_format_code_t  format,
+           int                  *a,
+           int                  *r,
+           int                  *g,
+           int                  *b)
+{
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_A:
+       *b = 0;
+       *g = 0;
+       *r = 0;
+       *a = 0;
+       break;
+
+    case PIXMAN_TYPE_ARGB:
+       *b = 0;
+       *g = *b + PIXMAN_FORMAT_B (format);
+       *r = *g + PIXMAN_FORMAT_G (format);
+       *a = *r + PIXMAN_FORMAT_R (format);
+       break;
+
+    case PIXMAN_TYPE_ABGR:
+       *r = 0;
+       *g = *r + PIXMAN_FORMAT_R (format);
+       *b = *g + PIXMAN_FORMAT_G (format);
+       *a = *b + PIXMAN_FORMAT_B (format);
+       break;
+
+    case PIXMAN_TYPE_BGRA:
+       /* With BGRA formats we start counting at the high end of the pixel */
+       *b = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_B (format);
+       *g = *b - PIXMAN_FORMAT_B (format);
+       *r = *g - PIXMAN_FORMAT_G (format);
+       *a = *r - PIXMAN_FORMAT_R (format);
+       break;
+
+    case PIXMAN_TYPE_RGBA:
+       /* With RGBA formats we start counting at the high end of the pixel */
+       *r = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_R (format);
+       *g = *r - PIXMAN_FORMAT_R (format);
+       *b = *g - PIXMAN_FORMAT_G (format);
+       *a = *b - PIXMAN_FORMAT_B (format);
+       break;
+
+    default:
+       assert (0);
+       break;
+    }
+}
+
+static force_inline uint32_t
+convert_channel (uint32_t pixel, uint32_t def_value,
+                int n_from_bits, int from_shift,
+                int n_to_bits, int to_shift)
+{
+    uint32_t v;
+
+    if (n_from_bits && n_to_bits)
+       v  = unorm_to_unorm (pixel >> from_shift, n_from_bits, n_to_bits);
+    else if (n_to_bits)
+       v = def_value;
+    else
+       v = 0;
+
+    return (v & ((1 << n_to_bits) - 1)) << to_shift;
+}
+
+static force_inline uint32_t
+convert_pixel (pixman_format_code_t from, pixman_format_code_t to, uint32_t pixel)
+{
+    int a_from_shift, r_from_shift, g_from_shift, b_from_shift;
+    int a_to_shift, r_to_shift, g_to_shift, b_to_shift;
+    uint32_t a, r, g, b;
+
+    get_shifts (from, &a_from_shift, &r_from_shift, &g_from_shift, &b_from_shift);
+    get_shifts (to, &a_to_shift, &r_to_shift, &g_to_shift, &b_to_shift);
+
+    a = convert_channel (pixel, ~0,
+                        PIXMAN_FORMAT_A (from), a_from_shift,
+                        PIXMAN_FORMAT_A (to), a_to_shift);
+
+    r = convert_channel (pixel, 0,
+                        PIXMAN_FORMAT_R (from), r_from_shift,
+                        PIXMAN_FORMAT_R (to), r_to_shift);
+
+    g = convert_channel (pixel, 0,
+                        PIXMAN_FORMAT_G (from), g_from_shift,
+                        PIXMAN_FORMAT_G (to), g_to_shift);
+
+    b = convert_channel (pixel, 0,
+                        PIXMAN_FORMAT_B (from), b_from_shift,
+                        PIXMAN_FORMAT_B (to), b_to_shift);
+
+    return a | r | g | b;
+}
+
+static force_inline uint32_t
+convert_pixel_to_a8r8g8b8 (pixman_image_t *image,
+                          pixman_format_code_t format,
+                          uint32_t pixel)
+{
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY                ||
+       PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+       return image->bits.indexed->rgba[pixel];
+    }
+    else
+    {
+       return convert_pixel (format, PIXMAN_a8r8g8b8, pixel);
+    }
+}
+
+static force_inline uint32_t
+convert_pixel_from_a8r8g8b8 (pixman_image_t *image,
+                            pixman_format_code_t format, uint32_t pixel)
+{
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+       pixel = CONVERT_RGB24_TO_Y15 (pixel);
+
+       return image->bits.indexed->ent[pixel & 0x7fff];
+    }
+    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+       pixel = convert_pixel (PIXMAN_a8r8g8b8, PIXMAN_x1r5g5b5, pixel);
+
+       return image->bits.indexed->ent[pixel & 0x7fff];
+    }
+    else
+    {
+       return convert_pixel (PIXMAN_a8r8g8b8, format, pixel);
+    }
+}
+
+static force_inline uint32_t
+fetch_and_convert_pixel (pixman_image_t        *       image,
+                        const uint8_t *        bits,
+                        int                    offset,
+                        pixman_format_code_t   format)
+{
+    uint32_t pixel;
+
+    switch (PIXMAN_FORMAT_BPP (format))
+    {
+    case 1:
+       pixel = FETCH_1 (image, bits, offset);
+       break;
+
+    case 4:
+       pixel = FETCH_4 (image, bits, offset);
+       break;
+
+    case 8:
+       pixel = READ (image, bits + offset);
+       break;
+
+    case 16:
+       pixel = READ (image, ((uint16_t *)bits + offset));
+       break;
+
+    case 24:
+       pixel = FETCH_24 (image, bits, offset);
+       break;
+
+    case 32:
+       pixel = READ (image, ((uint32_t *)bits + offset));
+       break;
+
+    default:
+       pixel = 0xffff00ff; /* As ugly as possible, so bugs are easy to spot */
+       break;
+    }
+
+    return convert_pixel_to_a8r8g8b8 (image, format, pixel);
+}
+
+static force_inline void
+convert_and_store_pixel (bits_image_t *                image,
+                        uint8_t *              dest,
+                        int                    offset,
+                        pixman_format_code_t   format,
+                        uint32_t               pixel)
+{
+    uint32_t converted = convert_pixel_from_a8r8g8b8 (
+       (pixman_image_t *)image, format, pixel);
+
+    switch (PIXMAN_FORMAT_BPP (format))
+    {
+    case 1:
+       STORE_1 (image, dest, offset, converted & 0x01);
+       break;
+
+    case 4:
+       STORE_4 (image, dest, offset, converted & 0xf);
+       break;
+
+    case 8:
+       WRITE (image, (dest + offset), converted & 0xff);
+       break;
+
+    case 16:
+       WRITE (image, ((uint16_t *)dest + offset), converted & 0xffff);
+       break;
+
+    case 24:
+       STORE_24 (image, dest, offset, converted);
+       break;
+
+    case 32:
+       WRITE (image, ((uint32_t *)dest + offset), converted);
+       break;
+
+    default:
+       *dest = 0x0;
+       break;
+    }
+}
+
+#define MAKE_ACCESSORS(format)                                         \
+    static void                                                                \
+    fetch_scanline_ ## format (pixman_image_t *image,                  \
+                              int             x,                       \
+                              int             y,                       \
+                              int             width,                   \
+                              uint32_t *      buffer,                  \
+                              const uint32_t *mask)                    \
+    {                                                                  \
+       uint8_t *bits =                                                 \
+           (uint8_t *)(image->bits.bits + y * image->bits.rowstride);  \
+       int i;                                                          \
+                                                                       \
+       for (i = 0; i < width; ++i)                                     \
+       {                                                               \
+           *buffer++ =                                                 \
+               fetch_and_convert_pixel (image, bits, x + i, PIXMAN_ ## format); \
+       }                                                               \
+    }                                                                  \
+                                                                       \
+    static void                                                                \
+    store_scanline_ ## format (bits_image_t *  image,                  \
+                              int             x,                       \
+                              int             y,                       \
+                              int             width,                   \
+                              const uint32_t *values)                  \
+    {                                                                  \
+       uint8_t *dest =                                                 \
+           (uint8_t *)(image->bits + y * image->rowstride);            \
+       int i;                                                          \
+                                                                       \
+       for (i = 0; i < width; ++i)                                     \
+       {                                                               \
+           convert_and_store_pixel (                                   \
+               image, dest, i + x, PIXMAN_ ## format, values[i]);      \
+       }                                                               \
+    }                                                                  \
+                                                                       \
+    static uint32_t                                                    \
+    fetch_pixel_ ## format (bits_image_t *image,                       \
+                           int         offset,                         \
+                           int         line)                           \
+    {                                                                  \
+       uint8_t *bits =                                                 \
+           (uint8_t *)(image->bits + line * image->rowstride);         \
+                                                                       \
+       return fetch_and_convert_pixel ((pixman_image_t *)image,        \
+                                       bits, offset, PIXMAN_ ## format); \
+    }                                                                  \
+                                                                       \
+    static const void *const __dummy__ ## format
+
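+/* The trailing __dummy__ declaration above exists only so that each
+ * MAKE_ACCESSORS (...) use below can end with a semicolon.
+ */
+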
+MAKE_ACCESSORS(a8r8g8b8);
+MAKE_ACCESSORS(x8r8g8b8);
+MAKE_ACCESSORS(a8b8g8r8);
+MAKE_ACCESSORS(x8b8g8r8);
+MAKE_ACCESSORS(x14r6g6b6);
+MAKE_ACCESSORS(b8g8r8a8);
+MAKE_ACCESSORS(b8g8r8x8);
+MAKE_ACCESSORS(r8g8b8x8);
+MAKE_ACCESSORS(r8g8b8a8);
+MAKE_ACCESSORS(r8g8b8);
+MAKE_ACCESSORS(b8g8r8);
+MAKE_ACCESSORS(r5g6b5);
+MAKE_ACCESSORS(b5g6r5);
+MAKE_ACCESSORS(a1r5g5b5);
+MAKE_ACCESSORS(x1r5g5b5);
+MAKE_ACCESSORS(a1b5g5r5);
+MAKE_ACCESSORS(x1b5g5r5);
+MAKE_ACCESSORS(a4r4g4b4);
+MAKE_ACCESSORS(x4r4g4b4);
+MAKE_ACCESSORS(a4b4g4r4);
+MAKE_ACCESSORS(x4b4g4r4);
+MAKE_ACCESSORS(a8);
+MAKE_ACCESSORS(c8);
+MAKE_ACCESSORS(g8);
+MAKE_ACCESSORS(r3g3b2);
+MAKE_ACCESSORS(b2g3r3);
+MAKE_ACCESSORS(a2r2g2b2);
+MAKE_ACCESSORS(a2b2g2r2);
+MAKE_ACCESSORS(x4a4);
+MAKE_ACCESSORS(a4);
+MAKE_ACCESSORS(g4);
+MAKE_ACCESSORS(c4);
+MAKE_ACCESSORS(r1g2b1);
+MAKE_ACCESSORS(b1g2r1);
+MAKE_ACCESSORS(a1r1g1b1);
+MAKE_ACCESSORS(a1b1g1r1);
+MAKE_ACCESSORS(a1);
+MAKE_ACCESSORS(g1);
+
+/********************************** Fetch ************************************/
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_a2r10g10b10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+
+    while (pixel < end)
+    {
+       uint32_t p = READ (image, pixel++);
+       uint64_t a = p >> 30;
+       uint64_t r = (p >> 20) & 0x3ff;
+       uint64_t g = (p >> 10) & 0x3ff;
+       uint64_t b = p & 0x3ff;
+
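+       /* Widen the 10-bit channels to 16 bits by bit replication */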
+       r = r << 6 | r >> 4;
+       g = g << 6 | g >> 4;
+       b = b << 6 | b >> 4;
+
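+       /* Replicate the 2-bit alpha value across all 16 bits */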
+       a <<= 14;
+       a |= a >> 2;
+       a |= a >> 4;
+       a |= a >> 8;
+
+       *buffer++ = a << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_x2r10g10b10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+    
+    while (pixel < end)
+    {
+       uint32_t p = READ (image, pixel++);
+       uint64_t r = (p >> 20) & 0x3ff;
+       uint64_t g = (p >> 10) & 0x3ff;
+       uint64_t b = p & 0x3ff;
+       
+       r = r << 6 | r >> 4;
+       g = g << 6 | g >> 4;
+       b = b << 6 | b >> 4;
+       
+       *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_a2b10g10r10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+    
+    while (pixel < end)
+    {
+       uint32_t p = READ (image, pixel++);
+       uint64_t a = p >> 30;
+       uint64_t b = (p >> 20) & 0x3ff;
+       uint64_t g = (p >> 10) & 0x3ff;
+       uint64_t r = p & 0x3ff;
+       
+       r = r << 6 | r >> 4;
+       g = g << 6 | g >> 4;
+       b = b << 6 | b >> 4;
+       
+       a <<= 14;
+       a |= a >> 2;
+       a |= a >> 4;
+       a |= a >> 8;
+
+       *buffer++ = a << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+/* Expects a uint64_t buffer */
+static void
+fetch_scanline_x2b10g10r10 (pixman_image_t *image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            uint32_t *      b,
+                            const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    uint64_t *buffer = (uint64_t *)b;
+    
+    while (pixel < end)
+    {
+       uint32_t p = READ (image, pixel++);
+       uint64_t b = (p >> 20) & 0x3ff;
+       uint64_t g = (p >> 10) & 0x3ff;
+       uint64_t r = p & 0x3ff;
+       
+       r = r << 6 | r >> 4;
+       g = g << 6 | g >> 4;
+       b = b << 6 | b >> 4;
+       
+       *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b;
+    }
+}
+
+static void
+fetch_scanline_yuy2 (pixman_image_t *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits.bits + image->bits.rowstride * line;
+    int i;
+    
+    for (i = 0; i < width; i++)
+    {
+       int16_t y, u, v;
+       int32_t r, g, b;
+       
+       y = ((uint8_t *) bits)[(x + i) << 1] - 16;
+       u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128;
+       v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128;
+       
+       /* R = 1.164(Y - 16) + 1.596(V - 128) */
+       r = 0x012b27 * y + 0x019a2e * v;
+       /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+       g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+       /* B = 1.164(Y - 16) + 2.018(U - 128) */
+       b = 0x012b27 * y + 0x0206a2 * u;
+       
+       *buffer++ = 0xff000000 |
+           (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+           (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+           (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+static void
+fetch_scanline_yv12 (pixman_image_t *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask)
+{
+    YV12_SETUP (image);
+    uint8_t *y_line = YV12_Y (line);
+    uint8_t *u_line = YV12_U (line);
+    uint8_t *v_line = YV12_V (line);
+    int i;
+    
+    for (i = 0; i < width; i++)
+    {
+       int16_t y, u, v;
+       int32_t r, g, b;
+
+       y = y_line[x + i] - 16;
+       u = u_line[(x + i) >> 1] - 128;
+       v = v_line[(x + i) >> 1] - 128;
+
+       /* R = 1.164(Y - 16) + 1.596(V - 128) */
+       r = 0x012b27 * y + 0x019a2e * v;
+       /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+       g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+       /* B = 1.164(Y - 16) + 2.018(U - 128) */
+       b = 0x012b27 * y + 0x0206a2 * u;
+
+       *buffer++ = 0xff000000 |
+           (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+           (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+           (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+/**************************** Pixel wise fetching *****************************/
+
+/* Despite the type, expects a uint64_t buffer */
+static uint64_t
+fetch_pixel_a2r10g10b10 (bits_image_t *image,
+                        int              offset,
+                        int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t a = p >> 30;
+    uint64_t r = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+
+    a <<= 14;
+    a |= a >> 2;
+    a |= a >> 4;
+    a |= a >> 8;
+
+    return a << 48 | r << 32 | g << 16 | b;
+}
+
+/* Despite the type, this function expects a uint64_t buffer */
+static uint64_t
+fetch_pixel_x2r10g10b10 (bits_image_t *image,
+                        int       offset,
+                        int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t r = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+    
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    
+    return 0xffffULL << 48 | r << 32 | g << 16 | b;
+}
+
+/* Despite the type, expects a uint64_t buffer */
+static uint64_t
+fetch_pixel_a2b10g10r10 (bits_image_t *image,
+                        int           offset,
+                        int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t a = p >> 30;
+    uint64_t b = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    
+    a <<= 14;
+    a |= a >> 2;
+    a |= a >> 4;
+    a |= a >> 8;
+    
+    return a << 48 | r << 32 | g << 16 | b;
+}
+
+/* Despite the type, this function expects a uint64_t buffer */
+static uint64_t
+fetch_pixel_x2b10g10r10 (bits_image_t *image,
+                        int           offset,
+                        int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t b = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    
+    r = r << 6 | r >> 4;
+    g = g << 6 | g >> 4;
+    b = b << 6 | b >> 4;
+    
+    return 0xffffULL << 48 | r << 32 | g << 16 | b;
+}
+
+static uint32_t
+fetch_pixel_yuy2 (bits_image_t *image,
+                 int           offset,
+                 int           line)
+{
+    const uint32_t *bits = image->bits + image->rowstride * line;
+    
+    int16_t y, u, v;
+    int32_t r, g, b;
+    
+    y = ((uint8_t *) bits)[offset << 1] - 16;
+    u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128;
+    v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128;
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    
+    return 0xff000000 |
+       (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+       (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+       (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+static uint32_t
+fetch_pixel_yv12 (bits_image_t *image,
+                 int           offset,
+                 int           line)
+{
+    YV12_SETUP (image);
+    int16_t y = YV12_Y (line)[offset] - 16;
+    int16_t u = YV12_U (line)[offset >> 1] - 128;
+    int16_t v = YV12_V (line)[offset >> 1] - 128;
+    int32_t r, g, b;
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    
+    return 0xff000000 |
+       (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+       (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+       (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+/*********************************** Store ************************************/
+
+static void
+store_scanline_a2r10g10b10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    uint64_t *values = (uint64_t *)v;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+       WRITE (image, pixel++,
+              ((values[i] >> 32) & 0xc0000000) |
+              ((values[i] >> 18) & 0x3ff00000) |
+              ((values[i] >> 12) & 0xffc00) | 
+              ((values[i] >> 6) & 0x3ff));    
+    }
+}
+
+static void
+store_scanline_x2r10g10b10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint64_t *values = (uint64_t *)v;
+    uint32_t *pixel = bits + x;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+       WRITE (image, pixel++,
+              ((values[i] >> 18) & 0x3ff00000) | 
+              ((values[i] >> 12) & 0xffc00) |
+              ((values[i] >> 6) & 0x3ff));
+    }
+}
+
+static void
+store_scanline_a2b10g10r10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    uint64_t *values = (uint64_t *)v;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+       WRITE (image, pixel++,
+              ((values[i] >> 32) & 0xc0000000) |
+              ((values[i] >> 38) & 0x3ff) |
+              ((values[i] >> 12) & 0xffc00) |
+              ((values[i] << 14) & 0x3ff00000));
+    }
+}
+
+static void
+store_scanline_x2b10g10r10 (bits_image_t *  image,
+                            int             x,
+                            int             y,
+                            int             width,
+                            const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint64_t *values = (uint64_t *)v;
+    uint32_t *pixel = bits + x;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+       WRITE (image, pixel++,
+              ((values[i] >> 38) & 0x3ff) |
+              ((values[i] >> 12) & 0xffc00) |
+              ((values[i] << 14) & 0x3ff00000));
+    }
+}
+
+/*
+ * Contracts a 64bpp image to 32bpp and then stores it using a regular 32-bit
+ * store proc. Despite the type, this function expects a uint64_t buffer.
+ */
+static void
+store_scanline_generic_64 (bits_image_t *  image,
+                           int             x,
+                           int             y,
+                           int             width,
+                           const uint32_t *values)
+{
+    uint32_t *argb8_pixels;
+    
+    assert (image->common.type == BITS);
+    
+    argb8_pixels = pixman_malloc_ab (width, sizeof(uint32_t));
+    if (!argb8_pixels)
+       return;
+    
+    /* Contract the scanline.  We could do this in place if values weren't
+     * const.
+     */
+    pixman_contract (argb8_pixels, (uint64_t *)values, width);
+    
+    image->store_scanline_32 (image, x, y, width, argb8_pixels);
+    
+    free (argb8_pixels);
+}
+
+/* Despite the type, this function expects both buffer
+ * and mask to be uint64_t
+ */
+static void
+fetch_scanline_generic_64 (pixman_image_t *image,
+                           int             x,
+                           int             y,
+                           int             width,
+                           uint32_t *      buffer,
+                           const uint32_t *mask)
+{
+    pixman_format_code_t format;
+
+    /* Fetch the pixels into the first half of buffer and then expand them in
+     * place.
+     */
+    image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL);
+
+    format = image->bits.format;
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR       ||
+       PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+       /* Indexed formats are mapped to a8r8g8b8 with full
+        * precision, so when expanding we shouldn't correct
+        * for the width of the channels
+        */
+
+       format = PIXMAN_a8r8g8b8;
+    }
+
+    pixman_expand ((uint64_t *)buffer, buffer, format, width);
+}
+
+/* Despite the type, this function expects a uint64_t *buffer */
+static uint64_t
+fetch_pixel_generic_64 (bits_image_t *image,
+                       int           offset,
+                       int           line)
+{
+    uint32_t pixel32 = image->fetch_pixel_32 (image, offset, line);
+    uint64_t result;
+    pixman_format_code_t format;
+
+    format = image->format;
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR       ||
+       PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+       /* Indexed formats are mapped to a8r8g8b8 with full
+        * precision, so when expanding we shouldn't correct
+        * for the width of the channels
+        */
+
+       format = PIXMAN_a8r8g8b8;
+    }
+
+    pixman_expand ((uint64_t *)&result, &pixel32, format, 1);
+
+    return result;
+}
+
+/*
+ * XXX: The transformed fetch path only works at 32-bpp so far.  When all
+ * paths have wide versions, this can be removed.
+ *
+ * WARNING: This function loses precision!
+ */
+static uint32_t
+fetch_pixel_generic_lossy_32 (bits_image_t *image,
+                             int           offset,
+                             int           line)
+{
+    uint64_t pixel64 = image->fetch_pixel_64 (image, offset, line);
+    uint32_t result;
+
+    pixman_contract (&result, &pixel64, 1);
+
+    return result;
+}
+
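+/* Per-format accessor table.  The wide (10-bit-per-channel) formats only
+ * have native 64-bit paths, so their 32-bit pixel fetches go through the
+ * lossy wrapper above.
+ */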
+typedef struct
+{
+    pixman_format_code_t       format;
+    fetch_scanline_t           fetch_scanline_32;
+    fetch_scanline_t           fetch_scanline_64;
+    fetch_pixel_32_t           fetch_pixel_32;
+    fetch_pixel_64_t           fetch_pixel_64;
+    store_scanline_t           store_scanline_32;
+    store_scanline_t           store_scanline_64;
+} format_info_t;
+
+#define FORMAT_INFO(format)                                            \
+    {                                                                  \
+       PIXMAN_ ## format,                                              \
+           fetch_scanline_ ## format,                                  \
+           fetch_scanline_generic_64,                                  \
+           fetch_pixel_ ## format, fetch_pixel_generic_64,             \
+           store_scanline_ ## format, store_scanline_generic_64        \
+    }
+
+static const format_info_t accessors[] =
+{
+/* 32 bpp formats */
+    FORMAT_INFO (a8r8g8b8),
+    FORMAT_INFO (x8r8g8b8),
+    FORMAT_INFO (a8b8g8r8),
+    FORMAT_INFO (x8b8g8r8),
+    FORMAT_INFO (b8g8r8a8),
+    FORMAT_INFO (b8g8r8x8),
+    FORMAT_INFO (r8g8b8a8),
+    FORMAT_INFO (r8g8b8x8),
+    FORMAT_INFO (x14r6g6b6),
+
+/* 24bpp formats */
+    FORMAT_INFO (r8g8b8),
+    FORMAT_INFO (b8g8r8),
+    
+/* 16bpp formats */
+    FORMAT_INFO (r5g6b5),
+    FORMAT_INFO (b5g6r5),
+    
+    FORMAT_INFO (a1r5g5b5),
+    FORMAT_INFO (x1r5g5b5),
+    FORMAT_INFO (a1b5g5r5),
+    FORMAT_INFO (x1b5g5r5),
+    FORMAT_INFO (a4r4g4b4),
+    FORMAT_INFO (x4r4g4b4),
+    FORMAT_INFO (a4b4g4r4),
+    FORMAT_INFO (x4b4g4r4),
+    
+/* 8bpp formats */
+    FORMAT_INFO (a8),
+    FORMAT_INFO (r3g3b2),
+    FORMAT_INFO (b2g3r3),
+    FORMAT_INFO (a2r2g2b2),
+    FORMAT_INFO (a2b2g2r2),
+    
+    FORMAT_INFO (c8),
+    
+    FORMAT_INFO (g8),
+    
+#define fetch_scanline_x4c4 fetch_scanline_c8
+#define fetch_pixel_x4c4 fetch_pixel_c8
+#define store_scanline_x4c4 store_scanline_c8
+    FORMAT_INFO (x4c4),
+    
+#define fetch_scanline_x4g4 fetch_scanline_g8
+#define fetch_pixel_x4g4 fetch_pixel_g8
+#define store_scanline_x4g4 store_scanline_g8
+    FORMAT_INFO (x4g4),
+    
+    FORMAT_INFO (x4a4),
+    
+/* 4bpp formats */
+    FORMAT_INFO (a4),
+    FORMAT_INFO (r1g2b1),
+    FORMAT_INFO (b1g2r1),
+    FORMAT_INFO (a1r1g1b1),
+    FORMAT_INFO (a1b1g1r1),
+    
+    FORMAT_INFO (c4),
+    
+    FORMAT_INFO (g4),
+    
+/* 1bpp formats */
+    FORMAT_INFO (a1),
+    FORMAT_INFO (g1),
+    
+/* Wide formats */
+    
+    { PIXMAN_a2r10g10b10,
+      NULL, fetch_scanline_a2r10g10b10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10,
+      NULL, store_scanline_a2r10g10b10 },
+    
+    { PIXMAN_x2r10g10b10,
+      NULL, fetch_scanline_x2r10g10b10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10,
+      NULL, store_scanline_x2r10g10b10 },
+    
+    { PIXMAN_a2b10g10r10,
+      NULL, fetch_scanline_a2b10g10r10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10,
+      NULL, store_scanline_a2b10g10r10 },
+    
+    { PIXMAN_x2b10g10r10,
+      NULL, fetch_scanline_x2b10g10r10,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10,
+      NULL, store_scanline_x2b10g10r10 },
+    
+/* YUV formats */
+    { PIXMAN_yuy2,
+      fetch_scanline_yuy2, fetch_scanline_generic_64,
+      fetch_pixel_yuy2, fetch_pixel_generic_64,
+      NULL, NULL },
+    
+    { PIXMAN_yv12,
+      fetch_scanline_yv12, fetch_scanline_generic_64,
+      fetch_pixel_yv12, fetch_pixel_generic_64,
+      NULL, NULL },
+    
+    { PIXMAN_null },
+};
+
+static void
+setup_accessors (bits_image_t *image)
+{
+    const format_info_t *info = accessors;
+    
+    while (info->format != PIXMAN_null)
+    {
+       if (info->format == image->format)
+       {
+           image->fetch_scanline_32 = info->fetch_scanline_32;
+           image->fetch_scanline_64 = info->fetch_scanline_64;
+           image->fetch_pixel_32 = info->fetch_pixel_32;
+           image->fetch_pixel_64 = info->fetch_pixel_64;
+           image->store_scanline_32 = info->store_scanline_32;
+           image->store_scanline_64 = info->store_scanline_64;
+           
+           return;
+       }
+       
+       info++;
+    }
+}
+
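+/* This translation unit is compiled a second time, through
+ * pixman-access-accessors.c with PIXMAN_FB_ACCESSORS defined, to
+ * provide the "_accessors" variant that goes through the image's
+ * read_func/write_func hooks (see pixman-accessor.h).
+ */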
+#ifndef PIXMAN_FB_ACCESSORS
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image)
+{
+    if (image->read_func || image->write_func)
+       _pixman_bits_image_setup_accessors_accessors (image);
+    else
+       setup_accessors (image);
+}
+
+#else
+
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image)
+{
+    setup_accessors (image);
+}
+
+#endif
diff --git a/pixman/pixman-accessor.h b/pixman/pixman-accessor.h
new file mode 100644 (file)
index 0000000..90c8ea7
--- /dev/null
@@ -0,0 +1,40 @@
+#ifdef PIXMAN_FB_ACCESSORS
+
+#define ACCESS(sym) sym##_accessors
+
+#define READ(img, ptr)                                                 \
+    (((bits_image_t *)(img))->read_func ((ptr), sizeof(*(ptr))))
+#define WRITE(img, ptr,val)                                            \
+    (((bits_image_t *)(img))->write_func ((ptr), (val), sizeof (*(ptr))))
+
+#define MEMCPY_WRAPPED(img, dst, src, size)                            \
+    do {                                                               \
+       size_t _i;                                                      \
+       uint8_t *_dst = (uint8_t*)(dst), *_src = (uint8_t*)(src);       \
+       for(_i = 0; _i < size; _i++) {                                  \
+           WRITE((img), _dst +_i, READ((img), _src + _i));             \
+       }                                                               \
+    } while (0)
+
+#define MEMSET_WRAPPED(img, dst, val, size)                            \
+    do {                                                               \
+       size_t _i;                                                      \
+       uint8_t *_dst = (uint8_t*)(dst);                                \
+       for(_i = 0; _i < (size_t) size; _i++) {                         \
+           WRITE((img), _dst +_i, (val));                              \
+       }                                                               \
+    } while (0)
+
+#else
+
+#define ACCESS(sym) sym
+
+#define READ(img, ptr)         (*(ptr))
+#define WRITE(img, ptr, val)   (*(ptr) = (val))
+#define MEMCPY_WRAPPED(img, dst, src, size)                            \
+    memcpy(dst, src, size)
+#define MEMSET_WRAPPED(img, dst, val, size)                            \
+    memset(dst, val, size)
+
+#endif
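+
+/* Usage sketch (illustrative only, not a function from this header):
+ * a fetcher built with PIXMAN_FB_ACCESSORS goes through the image's
+ * read/write hooks, while the plain build dereferences memory directly:
+ *
+ *    uint32_t *pixel = bits + y * rowstride + x;
+ *    uint32_t  p     = READ (image, pixel);
+ *    WRITE (image, pixel, p | 0xff000000);
+ */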
+
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
new file mode 100644 (file)
index 0000000..f56264e
--- /dev/null
@@ -0,0 +1,416 @@
+/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+#ifndef PIXMAN_ARM_COMMON_H
+#define PIXMAN_ARM_COMMON_H
+
+#include "pixman-inlines.h"
+
+/* Define some macros which can expand into proxy functions between
+ * the ARM-assembly-optimized functions and the rest of the pixman
+ * fast path API.
+ *
+ * All the low level ARM assembly functions have to use the ARM EABI
+ * calling convention and take up to 8 arguments:
+ *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride
+ *
+ * The arguments are ordered with the most important coming first (the
+ * first 4 arguments are passed to the function in registers, the rest
+ * are on the stack). The last arguments are optional; for example, if
+ * the function is not using a mask, then 'mask' and 'mask_stride' can
+ * be omitted when doing a function call.
+ *
+ * The 'src' and 'mask' arguments contain either a pointer to the top
+ * left pixel of the composited rectangle or a pixel color value,
+ * depending on the function type. In the case of just a color value
+ * (solid source or mask), the corresponding stride argument is unused.
+ */
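+
+/* For example, pixman-arm-neon.c binds one of its copy routines with
+ *
+ *    PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+ *                                       uint32_t, 1, uint32_t, 1)
+ *
+ * which declares pixman_composite_src_8888_8888_asm_neon () and
+ * generates the neon_composite_src_8888_8888 () wrapper used in the
+ * fast path tables.
+ */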
+
+#define SKIP_ZERO_SRC  1
+#define SKIP_ZERO_MASK 2
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \
+                                          src_type, src_cnt,            \
+                                          dst_type, dst_cnt)            \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t   w,                   \
+                                         int32_t   h,                   \
+                                         dst_type *dst,                 \
+                                         int32_t   dst_stride,          \
+                                         src_type *src,                 \
+                                         int32_t   src_stride);         \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type *dst_line;                                                        \
+    src_type *src_line;                                                 \
+    int32_t dst_stride, src_stride;                                     \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride);     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name,           \
+                                        dst_type, dst_cnt)              \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src);               \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                           pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                      \
+    dst_type  *dst_line;                                                \
+    int32_t    dst_stride;                                              \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (                                    \
+       imp, src_image, dest_image->bits.format);                       \
+                                                                        \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
+       return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src);                      \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name,      \
+                                             mask_type, mask_cnt,       \
+                                             dst_type, dst_cnt)         \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src,                \
+                                         int32_t    unused,             \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;                                               \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, mask_stride;                                 \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (                                    \
+       imp, src_image, dest_image->bits.format);                       \
+                                                                        \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
+       return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src, 0,                    \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name,       \
+                                            src_type, src_cnt,          \
+                                            dst_type, dst_cnt)          \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         uint32_t   mask);              \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;                                               \
+    src_type  *src_line;                                                \
+    int32_t    dst_stride, src_stride;                                  \
+    uint32_t   mask;                                                    \
+                                                                        \
+    mask = _pixman_image_get_solid (                                   \
+       imp, mask_image, dest_image->bits.format);                      \
+                                                                        \
+    if ((flags & SKIP_ZERO_MASK) && mask == 0)                          \
+       return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask);                     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \
+                                               src_type, src_cnt,       \
+                                               mask_type, mask_cnt,     \
+                                               dst_type, dst_cnt)       \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;                                               \
+    src_type  *src_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, src_stride, mask_stride;                     \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
+                                               src_type, dst_type)            \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x);  \
+                                                                              \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x);\
+}                                                                             \
+                                                                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, COVER)                             \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NONE)                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, PAD)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
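+
+/* e.g. pixman-arm-neon.c's fast path table contains entries like
+ *
+ *    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5,
+ *                                         neon_8888_0565),
+ *
+ * which expand to the three COVER/NONE/PAD table entries.
+ */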
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
+                                                  src_type, dst_type)         \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   const uint8_t *  mask);    \
+                                                                              \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
+                                                   dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+       return;                                                               \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x, \
+                                                                  mask);      \
+}                                                                             \
+                                                                              \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+                                                                              \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint32_t * mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+       return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+                                                                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NORMAL,                  \
+                       FLAG_NONE)
+
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+                                                                              \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+       return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+                                                                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, COVER,                    \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NONE,                     \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, PAD,                      \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NORMAL,                   \
+                       FLAG_HAVE_NON_SOLID_MASK)
+
+
+#endif
diff --git a/pixman/pixman-arm-detect-win32.asm b/pixman/pixman-arm-detect-win32.asm
new file mode 100644 (file)
index 0000000..8f5d5eb
--- /dev/null
@@ -0,0 +1,21 @@
+    area pixman_msvc, code, readonly
+
+    export  pixman_msvc_try_arm_simd_op
+
+pixman_msvc_try_arm_simd_op
+    ;; I don't think the msvc arm asm knows how to do SIMD insns
+    ;; uqadd8 r3,r3,r3
+    dcd 0xe6633f93
+    mov pc,lr
+    endp
+
+    export  pixman_msvc_try_arm_neon_op
+
+pixman_msvc_try_arm_neon_op
+    ;; I don't think the msvc arm asm knows how to do NEON insns
+    ;; veor d0,d0,d0
+    dcd 0xf3000110
+    mov pc,lr
+    endp
+
+    end
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
new file mode 100644 (file)
index 0000000..f7913ad
--- /dev/null
@@ -0,0 +1,1367 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author:  Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using Siarhei Siamashka's older bilinear macro template.
+ *
+ * << General scanline function procedure >>
+ *  1. bilinear interpolate source pixels
+ *  2. load mask pixels
+ *  3. load destination pixels
+ *  4. duplicate mask to fill whole register
+ *  5. interleave source & destination pixels
+ *  6. apply mask to source pixels
+ *  7. combine source & destination pixels
+ *  8. deinterleave final result
+ *  9. store destination pixels
+ *
+ * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
+ * Registers with double numbers (src01, dst01) are 128-bit registers.
+ * All temp registers can be used freely outside the code block.
+ * The symbols (register .req) OUT and MASK are assumed to be defined by
+ * the caller of these macro blocks.
+ *
+ * Remarks
+ *  There can be lots of pipeline stalls inside a code block and between
+ *  code blocks. Further optimizations will be done by new macro templates
+ *  using the head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.eabi_attribute 12, 0
+.arm
+.altmacro
+.p2align 2
+
+#include "pixman-arm-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
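+/* In the macros below, X is the horizontal source coordinate in 16.16
+ * fixed point and UX is its per-pixel increment; TOP points into the
+ * upper source scanline, and STRIDE (set up by the main template as
+ * BOTTOM - TOP) reaches the pixel directly below.
+ */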
+.macro bilinear_load_8888 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    vld1.32   {reg1}, [TMP1], STRIDE
+    vld1.32   {reg2}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    vld1.32   {reg2[0]}, [TMP1], STRIDE
+    vld1.32   {reg2[1]}, [TMP1]
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT]!
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT]!
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp.u8 d0, d1
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT]!
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT]!
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT]!
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+
+/*
+ * Macros for loading mask pixels into register 'mask'.
+ * The vdup must be done somewhere else.
+ */
+.macro bilinear_load_mask_x numpix, mask
+.endm
+
+.macro bilinear_load_mask_8 numpix, mask
+.if numpix == 4
+    vld1.32     {mask[0]}, [MASK]!
+.elseif numpix == 2
+    vld1.16     {mask[0]}, [MASK]!
+.elseif numpix == 1
+    vld1.8      {mask[0]}, [MASK]!
+.else
+    .error bilinear_load_mask_8 numpix is unsupported
+.endif
+    pld         [MASK, #prefetch_offset]
+.endm
+
+.macro bilinear_load_mask mask_fmt, numpix, mask
+    bilinear_load_mask_&mask_fmt numpix, mask
+.endm
+
+
+/*
+ * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
+ * Interleaving should be done somewhere else.
+ */
+.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.if numpix == 4
+    vld1.32     {dst0, dst1}, [OUT]
+.elseif numpix == 2
+    vld1.32     {dst0}, [OUT]
+.elseif numpix == 1
+    vld1.32     {dst0[0]}, [OUT]
+.else
+    .error bilinear_load_dst_8888 numpix is unsupported
+.endif
+    pld         [OUT, #(prefetch_offset * 4)]
+.endm
+
+.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
+    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
+    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
+    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Macros for duplicating a partially loaded mask to fill the entire register.
+ * We will apply the mask to interleaved source pixels, that is
+ *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * So we need to duplicate the loaded mask across the whole register.
+ *
+ * For the two pixel case
+ *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this, including the last pixel cases.
+ */
+.macro bilinear_duplicate_mask_x numpix, mask
+.endm
+
+.macro bilinear_duplicate_mask_8 numpix, mask
+.if numpix == 4
+    vdup.32     mask, mask[0]
+.elseif numpix == 2
+    vdup.16     mask, mask[0]
+.elseif numpix == 1
+    vdup.8      mask, mask[0]
+.else
+    .error bilinear_duplicate_mask_8 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_duplicate_mask mask_fmt, numpix, mask
+    bilinear_duplicate_mask_&mask_fmt numpix, mask
+.endm
+
+/*
+ * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
+ * Interleaving should be done when a mask is enabled or the operator is 'over'.
+ */
+.macro bilinear_interleave src0, src1, dst0, dst1
+    vuzp.8      src0, src1
+    vuzp.8      dst0, dst1
+    vuzp.8      src0, src1
+    vuzp.8      dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_x_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+.macro bilinear_interleave_src_dst_x_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_x_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_8_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_8_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst \
+                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave_src_dst_&mask_fmt&_&op \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+
+/*
+ * Macros for applying masks to src pixels (see the combine_mask_u() function).
+ * src and dst should be in interleaved form.
+ * The mask register should be in the form (m0, m1, m2, m3).
+ */
+.macro bilinear_apply_mask_to_src_x \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+.endm
+
+.macro bilinear_apply_mask_to_src_8 \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+
+    vmull.u8        tmp01, src0, mask
+    vmull.u8        tmp23, src1, mask
+    /* bubbles */
+    vrshr.u16       tmp45, tmp01, #8
+    vrshr.u16       tmp67, tmp23, #8
+    /* bubbles */
+    vraddhn.u16     src0, tmp45, tmp01
+    vraddhn.u16     src1, tmp67, tmp23
+.endm
+
+.macro bilinear_apply_mask_to_src \
+                mask_fmt, numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+
+    bilinear_apply_mask_to_src_&mask_fmt \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+.endm
+
+
+/*
+ * Macros for combining src and destination pixels.
+ * Whether to interleave depends on the operator 'op'.
+ */
+.macro bilinear_combine_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+.macro bilinear_combine_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+
+    vdup.32     tmp8, src1[1]
+    /* bubbles */
+    vmvn.8      tmp8, tmp8
+    /* bubbles */
+    vmull.u8    tmp01, dst0, tmp8
+    /* bubbles */
+    vmull.u8    tmp23, dst1, tmp8
+    /* bubbles */
+    vrshr.u16   tmp45, tmp01, #8
+    vrshr.u16   tmp67, tmp23, #8
+    /* bubbles */
+    vraddhn.u16 dst0, tmp45, tmp01
+    vraddhn.u16 dst1, tmp67, tmp23
+    /* bubbles */
+    vqadd.u8    src01, dst01, src01
+.endm
+
+.macro bilinear_combine_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+
+    vqadd.u8    src01, dst01, src01
+.endm
+
+.macro bilinear_combine \
+                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+
+    bilinear_combine_&op \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+/*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+.macro bilinear_deinterleave numpix, dst0, dst1, dst01
+    vuzp.8      dst0, dst1
+    /* bubbles */
+    vuzp.8      dst0, dst1
+.endm
+
+.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
+    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+
+.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
+    bilinear_load_&src_fmt d0, d1, d2
+    bilinear_load_mask mask_fmt, 1, d4
+    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    /* 5 cycles bubble */
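+    /* horizontal interpolation:
+     * q0 = d2 * (256 - frac) + d3 * frac, 8-bit frac taken from d30 */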
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    /* 5 cycles bubble */
+    bilinear_duplicate_mask mask_fmt, 1, d4
+    vshrn.u32 d0, q0, #16
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0
+    /* 1 cycle bubble */
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+    bilinear_apply_mask_to_src \
+                mask_fmt, 1, d0, d1, q0, d4, \
+                q3, q8, q10, q11
+    bilinear_combine \
+                op, 1, d0, d1, q0, d18, d19, q9, \
+                q3, q8, q10, q11, d5
+    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    bilinear_load_mask mask_fmt, 2, d4
+    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    bilinear_duplicate_mask mask_fmt, 2, d4
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vmovn.u16 d0, q0
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+    bilinear_apply_mask_to_src \
+                mask_fmt, 2, d0, d1, q0, d4, \
+                q3, q8, q10, q11
+    bilinear_combine \
+                op, 2, d0, d1, q0, d18, d19, q9, \
+                q3, q8, q10, q11, d5
+    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #8
+    vshll.u16 q2, d6, #8
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8
+    bilinear_load_mask mask_fmt, 4, d22
+    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+    pld       [TMP1, PF_OFFS]
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    bilinear_duplicate_mask mask_fmt, 4, d22
+    vshr.u16  q15, q12, #8
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    vadd.u16  q12, q12, q13
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+    bilinear_apply_mask_to_src \
+                mask_fmt, 4, d0, d1, q0, d22, \
+                q3, q8, q9, q10
+    bilinear_combine \
+                op, 4, d0, d1, q0, d2, d3, q1, \
+                q3, q8, q9, q10, d23
+    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.set BILINEAR_FLAG_USE_MASK,           1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS,  2
+
+/*
+ * Main template macro for generating NEON-optimized bilinear scanline
+ * functions.
+ *
+ * The bilinear scanline generator macro takes the following arguments:
+ *  fname                      - name of the function to generate
+ *  src_fmt                    - source color format (8888 or 0565)
+ *  dst_fmt                    - destination color format (8888 or 0565)
+ *  src/dst_bpp_shift          - (1 << bpp_shift) is the size of a src/dst
+ *                               pixel in bytes
+ *  process_last_pixel         - code block that interpolates one pixel and
+ *                               does not update the horizontal weight
+ *  process_two_pixels         - code block that interpolates two pixels and
+ *                               updates the horizontal weight
+ *  process_four_pixels        - code block that interpolates four pixels and
+ *                               updates the horizontal weight
+ *  process_pixblock_head      - head part of the middle loop
+ *  process_pixblock_tail      - tail part of the middle loop
+ *  process_pixblock_tail_head - tail_head part of the middle loop
+ *  pixblock_size              - number of pixels processed in a single
+ *                               middle loop iteration
+ *  prefetch_distance          - prefetch in the source image by that many
+ *                               pixels ahead
+ */
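+
+/* An illustrative instantiation (the real ones appear later in this
+ * file; the pixblock size and prefetch distance here are just example
+ * numbers):
+ *
+ *    generate_bilinear_scanline_func \
+ *        pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+ *        8888, 8888, 2, 2, \
+ *        bilinear_src_8888_8_8888_process_last_pixel, \
+ *        bilinear_src_8888_8_8888_process_two_pixels, \
+ *        bilinear_src_8888_8_8888_process_four_pixels, \
+ *        bilinear_src_8888_8_8888_process_pixblock_head, \
+ *        bilinear_src_8888_8_8888_process_pixblock_tail, \
+ *        bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+ *        4, 28, BILINEAR_FLAG_USE_MASK
+ */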
+
+.macro generate_bilinear_scanline_func \
+       fname, \
+       src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+       bilinear_process_last_pixel, \
+       bilinear_process_two_pixels, \
+       bilinear_process_four_pixels, \
+       bilinear_process_pixblock_head, \
+       bilinear_process_pixblock_tail, \
+       bilinear_process_pixblock_tail_head, \
+       pixblock_size, \
+       prefetch_distance, \
+       flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+    .error unsupported pixblock size
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+    OUT       .req    r0
+    TOP       .req    r1
+    BOTTOM    .req    r2
+    WT        .req    r3
+    WB        .req    r4
+    X         .req    r5
+    UX        .req    r6
+    WIDTH     .req    ip
+    TMP1      .req    r3
+    TMP2      .req    r4
+    PF_OFFS   .req    r7
+    TMP3      .req    r8
+    TMP4      .req    r9
+    STRIDE    .req    r2
+
+    mov                ip, sp
+    push       {r4, r5, r6, r7, r8, r9}
+    mov                PF_OFFS, #prefetch_distance
+    ldmia      ip, {WB, X, UX, WIDTH}
+.else
+    OUT       .req      r0
+    MASK      .req      r1
+    TOP       .req      r2
+    BOTTOM    .req      r3
+    WT        .req      r4
+    WB        .req      r5
+    X         .req      r6
+    UX        .req      r7
+    WIDTH     .req      ip
+    TMP1      .req      r4
+    TMP2      .req      r5
+    PF_OFFS   .req      r8
+    TMP3      .req      r9
+    TMP4      .req      r10
+    STRIDE    .req      r3
+
+    .set prefetch_offset, prefetch_distance
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9, r10, ip}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WT, WB, X, UX, WIDTH}
+.endif
+
+    mul       PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
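+    /* STRIDE = BOTTOM - TOP: from here on the bottom scanline is reached
+     * by adding STRIDE to a TOP-based pointer, which frees up BOTTOM */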
+    sub              STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f
+
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       0f
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    bilinear_process_last_pixel
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       0f
+    bilinear_process_two_pixels
+    sub       WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       0f
+    bilinear_process_four_pixels
+    sub       WIDTH, WIDTH, #4
+0:
+.endif
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_process_pixblock_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       5f
+0:
+    bilinear_process_pixblock_tail_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    bge       0b
+5:
+    bilinear_process_pixblock_tail
+1:
+.if pixblock_size == 8
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_process_four_pixels
+2:
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_process_two_pixels
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+    pop       {r4, r5, r6, r7, r8, r9}
+.else
+    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
+.endif
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+    .unreq    MASK
+.endif
+
+.endfunc
+
+.endm
+
+/* src_8888_8_8888 */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+    bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+    bilinear_src_8888_8_8888_process_pixblock_tail
+    bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565 */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+    bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+    bilinear_src_8888_8_0565_process_pixblock_tail
+    bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888 */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+    bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+    bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+    bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+    bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+    bilinear_src_0565_8_x888_process_pixblock_tail
+    bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565 */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+    bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+    bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+    bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+    bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+    bilinear_src_0565_8_0565_process_pixblock_tail
+    bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888 */
+.macro bilinear_over_8888_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head
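+    /* for each of the four output pixels: compute the source address from
+     * X (16.16 fixed point), load two adjacent pixels from the top and
+     * bottom rows, blend them vertically with the WT/WB weights (d28/d29),
+     * then blend horizontally with the fractional x weights (d30/d31) */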
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+
+    vld1.32     {d22}, [TMP1], STRIDE
+    vld1.32     {d23}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vmull.u8    q8, d22, d28
+    vmlal.u8    q8, d23, d29
+
+    vld1.32     {d22}, [TMP2], STRIDE
+    vld1.32     {d23}, [TMP2]
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmull.u8    q9, d22, d28
+    vmlal.u8    q9, d23, d29
+
+    vld1.32     {d22}, [TMP3], STRIDE
+    vld1.32     {d23}, [TMP3]
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+
+    vshll.u16   q0, d16, #8
+    vmlsl.u16   q0, d16, d30
+    vmlal.u16   q0, d17, d30
+
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+
+    vshll.u16   q1, d18, #8
+    vmlsl.u16   q1, d18, d31
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #8
+    vadd.u16    q12, q12, q13
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail
+    vshll.u16   q2, d20, #8
+    vmlsl.u16   q2, d20, d30
+    vmlal.u16   q2, d21, d30
+    vshll.u16   q3, d22, #8
+    vmlsl.u16   q3, d22, d31
+    vmlal.u16   q3, d23, d31
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+    vld1.32     {d2, d3}, [OUT, :128]
+    pld         [OUT, #(prefetch_offset * 4)]
+    vshrn.u32   d4, q2, #16
+    vshr.u16    q15, q12, #8
+    vshrn.u32   d5, q3, #16
+    vmovn.u16   d6, q0
+    vmovn.u16   d7, q2
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vdup.32     d4, d7[1]
+    vmvn.8      d4, d4
+    vmull.u8    q11, d2, d4
+    vmull.u8    q2, d3, d4
+    vrshr.u16   q1, q11, #8
+    vrshr.u16   q10, q2, #8
+    vraddhn.u16 d2, q1, q11
+    vraddhn.u16 d3, q10, q2
+    vqadd.u8    q3, q1, q3
+    vuzp.8      d6, d7
+    vuzp.8      d6, d7
+    vadd.u16    q12, q12, q13
+    vst1.32     {d6, d7}, [OUT, :128]!
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+                                            vshll.u16   q2, d20, #8
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vmlsl.u16   q2, d20, d30
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlal.u16   q2, d21, d30
+                                            vshll.u16   q3, d22, #8
+    vld1.32     {d20}, [TMP1], STRIDE
+                                            vmlsl.u16   q3, d22, d31
+                                            vmlal.u16   q3, d23, d31
+    vld1.32     {d21}, [TMP1]
+    vmull.u8    q8, d20, d28
+    vmlal.u8    q8, d21, d29
+                                            vshrn.u32   d0, q0, #16
+                                            vshrn.u32   d1, q1, #16
+                                            vld1.32     {d2, d3}, [OUT, :128]
+                                            pld         [OUT, PF_OFFS]
+                                            vshrn.u32   d4, q2, #16
+                                            vshr.u16    q15, q12, #8
+    vld1.32     {d22}, [TMP2], STRIDE
+                                            vshrn.u32   d5, q3, #16
+                                            vmovn.u16   d6, q0
+    vld1.32     {d23}, [TMP2]
+    vmull.u8    q9, d22, d28
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmlal.u8    q9, d23, d29
+                                            vmovn.u16   d7, q2
+    vld1.32     {d22}, [TMP3], STRIDE
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vdup.32     d4, d7[1]
+    vld1.32     {d23}, [TMP3]
+                                            vmvn.8      d4, d4
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+                                            vmull.u8    q11, d2, d4
+                                            vmull.u8    q2, d3, d4
+    vshll.u16   q0, d16, #8
+    vmlsl.u16   q0, d16, d30
+                                            vrshr.u16   q1, q11, #8
+    vmlal.u16   q0, d17, d30
+                                            vrshr.u16   q8, q2, #8
+                                            vraddhn.u16 d2, q1, q11
+                                            vraddhn.u16 d3, q8, q2
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+                                            vqadd.u8    q3, q1, q3
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+                                            vuzp.8      d6, d7
+    vshll.u16   q1, d18, #8
+                                            vuzp.8      d6, d7
+    vmlsl.u16   q1, d18, d31
+                                            vadd.u16    q12, q12, q13
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #8
+    vadd.u16    q12, q12, q13
+                                            vst1.32     {d6, d7}, [OUT, :128]!
+.endm
+
+/* over_8888_8_8888 */
+.macro bilinear_over_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vld1.32     {d3}, [TMP2]
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+    vshll.u16   q0, d4, #8
+    vshll.u16   q1, d6, #8
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+    vld1.32     {d2}, [TMP3], STRIDE
+    vld1.32     {d3}, [TMP3]
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #8
+    vld1.32     {d22[0]}, [MASK]!
+    pld         [MASK, #prefetch_offset]
+    vadd.u16    q12, q12, q13
+    vmovn.u16   d16, q0
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail
+    vshll.u16   q9, d6, #8
+    vshll.u16   q10, d2, #8
+    vmlsl.u16   q9, d6, d30
+    vmlsl.u16   q10, d2, d31
+    vmlal.u16   q9, d7, d30
+    vmlal.u16   q10, d3, d31
+    vshr.u16    q15, q12, #8
+    vadd.u16    q12, q12, q13
+    vdup.32     d22, d22[0]
+    vshrn.u32   d18, q9, #16
+    vshrn.u32   d19, q10, #16
+    vmovn.u16   d17, q9
+    vld1.32     {d18, d19}, [OUT, :128]
+    pld         [OUT, PF_OFFS]
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vmull.u8    q10, d16, d22
+    vmull.u8    q11, d17, d22
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+    vrshrn.u16  d16, q10, #8
+    vrshrn.u16  d17, q11, #8
+    vdup.32     d22, d17[1]
+    vmvn.8      d22, d22
+    vmull.u8    q10, d18, d22
+    vmull.u8    q11, d19, d22
+    vrshr.u16   q9, q10, #8
+    vrshr.u16   q0, q11, #8
+    vraddhn.u16 d18, q9, q10
+    vraddhn.u16 d19, q0, q11
+    vqadd.u8    q9, q8, q9
+    vuzp.8      d18, d19
+    vuzp.8      d18, d19
+    vst1.32     {d18, d19}, [OUT, :128]!
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+                                            vshll.u16   q9, d6, #8
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vshll.u16   q10, d2, #8
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlsl.u16   q9, d6, d30
+                                            vmlsl.u16   q10, d2, d31
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+                                            vmlal.u16   q9, d7, d30
+                                            vmlal.u16   q10, d3, d31
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+                                            vshr.u16    q15, q12, #8
+                                            vadd.u16    q12, q12, q13
+    vld1.32     {d3}, [TMP2]
+                                            vdup.32     d22, d22[0]
+                                            vshrn.u32   d18, q9, #16
+                                            vshrn.u32   d19, q10, #16
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+                                            vmovn.u16   d17, q9
+                                            vld1.32     {d18, d19}, [OUT, :128]
+                                            pld         [OUT, #(prefetch_offset * 4)]
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vshll.u16   q0, d4, #8
+    vshll.u16   q1, d6, #8
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+                                            vmull.u8    q10, d16, d22
+                                            vmull.u8    q11, d17, d22
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+                                            vrsra.u16   q10, q10, #8
+                                            vrsra.u16   q11, q11, #8
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+                                            vrshrn.u16  d16, q10, #8
+                                            vrshrn.u16  d17, q11, #8
+    vld1.32     {d2}, [TMP3], STRIDE
+                                            vdup.32     d22, d17[1]
+    vld1.32     {d3}, [TMP3]
+                                            vmvn.8      d22, d22
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+                                            vmull.u8    q10, d18, d22
+                                            vmull.u8    q11, d19, d22
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+                                            vrshr.u16   q9, q10, #8
+                                            vrshr.u16   q15, q11, #8
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+                                            vraddhn.u16 d18, q9, q10
+                                            vraddhn.u16 d19, q15, q11
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #8
+                                            vqadd.u8    q9, q8, q9
+    vld1.32     {d22[0]}, [MASK]!
+                                            vuzp.8      d18, d19
+    vadd.u16    q12, q12, q13
+                                            vuzp.8      d18, d19
+    vmovn.u16   d16, q0
+                                            vst1.32     {d18, d19}, [OUT, :128]!
+.endm
+
+/* add_8888_8888 */
+.macro bilinear_add_8888_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head
+    bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head
+    bilinear_add_8888_8888_process_pixblock_tail
+    bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888 */
+.macro bilinear_add_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head
+    bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+    bilinear_add_8888_8_8888_process_pixblock_tail
+    bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions */
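+/*
+ * As a rough C model of what a single interpolated channel computes (a
+ * sketch only, with assumed names: the vertical weights wt and wb are
+ * assumed to sum to 256, and distx is the 8-bit fractional part of the
+ * 16.16 fixed-point x coordinate):
+ *
+ *   static uint8_t
+ *   bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
+ *                     unsigned wt, unsigned wb, unsigned distx)
+ *   {
+ *       // vertical pass (vmull.u8/vmlal.u8 with the weights in d28/d29)
+ *       unsigned left  = tl * wt + bl * wb;
+ *       unsigned right = tr * wt + br * wb;
+ *       // horizontal pass (vshll.u16/vmlsl.u16/vmlal.u16 with d30/d31)
+ *       return (left * (256 - distx) + right * distx) >> 16;
+ *   }
+ */
+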
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_src_8888_8_8888_process_last_pixel, \
+    bilinear_src_8888_8_8888_process_two_pixels, \
+    bilinear_src_8888_8_8888_process_four_pixels, \
+    bilinear_src_8888_8_8888_process_pixblock_head, \
+    bilinear_src_8888_8_8888_process_pixblock_tail, \
+    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+    8888, 0565, 2, 1, \
+    bilinear_src_8888_8_0565_process_last_pixel, \
+    bilinear_src_8888_8_0565_process_two_pixels, \
+    bilinear_src_8888_8_0565_process_four_pixels, \
+    bilinear_src_8888_8_0565_process_pixblock_head, \
+    bilinear_src_8888_8_0565_process_pixblock_tail, \
+    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+    0565, 8888, 1, 2, \
+    bilinear_src_0565_8_x888_process_last_pixel, \
+    bilinear_src_0565_8_x888_process_two_pixels, \
+    bilinear_src_0565_8_x888_process_four_pixels, \
+    bilinear_src_0565_8_x888_process_pixblock_head, \
+    bilinear_src_0565_8_x888_process_pixblock_tail, \
+    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+    0565, 0565, 1, 1, \
+    bilinear_src_0565_8_0565_process_last_pixel, \
+    bilinear_src_0565_8_0565_process_two_pixels, \
+    bilinear_src_0565_8_0565_process_four_pixels, \
+    bilinear_src_0565_8_0565_process_pixblock_head, \
+    bilinear_src_0565_8_0565_process_pixblock_tail, \
+    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_over_8888_8888_process_last_pixel, \
+    bilinear_over_8888_8888_process_two_pixels, \
+    bilinear_over_8888_8888_process_four_pixels, \
+    bilinear_over_8888_8888_process_pixblock_head, \
+    bilinear_over_8888_8888_process_pixblock_tail, \
+    bilinear_over_8888_8888_process_pixblock_tail_head, \
+    4, 28, 0
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_over_8888_8_8888_process_last_pixel, \
+    bilinear_over_8888_8_8888_process_two_pixels, \
+    bilinear_over_8888_8_8888_process_four_pixels, \
+    bilinear_over_8888_8_8888_process_pixblock_head, \
+    bilinear_over_8888_8_8888_process_pixblock_tail, \
+    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_add_8888_8888_process_last_pixel, \
+    bilinear_add_8888_8888_process_two_pixels, \
+    bilinear_add_8888_8888_process_four_pixels, \
+    bilinear_add_8888_8888_process_pixblock_head, \
+    bilinear_add_8888_8888_process_pixblock_tail, \
+    bilinear_add_8888_8888_process_pixblock_tail_head, \
+    4, 28, 0
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_add_8888_8_8888_process_last_pixel, \
+    bilinear_add_8888_8_8888_process_two_pixels, \
+    bilinear_add_8888_8_8888_process_four_pixels, \
+    bilinear_add_8888_8_8888_process_pixblock_head, \
+    bilinear_add_8888_8_8888_process_pixblock_tail, \
+    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
new file mode 100644 (file)
index 0000000..87aae1d
--- /dev/null
@@ -0,0 +1,3636 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which expose some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for the following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+    .arm
+    .altmacro
+    .p2align 2
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
+ * example on Linux, if unaligned memory accesses are not configured to
+ * generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
+ * as a NOP to work around some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as the ARM
+ * Cortex-A8, which can run ARM and NEON instructions simultaneously, so that
+ * extra ARM instructions do not add (many) extra cycles but improve prefetch
+ * efficiency)
+ *
+ * Note: some types of functions can't support advanced prefetch and fall back
+ *       to the simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes an a8r8g8b8 source buffer and an r5g6b5 destination
+ * buffer and performs the OVER compositing operation. The function
+ * fast_composite_over_8888_0565 from pixman-fast-path.c does the same in C
+ * and can be used as a reference.
+ *
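+ * As a rough, self-contained C model of the per-pixel math (an
+ * illustration only; the C reference named above is the authoritative
+ * version):
+ *
+ *   #include <stdint.h>
+ *
+ *   static uint8_t mul_div_255 (unsigned a, unsigned b)
+ *   {
+ *       unsigned t = a * b + 0x80;   // matches vmull + vrshr + vraddhn
+ *       return (t + (t >> 8)) >> 8;
+ *   }
+ *
+ *   static uint8_t sat_add (unsigned a, unsigned b)
+ *   {
+ *       unsigned t = a + b;          // matches vqadd.u8
+ *       return t > 255 ? 255 : t;
+ *   }
+ *
+ *   static uint16_t over_8888_0565 (uint32_t s, uint16_t d)
+ *   {
+ *       unsigned ia = 255 - (s >> 24);                // inverted src alpha
+ *       unsigned dr = (d >> 8) & 0xf8; dr |= dr >> 5; // expand r5g6b5
+ *       unsigned dg = (d >> 3) & 0xfc; dg |= dg >> 6; // to 8 bits per
+ *       unsigned db = (d << 3) & 0xf8; db |= db >> 5; // channel
+ *       unsigned r = sat_add ((s >> 16) & 0xff, mul_div_255 (ia, dr));
+ *       unsigned g = sat_add ((s >> 8) & 0xff, mul_div_255 (ia, dg));
+ *       unsigned b = sat_add (s & 0xff, mul_div_255 (ia, db));
+ *       return ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+ *   }
+ *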
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * The template macro quite conveniently takes care of emitting all the
+ * necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * The recommended NEON register allocation in general is the following:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following register allocation:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5            - contain loaded destination pixels (they are needed)
+ * d28, d29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetic on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. The common template macro can
+ * optionally make our life a bit easier by deinterleaving the R, G, B, A
+ * color components for 32bpp pixel formats (and this feature is used in
+ * the 'pixman_composite_over_8888_0565_asm_neon' function). This means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can also be done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *  vuzp.8 d0, d1
+ *  vuzp.8 d2, d3
+ *  vuzp.8 d1, d3
+ *  vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *  vzip.8 d0, d2
+ *  vzip.8 d1, d3
+ *  vzip.8 d2, d3
+ *  vzip.8 d0, d1
+ *
+ * But pixels can be loaded directly in planar format using the VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable; that's why deinterleaving is optional.
+ *
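+ * In C terms the deinterleaved layout corresponds to (a sketch only; 'src'
+ * is assumed to point at the packed a8r8g8b8 pixels):
+ *
+ *   uint8_t b[8], g[8], r[8], a[8];
+ *   for (int i = 0; i < 8; i++)
+ *   {
+ *       uint32_t p = src[i];
+ *       b[i] = p & 0xff;
+ *       g[i] = (p >> 8) & 0xff;
+ *       r[i] = (p >> 16) & 0xff;
+ *       a[i] = p >> 24;
+ *   }
+ *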
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/*
+ * OK, now we have almost everything that we need. Using the above two
+ * macros, the work can get done. But now we want to optimize
+ * it a bit. The ARM Cortex-A8 is an in-order core and benefits
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct the code which will run in the core main loop.
+ * Pseudo-code for the main loop looks like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup allows us to hide instruction
+ * latencies better and also utilize the dual-issue capability more
+ * efficiently (pairing load/store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.32     {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also has some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the next one with just arithmetic.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store full blocks
+ * of pixels in bulk. Additionally, the destination buffer is already
+ * 16-byte aligned here (which is good for performance).
+ *
+ * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
+ * are aliases for the ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading from and
+ * writing to the destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is the 'cache_preload' macro. It is used for
+ * prefetching data into the CPU L2 cache, improving performance when
+ * dealing with images which are far larger than the cache. It takes one
+ * argument (actually two, but they need to be the same here) - the number
+ * of pixels in a block. Looking into 'pixman-arm-neon-asm.h' can provide
+ * some details about this macro. Moreover, if good performance is needed,
+ * the code from this macro needs to be copied into the '*_tail_head' macro
+ * and mixed with the rest of the code for optimal instruction scheduling.
+ * We are actually doing that below.
+ *
+ * Now, after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from the '*_head', '*_tail'
+ * and 'cache_preload' macros) use different indentation levels for
+ * better readability. Actually, taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
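+/*
+ * In C-like pseudo-code the resulting pipelined loop structure is
+ * (names are illustrative):
+ *
+ *   head (block[0]);
+ *   for (i = 1; i < nblocks; i++)
+ *   {
+ *       tail (block[i - 1]);   // finish the previous block
+ *       head (block[i]);       // and already start the next one
+ *   }
+ *   tail (block[nblocks - 1]);
+ *
+ * The '*_tail_head' macro below is this fused loop body, with the two
+ * instruction streams interleaved by hand.
+ */
+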
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+        vqadd.u8    d16, d2, d20
+    vld1.16     {d4, d5}, [DST_R, :128]!
+        vqadd.u8    q9, q0, q11
+    vshrn.u16   d6, q2, #8
+    fetch_src_pixblock
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+    vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+    vmvn.8      d3, d3
+                                    PF subne PF_CTL, PF_CTL, #1
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+    vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsri.u16    q14, q9, #11
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vraddhn.u16 d22, q12, q15
+        vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    fetch_src_pixblock
+    pixman_composite_over_8888_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using the 'generate_composite_function'
+ * macro to put all the stuff together. We specify the name of the function
+ * which we want to get, the number of bits per pixel for the source, mask
+ * and destination (0 if unused, like the mask in this case). Next come some
+ * bit flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written; for a write-only buffer we would
+ *                             use the FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp formats.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum that can fit into four 64-bit NEON registers).
+ *  - the prefetch distance, measured in pixel blocks. In this case it is
+ *    5 blocks of 8 pixels. That would be 40 pixels, or up to 160 bytes. The
+ *    optimal prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros: these are 'default_init' and
+ * 'default_cleanup' here, which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else), followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we implemented above.
+ *
+ * The last part is the NEON register allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    pixman_composite_over_n_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
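+    /* load the solid source color from the stack and splat each channel */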
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
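+    /* widen the blue (d0), green (d1) and red (d2) planes to 16 bits;
+       the tail merges them into r5g6b5 with vsri */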
+    vshll.u8    q8, d1, #8
+    vshll.u8    q14, d2, #8
+    vshll.u8    q9, d0, #8
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        vsri.u16    q14, q8, #5
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    fetch_src_pixblock
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vshll.u8    q8, d1, #8
+        vst1.16     {d28, d29}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vshll.u8    q14, d2, #8
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
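+    /* expand r5g6b5 source pixels from q0 into planar 8-bit r/g/b with
+       bit replication, and set the alpha channel (d31) to 0xff */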
+    vshrn.u16   d30, q0, #8
+    vshrn.u16   d29, q0, #3
+    vsli.u16    q0, q0, #5
+    vmov.u8     d31, #255
+    vsri.u8     d30, d30, #5
+    vsri.u8     d29, d29, #6
+    vshrn.u16   d28, q0, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+    fetch_src_pixblock
+    pixman_composite_src_0565_8888_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
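+    /* saturating add of source and destination, 32 a8 pixels per block */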
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+    fetch_src_pixblock
+                                    PF add PF_X, PF_X, #32
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+                                    PF addne PF_X, PF_X, #32
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+    fetch_src_pixblock
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    fetch_src_pixblock
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+    /* deinterleaved source pixels in {d0, d1, d2, d3} */
+    /* inverted alpha in {d24} */
+    /* destination pixels in {d4, d5, d6, d7} */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q2, q10, #8
+    vrshr.u16   q3, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q2, q10
+    vraddhn.u16 d31, q3, q11
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q2, q10, #8
+        vrshr.u16   q3, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q2, q10
+        vraddhn.u16 d31, q3, q11
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vqadd.u8    q14, q0, q14
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0x0F
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vqadd.u8    q15, q1, q15
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d4
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q9, d24, d5
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d6
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d7
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_init
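+    /* load the solid source color from the stack, splat each channel and
+       precompute the inverted alpha */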
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+    vmvn.8      d24, d3  /* get inverted alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
+    vmvn.8      d22, d3
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
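+    /* load the solid color from the stack and splat its channels into d4-d7 */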
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d7[0]}, [DUMMY]
+    vdup.8      d4, d7[0]
+    vdup.8      d5, d7[1]
+    vdup.8      d6, d7[2]
+    vdup.8      d7, d7[3]
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    4,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
+    vmull.u8    q1,  d24, d9
+    vmull.u8    q6,  d24, d10
+    vmull.u8    q7,  d24, d11
+        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
+        vshrn.u16   d7,  q2, #3
+        vsli.u16    q2,  q2, #5
+    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
+        vsri.u8     d7,  d7, #6
+    vmvn.8      d3,  d3
+        vshrn.u16   d30, q2, #2
+    vmull.u8    q8,  d3, d6     /* now do alpha blending */
+    vmull.u8    q9,  d3, d7
+    vmull.u8    q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+    /* 3 cycle bubble (after vmull.u8) */
+    vrshr.u16   q13, q8,  #8
+    vrshr.u16   q11, q9,  #8
+    vrshr.u16   q15, q10, #8
+    vraddhn.u16 d16, q8,  q13
+    vraddhn.u16 d27, q9,  q11
+    vraddhn.u16 d26, q10, q15
+    vqadd.u8    d16, d2,  d16
+    /* 1 cycle bubble */
+    vqadd.u8    q9,  q0,  q13
+    vshll.u8    q14, d16, #8    /* convert to 16bpp */
+    vshll.u8    q8,  d19, #8
+    vshll.u8    q9,  d18, #8
+    vsri.u16    q14, q8,  #5
+    /* 1 cycle bubble */
+    vsri.u16    q14, q9,  #11
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    vshrn.u16   d6,  q2,  #8
+    fetch_mask_pixblock
+    vshrn.u16   d7,  q2,  #3
+    fetch_src_pixblock
+    vmull.u8    q6,  d24, d10
+        vrshr.u16   q13, q8,  #8
+        vrshr.u16   q11, q9,  #8
+        vrshr.u16   q15, q10, #8
+        vraddhn.u16 d16, q8,  q13
+        vraddhn.u16 d27, q9,  q11
+        vraddhn.u16 d26, q10, q15
+        vqadd.u8    d16, d2,  d16
+    vmull.u8    q1,  d24, d9
+        vqadd.u8    q9,  q0,  q13
+        vshll.u8    q14, d16, #8
+    vmull.u8    q0,  d24, d8
+        vshll.u8    q8,  d19, #8
+        vshll.u8    q9,  d18, #8
+        vsri.u16    q14, q8,  #5
+    vmull.u8    q7,  d24, d11
+        vsri.u16    q14, q9,  #11
+
+    cache_preload 8, 8
+
+    vsli.u16    q2,  q2,  #5
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+    vsri.u8     d6,  d6,  #5
+    vsri.u8     d7,  d7,  #6
+    vmvn.8      d3,  d3
+    vshrn.u16   d30, q2,  #2
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vmull.u8    q8,  d3,  d6
+    vmull.u8    q9,  d3,  d7
+    vmull.u8    q10, d3,  d30
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * This function needs special initialization of the solid source.
+ * The solid source pixel data is fetched from the stack at offset
+ * ARGS_STACK_OFFSET, split into color components and replicated
+ * into the d8-d11 registers. Additionally, this function needs all
+ * the NEON registers, so it has to save the d8-d15 registers, which
+ * are callee-saved according to the ABI. These registers are restored
+ * in the 'cleanup' macro. All the other NEON registers are
+ * caller-saved, so they can be clobbered freely without introducing
+ * any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vpush       {d8-d15}
+    vld1.32     {d24[0]}, [DUMMY]
+    vdup.8      d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_0565_init, \
+    pixman_composite_over_8888_n_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock
+    cache_preload 16, 16
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
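+/*
+ * Solid fill of an a8 destination. The init macro replicates the 8-bit
+ * source across all of q0/q1 by repeated doubling: each vsli.u64
+ * shifts the register left while re-inserting its original low bits,
+ * spreading the low byte to 2, then 4, then 8 bytes; the vorr copies
+ * then fan d0 out to d1-d3, giving 32 identical bytes per store.
+ */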
+.macro pixman_composite_src_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #8
+    vsli.u64    d0, d0, #16
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #16
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
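+/*
+ * x8r8g8b8 -> a8r8g8b8 copy: the init macro builds an 0xff000000
+ * constant in q2 (vmov.u8 sets every byte to 0xff, vshl.u32 #24 then
+ * keeps only the top byte of each lane), and the head simply ORs it
+ * into the pixels to force the alpha channel to 255.
+ */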
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+    vorr     q0, q0, q2
+    vorr     q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock
+    vorr     q0, q0, q2
+    vorr     q1, q1, q2
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+    vmov.u8  q2, #0xFF
+    vshl.u32 q2, q2, #24
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
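+/*
+ * This function uses the vrsra.u16 + vrshrn.u16 form of the rounded
+ * division by 255: vrsra accumulates (t + 128) >> 8 back into t in
+ * place, and vrshrn narrows with another (t' + 128) >> 8. The result
+ * is bit-identical to the vrshr/vraddhn sequence used elsewhere, but
+ * needs no scratch registers for the intermediate shifts.
+ */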
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+    /* expecting solid source in {d0, d1, d2, d3} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q8, d24, d0
+    vmull.u8    q9, d24, d1
+    vmull.u8    q10, d24, d2
+    vmull.u8    q11, d24, d3
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+    vrshrn.u16  d28, q8, #8
+    vrshrn.u16  d29, q9, #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q8, #8
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q9, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q10, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q11, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d0
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q9, d24, d1
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d2
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d3
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8888_init, \
+    pixman_composite_src_n_8_8888_cleanup, \
+    pixman_composite_src_n_8_8888_process_pixblock_head, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+    vmull.u8    q0, d24, d16
+    vmull.u8    q1, d25, d16
+    vmull.u8    q2, d26, d16
+    vmull.u8    q3, d27, d16
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+    vrshrn.u16  d28, q0, #8
+    vrshrn.u16  d29, q1, #8
+    vrshrn.u16  d30, q2, #8
+    vrshrn.u16  d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q0, #8
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q1, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q2, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q3, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q0,  d24, d16
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q1,  d25, d16
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q2,  d26, d16
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q3,  d27, d16
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d16[0]}, [DUMMY]
+    vdup.8      d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8_init, \
+    pixman_composite_src_n_8_8_cleanup, \
+    pixman_composite_src_n_8_8_process_pixblock_head, \
+    pixman_composite_src_n_8_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q6, d24, d8
+    vmull.u8    q7, d24, d9
+    vmull.u8    q8, d24, d10
+    vmull.u8    q9, d24, d11
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+    vmvn.8      d25, d3  /* get inverted alpha */
+    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q6, q10, #8
+    vrshr.u16   q7, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6, q10
+    vraddhn.u16 d31, q7, q11
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q15, q9, #8
+    fetch_mask_pixblock
+        vrshr.u16   q6, q10, #8
+                                    PF add PF_X, PF_X, #8
+        vrshr.u16   q7, q11, #8
+                                    PF tst PF_CTL, #0x0F
+        vraddhn.u16 d28, q14, q8
+                                    PF addne PF_X, PF_X, #8
+        vraddhn.u16 d29, q15, q9
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d30, q6, q10
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d31, q7, q11
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q6, d24, d8
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q7, d24, d9
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d24, d10
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d24, d11
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vqadd.u8    q14, q0, q14
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vqadd.u8    q15, q1, q15
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vmvn.8      d25, d3
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, \
+    pixman_composite_over_n_8_8888_cleanup, \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d8
+    vmull.u8    q6,  d26, d8
+    vmull.u8    q7,  d27, d8
+    vrshr.u16   q10, q0,  #8
+    vrshr.u16   q11, q1,  #8
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q13, q7,  #8
+    vraddhn.u16 d0,  q0,  q10
+    vraddhn.u16 d1,  q1,  q11
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d3,  q7,  q13
+    vmvn.8      q12, q0
+    vmvn.8      q13, q1
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0,  q14
+    vqadd.u8    q15, q1,  q15
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_n_8_8_process_pixblock_tail
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d8[0]}, [DUMMY]
+    vdup.8      d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8_init, \
+    pixman_composite_over_n_8_8_cleanup, \
+    pixman_composite_over_n_8_8_process_pixblock_head, \
+    pixman_composite_over_n_8_8_process_pixblock_tail, \
+    pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
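+/*
+ * Component alpha OVER: the a8r8g8b8 mask carries a separate weight
+ * for each channel, so the blend becomes, per channel c,
+ *   dst.c = src.c * mask.c / 255 + dst.c * (255 - src.a * mask.c / 255) / 255
+ * The head below therefore produces both an updated source
+ * (src * mask) and an updated mask (src.alpha * mask) before the
+ * usual OVER combining step.
+ */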
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}
+     *         dest in          {d4,  d5,  d6,  d7 }
+     *         mask in          {d24, d25, d26, d27}
+     * output: updated src in   {d0,  d1,  d2,  d3 }
+     *         updated mask in  {d24, d25, d26, d3 }
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q7,  d27, d11
+    vmull.u8    q9,  d11, d25
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vrshr.u16   q10, q7,  #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    vraddhn.u16 d26, q13, q6
+    vraddhn.u16 d3,  q7,  q10
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {d28, d29, d30, d31}
+     */
+    vmvn.8      q12, q12
+    vmvn.8      d26, d26
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmvn.8      d27, d3
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q6,  q10, #8
+    vrshr.u16   q7,  q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6,  q10
+    vraddhn.u16 d31, q7,  q11
+    vqadd.u8    q14, q0,  q14
+    vqadd.u8    q15, q1,  q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+        vrshr.u16   q15, q9, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q6, q10, #8
+        vrshr.u16   q7, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q6, q10
+        vraddhn.u16 d31, q7, q11
+    fetch_mask_pixblock
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    cache_preload 8, 8
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, \
+    pixman_composite_over_n_8888_8888_ca_cleanup, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+     *         mask in          {d24, d25, d26}       [B, G, R]
+     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+     *         updated mask in  {d24, d25, d26}       [B, G, R]
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q9,  d11, d25
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    /*
+     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+     * and put data into d16 - blue, d17 - green, d18 - red
+     */
+       vshrn.u16   d17, q2,  #3
+       vshrn.u16   d18, q2,  #8
+    vraddhn.u16 d26, q13, q6
+       vsli.u16    q2,  q2,  #5
+       vsri.u8     d18, d18, #5
+       vsri.u8     d17, d17, #6
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in d16 - blue, d17 - green, d18 - red
+     */
+    vmvn.8      q12, q12
+       vshrn.u16   d16, q2,  #2
+    vmvn.8      d26, d26
+    vmull.u8    q6,  d16, d24
+    vmull.u8    q7,  d17, d25
+    vmull.u8    q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q14, q7,  #8
+    vrshr.u16   q15, q11, #8
+    vraddhn.u16 d16, q10, q6
+    vraddhn.u16 d17, q14, q7
+    vraddhn.u16 d18, q15, q11
+    vqadd.u8    q8,  q0,  q8
+    vqadd.u8    d18, d2,  d18
+    /*
+     * convert the results in d16, d17, d18 to r5g6b5 and store
+     * them into {d28, d29}
+     */
+    vshll.u8    q14, d18, #8
+    vshll.u8    q10, d17, #8
+    vshll.u8    q15, d16, #8
+    vsri.u16    q14, q10, #5
+    vsri.u16    q14, q15, #11
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+    fetch_mask_pixblock
+        vrshr.u16   q10, q6, #8
+        vrshr.u16   q14, q7, #8
+    vld1.16     {d4, d5}, [DST_R, :128]!
+        vrshr.u16   q15, q11, #8
+        vraddhn.u16 d16, q10, q6
+        vraddhn.u16 d17, q14, q7
+        vraddhn.u16 d22, q15, q11
+            /* process_pixblock_head */
+            /*
+             * 'combine_mask_ca' replacement
+             *
+             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+             *         mask in          {d24, d25, d26}       [B, G, R]
+             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+             *         updated mask in  {d24, d25, d26}       [B, G, R]
+             */
+            vmull.u8    q6,  d26, d10
+        vqadd.u8    q8,  q0, q8
+            vmull.u8    q0,  d24, d8
+        vqadd.u8    d22, d2, d22
+            vmull.u8    q1,  d25, d9
+        /*
+         * convert the result in d16, d17, d22 to r5g6b5 and store
+         * it into {d28, d29}
+         */
+        vshll.u8    q14, d22, #8
+        vshll.u8    q10, d17, #8
+        vshll.u8    q15, d16, #8
+            vmull.u8    q9,  d11, d25
+        vsri.u16    q14, q10, #5
+            vmull.u8    q12, d11, d24
+            vmull.u8    q13, d11, d26
+        vsri.u16    q14, q15, #11
+    cache_preload 8, 8
+            vrshr.u16   q8,  q0,  #8
+            vrshr.u16   q10, q1,  #8
+            vrshr.u16   q11, q6,  #8
+            vraddhn.u16 d0,  q0,  q8
+            vraddhn.u16 d1,  q1,  q10
+            vraddhn.u16 d2,  q6,  q11
+            vrshr.u16   q11, q12, #8
+            vrshr.u16   q8,  q9,  #8
+            vrshr.u16   q6,  q13, #8
+            vraddhn.u16 d24, q12, q11
+            vraddhn.u16 d25, q9,  q8
+                /*
+                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+                 * 8-bit format and put data into d16 - blue, d17 - green,
+                 * d18 - red
+                 */
+                vshrn.u16   d17, q2,  #3
+                vshrn.u16   d18, q2,  #8
+            vraddhn.u16 d26, q13, q6
+                vsli.u16    q2,  q2,  #5
+                vsri.u8     d17, d17, #6
+                vsri.u8     d18, d18, #5
+            /*
+             * 'combine_over_ca' replacement
+             *
+             * output: updated dest in d16 - blue, d17 - green, d18 - red
+             */
+            vmvn.8      q12, q12
+                vshrn.u16   d16, q2,  #2
+            vmvn.8      d26, d26
+            vmull.u8    q7,  d17, d25
+            vmull.u8    q6,  d16, d24
+            vmull.u8    q11, d18, d26
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_0565_ca_init, \
+    pixman_composite_over_n_8888_0565_ca_cleanup, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
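+/*
+ * IN with a solid source on an a8 destination: every destination byte
+ * is scaled by the solid alpha, dst = dst * src.a / 255, so the init
+ * macro only keeps the alpha byte of the solid value (splatted into
+ * d3).
+ */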
+.macro pixman_composite_in_n_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* and destination data in {d4, d5, d6, d7} */
+    vmull.u8    q8,  d4,  d3
+    vmull.u8    q9,  d5,  d3
+    vmull.u8    q10, d6,  d3
+    vmull.u8    q11, d7,  d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q8,  q14
+    vraddhn.u16 d29, q9,  q15
+    vraddhn.u16 d30, q10, q12
+    vraddhn.u16 d31, q11, q13
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+    pixman_composite_in_n_8_process_pixblock_tail
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    cache_preload 32, 32
+    pixman_composite_in_n_8_process_pixblock_head
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_in_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_in_n_8_init, \
+    pixman_composite_in_n_8_cleanup, \
+    pixman_composite_in_n_8_process_pixblock_head, \
+    pixman_composite_in_n_8_process_pixblock_tail, \
+    pixman_composite_in_n_8_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
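+/*
+ * ADD with a solid source and an a8 mask:
+ * dst = clamp(dst + src.a * mask / 255). The head scales the mask by
+ * the solid alpha and the saturating vqadd does the rest.
+ */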
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+    /* expecting source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24, d25, d26, d27 */
+    vmull.u8    q0, d24, d11
+    vmull.u8    q1, d25, d11
+    vmull.u8    q6, d26, d11
+    vmull.u8    q7, d27, d11
+    vrshr.u16   q10, q0, #8
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+    pixman_composite_add_n_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, \
+    pixman_composite_add_n_8_8_cleanup, \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d24, d0
+    vmull.u8    q9, d25, d1
+    vmull.u8    q10, d26, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+    pixman_composite_add_8_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    fetch_mask_pixblock
+    fetch_src_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, \
+    pixman_composite_add_8_8_8_cleanup, \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
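+/*
+ * Masked ADD for a8r8g8b8: dst = clamp(dst + src * mask.a / 255);
+ * only the alpha plane of the mask (d27) is used. The 'bubble'
+ * comments in the head and tail mark result latencies on the in-order
+ * NEON pipeline, which the interleaved tail_head variant hides behind
+ * the loads and stores.
+ */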
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8,  d27, d0
+    vmull.u8    q9,  d27, d1
+    vmull.u8    q10, d27, d2
+    vmull.u8    q11, d27, d3
+    /* 1 cycle bubble */
+    vrsra.u16   q8,  q8,  #8
+    vrsra.u16   q9,  q9,  #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+    /* 2 cycle bubble */
+    vrshrn.u16  d28, q8,  #8
+    vrshrn.u16  d29, q9,  #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+    vqadd.u8    q14, q2,  q14
+    /* 1 cycle bubble */
+    vqadd.u8    q15, q3,  q15
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+        vrshrn.u16  d28, q8,  #8
+    fetch_mask_pixblock
+        vrshrn.u16  d29, q9,  #8
+    vmull.u8    q8,  d27, d0
+        vrshrn.u16  d30, q10, #8
+    vmull.u8    q9,  d27, d1
+        vrshrn.u16  d31, q11, #8
+    vmull.u8    q10, d27, d2
+        vqadd.u8    q14, q2,  q14
+    vmull.u8    q11, d27, d3
+        vqadd.u8    q15, q3,  q15
+    vrsra.u16   q8,  q8,  #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vrsra.u16   q9,  q9,  #8
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q10, q10, #8
+
+    cache_preload 8, 8
+
+    vrsra.u16   q11, q11, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8888_init, \
+    pixman_composite_add_n_8_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vld1.32     {d27[0]}, [DUMMY]
+    vdup.8      d27, d27[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8888_n_8888_init, \
+    pixman_composite_add_8888_n_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
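+/*
+ * OUT_REVERSE with a solid mask: the source is first scaled by the
+ * mask alpha (d15), then the destination is multiplied by the inverse
+ * of the resulting alpha, dst = dst * (255 - src.a * mask.a / 255) / 255.
+ * Nothing is added back here; the OVER variants below reuse this head
+ * and tail and add the scaled source on top.
+ */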
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* solid mask is in d15 */
+
+    /* 'in' */
+    vmull.u8    q8, d15, d3
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q13, q8, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d3, q8, q13
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
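+/*
+ * OVER built from OUT_REVERSE: the shared head leaves the scaled
+ * source in q0/q1 and the dst * (1 - alpha) products in q8-q11, so
+ * the OVER tail only has to append two saturating vqadd instructions
+ * to the OUT_REVERSE tail.
+ */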
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+    add         DUMMY, sp, #48   /* ARGS_STACK_OFFSET + 8: solid mask argument */
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    vst3.8 {d0, d1, d2}, [DST_W]!
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+    vswp   d0, d2
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    vst4.8 {d0, d1, d2, d3}, [DST_W]!
+    fetch_src_pixblock
+    vswp   d0, d2
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+    veor   d3, d3, d3
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
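+/*
+ * Pack three 8-bit planes into r5g6b5: vshll #8 widens each channel
+ * so its bits occupy the top of a 16-bit lane, then 'vsri #5' and
+ * 'vsri #11' shift the other two channels into the green and blue
+ * fields, leaving packed pixels in q14.
+ */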
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q9, d2, #8
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    vshll.u8    q14, d0, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+        vshll.u8    q14, d0, #8
+    fetch_src_pixblock
+        vsri.u16    q14, q8, #5
+        vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8
+        vst1.16 {d28, d29}, [DST_W, :128]!
+    vshll.u8    q9, d2, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
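+/*
+ * The pixbuf format appears to be non-premultiplied with the opposite
+ * channel order: the head multiplies the three color channels by
+ * alpha, and the tail writes the rounded products back in reversed
+ * order (d28 gets the d2 product, d30 the d0 product) while vswp
+ * moves alpha into d31, so the vst4 stores premultiplied a8r8g8b8.
+ * The rpixbuf variant below keeps the channel order unchanged.
+ */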
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d28, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d28, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d30, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmvn.8      d7,  d15
+    vmull.u8    q6,  d15, d2
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vmull.u8    q8,  d7,  d4
+    vmull.u8    q9,  d7,  d5
+    vmull.u8    q13, d7,  d6
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q13, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q13
+    vqadd.u8    q0,  q0,  q14
+    vqadd.u8    q1,  q1,  q15
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_over_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_over_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_0565_n_0565_init, \
+    pixman_composite_over_0565_n_0565_cleanup, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmull.u8    q6,  d15, d2
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+    vqadd.u8    q0,  q0,  q2
+    vqadd.u8    q1,  q1,  q3
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* destination pixel data is in {d4, d5, d6, xx} */
+    vmvn.8      d24, d15 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vraddhn.u16 d0, q14, q8
+    vraddhn.u16 d1, q15, q9
+    vraddhn.u16 d2, q12, q10
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    15, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+    /* src is in d0 */
+    /* destination pixel data is in {d4, d5, d6, d7} */
+    vmvn.8      d1, d0 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d1, d4
+    vmull.u8    q9, d1, d5
+    vmull.u8    q10, d1, d6
+    vmull.u8    q11, d1, d7
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    /* 32bpp result is in {d28, d29, d30, d31} */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_8888_process_pixblock_head
+    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching,
+ * color format conversion, and interpolation as separate macros that can
+ * be used as basic building blocks for constructing bilinear scanline
+ * functions.
+ */
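+
+/*
+ * As a rough C sketch (illustrative only, not part of the assembly):
+ * assuming vertical weights wt and wb with wt + wb == 256 (broadcast in
+ * d28/d29) and a horizontal weight 0 <= wx < 256 taken from the top byte
+ * of the 16-bit x fraction (q15), the macros below compute, per channel:
+ *
+ *     #include <stdint.h>
+ *
+ *     static inline uint8_t
+ *     bilinear_channel (uint8_t tl, uint8_t tr,   // top left/right
+ *                       uint8_t bl, uint8_t br,   // bottom left/right
+ *                       unsigned wt, unsigned wb, unsigned wx)
+ *     {
+ *         unsigned l = tl * wt + bl * wb;   // vertical pass (16 bit)
+ *         unsigned r = tr * wt + br * wb;
+ *         return (l * (256 - wx) + r * wx) >> 16;   // horizontal pass
+ *     }
+ */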
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    vld1.32   {reg1}, [TMP1], STRIDE
+    vld1.32   {reg2}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    vld1.32   {reg2[0]}, [TMP1], STRIDE
+    vld1.32   {reg2[1]}, [TMP1]
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT, :128]!
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT, :64]!
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!
+.else
+    .error "bilinear_store_8888: unsupported numpix"
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp.u8 d0, d1
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT, :64]!
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT, :32]!
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT, :16]!
+.else
+    .error "bilinear_store_0565: unsupported numpix"
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    /* 5 cycles bubble */
+    vshrn.u32 d0, q0, #16
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0
+    /* 1 cycle bubble */
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    vmovn.u16 d0, q0
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE
+    vshll.u16 q0, d2, #8
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #8
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #8
+    vshll.u16 q2, d6, #8
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #8
+    pld       [TMP2, PF_OFFS]
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q10, #16
+    vshrn.u32 d4, q2, #16
+    vshrn.u32 d5, q8, #16
+    vshr.u16  q15, q12, #8
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    vadd.u16  q12, q12, q13
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.else
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4,          0
+.set BILINEAR_FLAG_UNROLL_8,          1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * The bilinear scanline scaler macro template takes the following arguments:
+ *  fname             - name of the function to generate
+ *  src_fmt           - source color format (8888 or 0565)
+ *  dst_fmt           - destination color format (8888 or 0565)
+ *  src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel
+ *                      in bytes
+ *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
+ *                      pixel in bytes
+ *  prefetch_distance - prefetch in the source image by that many pixels
+ *                      ahead
+ *  flags             - a combination of the BILINEAR_FLAG_* values
+ *                      defined above
+ */
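+
+/*
+ * For example, at the end of this file the template is instantiated as
+ * pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon with
+ * src_bpp_shift = 2 (32bpp source), dst_bpp_shift = 1 (16bpp destination),
+ * a prefetch distance of 28 pixels, and the BILINEAR_FLAG_UNROLL_8 and
+ * BILINEAR_FLAG_USE_ALL_NEON_REGS flags.
+ */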
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+                                       src_bpp_shift, dst_bpp_shift, \
+                                       prefetch_distance, flags
+
+pixman_asm_function fname
+    OUT       .req      r0
+    TOP       .req      r1
+    BOTTOM    .req      r2
+    WT        .req      r3
+    WB        .req      r4
+    X         .req      r5
+    UX        .req      r6
+    WIDTH     .req      ip
+    TMP1      .req      r3
+    TMP2      .req      r4
+    PF_OFFS   .req      r7
+    TMP3      .req      r8
+    TMP4      .req      r9
+    STRIDE    .req      r2
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WB, X, UX, WIDTH}
+    mul       PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub       STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f
+
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       0f
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13
+    vshr.u16  q15, q12, #8
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       0f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #2
+0:
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       0f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #4
+0:
+    subs      WIDTH, WIDTH, #8
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    blt       5f
+0:
+    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    bge       0b
+5:
+    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
+    subs      WIDTH, WIDTH, #4
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    blt       5f
+0:
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    bge       0b
+5:
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+1:
+/****************************************************/
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+    pop       {r4, r5, r6, r7, r8, r9}
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.endfunc
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+
+    vld1.32   {d22}, [TMP1], STRIDE
+    vld1.32   {d23}, [TMP1]
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    vmull.u8  q8, d22, d28
+    vmlal.u8  q8, d23, d29
+
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmull.u8  q9, d22, d28
+    vmlal.u8  q9, d23, d29
+
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+
+    vshll.u16 q0, d16, #8
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+    vmlal.u16 q1, d19, d31
+    vshr.u16  q15, q12, #8
+    vshll.u16 q2, d20, #8
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #8
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q1, #16
+    vshrn.u32 d4, q2, #16
+    vshr.u16  q15, q12, #8
+    vshrn.u32 d5, q3, #16
+    vmovn.u16 d6, q0
+    vmovn.u16 d7, q2
+    vadd.u16  q12, q12, q13
+    vst1.32   {d6, d7}, [OUT, :128]!
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d6, q0
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d7, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+        vst1.32   {d6, d7}, [OUT, :128]!
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+    vld1.32   {d20}, [TMP1], STRIDE
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+    vshll.u16 q0, d16, #8
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail
+    vmlal.u16 q1, d19, d31
+    vshr.u16  q15, q12, #8
+    vshll.u16 q2, d20, #8
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #8
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #16
+    vshrn.u32 d1, q1, #16
+    vshrn.u32 d4, q2, #16
+    vshr.u16  q15, q12, #8
+    vshrn.u32 d5, q3, #16
+    vmovn.u16 d10, q0
+    vmovn.u16 d11, q2
+    vadd.u16  q12, q12, q13
+
+    vuzp.u8   d8, d9
+    vuzp.u8   d10, d11
+    vuzp.u8   d9, d11
+    vuzp.u8   d8, d10
+    vshll.u8  q6, d9, #8
+    vshll.u8  q5, d10, #8
+    vshll.u8  q7, d8, #8
+    vsri.u16  q5, q6, #5
+    vsri.u16  q5, q7, #11
+    vst1.32   {d10, d11}, [OUT, :128]!
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #8
+            vuzp.u8 d8, d9
+        vshll.u16 q2, d20, #8
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #16
+        vshrn.u32 d1, q1, #16
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d10, q0
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d11, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+            vuzp.u8 d10, d11
+    vshll.u16 q1, d18, #8
+    vmlsl.u16 q1, d18, d31
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+            vuzp.u8 d9, d11
+        vshr.u16  q15, q12, #8
+        vshll.u16 q2, d20, #8
+            vuzp.u8 d8, d10
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #8
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+            vshll.u8  q6, d9, #8
+            vshll.u8  q5, d10, #8
+            vshll.u8  q7, d8, #8
+        vshrn.u32 d0, q0, #16
+            vsri.u16  q5, q6, #5
+        vshrn.u32 d1, q1, #16
+            vsri.u16  q5, q7, #11
+        vshrn.u32 d4, q2, #16
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #16
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #8
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0
+    vshll.u16 q0, d16, #8
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #8
+            vst1.32   {d10, d11}, [OUT, :128]!
+    vmlsl.u16 q1, d18, d31
+.endm
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+    2, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+    1, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+    1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
new file mode 100644 (file)
index 0000000..97adc6a
--- /dev/null
@@ -0,0 +1,1177 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions based on a common template.
+ * Any combination of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp or 32bpp color formats is supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encouraging the use of software pipelining for better instruction
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros which implement the basic code chunks responsible for pixel
+ * processing. See the 'pixman-arm-neon-asm.S' file for usage examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for the 'generate_composite_function' macro which are used
+ * to tune the generated functions' behavior.
+ */
+.set FLAG_DST_WRITEONLY,       0
+.set FLAG_DST_READWRITE,       1
+.set FLAG_DEINTERLEAVE_32BPP,  2
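+
+/* These flags are combined with bitwise OR; e.g. operations that read and
+ * write a 32bpp destination pass FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP
+ * (see the invocations in pixman-arm-neon-asm.S). */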
+
+/*
+ * Offset in the stack where the mask and source pointers/strides can be
+ * accessed from the 'init' macro. This is useful for special handling of
+ * a solid mask.
+ */
+.set ARGS_STACK_OFFSET,        40
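+
+/* For example, pixman_composite_over_0565_n_0565_init in
+ * pixman-arm-neon-asm.S loads the solid source color from
+ * sp + (ARGS_STACK_OFFSET + 8) and broadcasts its alpha byte across d15. */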
+
+/*
+ * Constants for selecting the preferred prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+    .elseif elem_size == 16
+        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+    .endif
+.elseif numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+    .endif
+.elseif numbytes == 1
+    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
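+
+/*
+ * In C terms (a sketch): VX is a 16.16 fixed-point source coordinate that
+ * advances by UNIT_X per destination pixel, so each fetch amounts to
+ *
+ *     pixel = src[vx >> 16];
+ *     vx += unit_x;
+ *
+ * with the 'asl #1' / 'asl #2' below scaling the integer part into a byte
+ * offset for 16bpp / 32bpp pixels respectively.
+ */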
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    sub     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[1]}, [TMP1, :32]
+    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+    vuzp.8 d&reg1, d&reg2
+.endm
+
+.macro vzip8 reg1, reg2
+    vzip.8 d&reg1, d&reg2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * the cache preload logic is mostly independent of the rest of the pixel
+ * processing code. It starts at the top left pixel, moves forward across
+ * pixels, and can jump across scanlines. The prefetch distance is handled
+ * in an 'incremental' way: it starts from 0 and advances to the optimal
+ * distance over time. After reaching the optimal prefetch distance, it is
+ * kept constant. There are some checks which prevent prefetching unneeded
+ * pixel lines below the image (but it can still prefetch a bit more data
+ * on the right side of the image - not a big issue, and it may actually
+ * be helpful when rendering text glyphs). An additional trick is the use
+ * of an LDR instruction for prefetch instead of PLD when moving to the
+ * next line; the point is that we have a high chance of getting a TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working
+ * with fully cached data). But in reality, due to having a separate
+ * pipeline and instruction queue for the NEON unit in ARM Cortex-A8,
+ * normal ARM code can execute simultaneously with NEON code and be
+ * completely shadowed by it. Thus we get no performance overhead at
+ * all (*). This looks like a very nice feature of Cortex-A8, if used
+ * wisely. We don't have a hardware prefetcher, but we can still
+ * implement some rather advanced prefetch logic in software for almost
+ * zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some
+ * trivial pixel processing like a simple copy. Anyway, having prefetch
+ * is a must when working with graphics data.
+ */
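+
+/*
+ * In rough C terms, each cache_preload step below behaves like this sketch
+ * (illustrative names; shown for the source image only - the same is done
+ * for the destination and mask):
+ *
+ *     pf_x += std_increment;              // track the processed pixels
+ *     if (pf_ctl & 0xf) {                 // still ramping up the distance?
+ *         pf_x += boost_increment;
+ *         pf_ctl--;
+ *     }
+ *     pld (pf_src + (pf_x << src_bpp_shift));
+ *     if (pf_x >= orig_w) {               // ran past the right edge
+ *         pf_x -= orig_w;
+ *         pf_ctl -= 0x10;                 // one scanline switch consumed
+ *         pf_src += src_stride << src_bpp_shift;
+ *         dummy = *pf_src;                // LDR touch instead of PLD:
+ *     }                                   // still effective after a TLB miss
+ */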
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+    a x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    PF tst PF_CTL, #0xF
+    PF addne PF_X, PF_X, #boost_increment
+    PF subne PF_CTL, PF_CTL, #1
+    PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+    PF subge PF_X, PF_X, ORIG_W
+    PF subges PF_CTL, PF_CTL, #0x10
+.if src_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endif
+.if dst_r_bpp != 0
+    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until the destination
+ * pointer is properly aligned (at a 16-byte boundary). When the
+ * destination buffer uses a 24bpp format, alignment cannot be usefully
+ * exploited, so this step is skipped (note the dst_w_bpp != 24 check
+ * below).
+ */
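+
+/*
+ * For example, a 32bpp destination starting 4 bytes past a 16-byte
+ * boundary is brought to the boundary by a 1-pixel and then a 2-pixel
+ * chunk from the 'lowbit' loop below, i.e. 3 leading pixels in total.
+ */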
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24
+    tst         DST_R, #0xF
+    beq         2f
+
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         1f
+.endif
+    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    add         DST_R, DST_R, #lowbit
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit
+    beq         1f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing operates on pixblock_size pixels
+ * at a time, anything smaller than this has to be loaded and stored in
+ * a special way. Loading and storing of pixel data is performed in such
+ * a way that we fill some 'slots' in the NEON registers (some slots
+ * naturally remain unused), then perform the compositing operation as
+ * usual. In the end, the data is taken from these 'slots' and saved to
+ * memory.
+ *
+ * cache_preload_flag - prefetch is suppressed if set to 0
+ * dst_aligned_flag   - selects whether the destination buffer
+ *                      is aligned
+ */
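+
+/*
+ * For example, with pixblock_size == 8 and 5 trailing pixels, the chunk
+ * loop below loads a 4-pixel and then a 1-pixel chunk (bits 2 and 0 of W),
+ * runs a single full pixblock through head/tail with the unused register
+ * slots ignored, and stores the result back as the same 4 + 1 chunks.
+ */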
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    tst         W, #(pixblock_size - 1)
+    beq         2f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+    pixld_src   chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
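+
+/*
+ * Example: with pixblock_size == 8 and 5 trailing pixels, the chunk
+ * loop above picks up a 4-pixel chunk (the 'W & 4' bit) and then a
+ * 1-pixel chunk (the 'W & 1' bit), filling adjacent register slots,
+ * so that a single head/tail compositing pass handles all 5 pixels.
+ */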
+
+/*
+ * Macro which performs all the operations needed to switch to the
+ * next scanline and start the next loop iteration, unless all the
+ * scanlines have already been processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+    ldrd        W, [sp] /* load W and H (width and height) from stack */
+.else
+    mov         W, ORIG_W
+.endif
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    sub         SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    sub         MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+    subs        H, H, #1
+    mov         DST_R, DST_W
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.endif
+    bge         start_of_loop_label
+.endm
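+
+/*
+ * In C terms the macro above computes, for the non-24bpp formats,
+ * approximately
+ *
+ *     dst += dst_stride - width;
+ *
+ * i.e. since the pointer has already advanced by 'width' pixels while
+ * the scanline was being processed, this leaves it at the start of
+ * the next scanline (all quantities in pixels here).
+ */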
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3     - reserved for loading source pixel data
+ * d4, d5, d6, d7     - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    push        {r4-r12, lr}        /* save all registers */
+
+/*
+ * Select the prefetch type for this function. If the prefetch distance
+ * is set to 0, prefetch is disabled entirely; if one of the color
+ * formats is 24bpp, SIMPLE prefetch has to be used instead of
+ * ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    H           .req        r1      /* height (is updated during processing) */
+    DST_W       .req        r2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req        r3      /* destination image stride */
+    SRC         .req        r4      /* source buffer pointer */
+    SRC_STRIDE  .req        r5      /* source image stride */
+    DST_R       .req        r6      /* destination buffer pointer for reads */
+
+    MASK        .req        r7      /* mask pointer */
+    MASK_STRIDE .req        r8      /* mask stride */
+
+    PF_CTL      .req        r9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req        r10     /* pixel index in a scanline for current */
+                                    /* prefetch position */
+    PF_SRC      .req        r11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req        r12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req        r14     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, the original width and height are
+ * kept on top of the stack (and the 'regs_shortage' variable is set to
+ * indicate this for the rest of the code). Even if there are enough
+ * registers, the allocation scheme may differ a bit depending on
+ * whether the source or the mask is unused.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+    ORIG_W      .req        r10     /* saved original width */
+    DUMMY       .req        r12     /* temporary register */
+    .set        regs_shortage, 0
+.elseif mask_bpp == 0
+    ORIG_W      .req        r7      /* saved original width */
+    DUMMY       .req        r8      /* temporary register */
+    .set        regs_shortage, 0
+.elseif src_bpp == 0
+    ORIG_W      .req        r4      /* saved original width */
+    DUMMY       .req        r5      /* temporary register */
+    .set        regs_shortage, 0
+.else
+    ORIG_W      .req        r1      /* saved original width */
+    DUMMY       .req        r1      /* temporary register */
+    .set        regs_shortage, 1
+.endif
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+    ldr         SRC, [sp, #40]
+.endif
+.if mask_bpp > 0
+    ldr         MASK, [sp, #48]
+.endif
+    PF mov      PF_X, #0
+.if src_bpp > 0
+    ldr         SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+    ldr         MASK_STRIDE, [sp, #52]
+.endif
+    mov         DST_R, DST_W
+
+.if src_bpp == 24
+    sub         SRC_STRIDE, SRC_STRIDE, W
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
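+
+/*
+ * For the 24bpp formats the stride (in bytes for these formats) is
+ * pre-biased here by -3*W (stride - W - 2*W == stride - 3*W): the
+ * scanline pointer advances by 3*W bytes while a scanline is
+ * processed, so simply adding the biased stride in
+ * advance_to_next_scanline lands on the start of the next scanline.
+ */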
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF mov      PF_CTL, H, lsl #4
+    PF add      PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+.if regs_shortage
+    push        {r0, r1}
+.endif
+    subs        H, H, #1
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.else
+    mov         ORIG_W, W
+.endif
+    blt         9f
+    cmp         W, #(pixblock_size * 2)
+    blt         8f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         2f
+1:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small
+ * width (less than pixblock_size * 2 pixels). In this case neither
+ * pipelining nor prefetch is used.
+ */
+8:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         1f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 8b
+9:
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
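+
+/*
+ * Usage sketch (the process_* macro names here are hypothetical): an
+ * over_8888_8888 style function with a 32bpp source, no mask, a 32bpp
+ * read-write destination, 8-pixel blocks and a prefetch distance of
+ * 10 might be instantiated as
+ *
+ *     generate_composite_function \
+ *         pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+ *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, 8, 10, \
+ *         default_init, default_cleanup, \
+ *         my_process_head, my_process_tail, my_process_tail_head
+ */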
+
+/*
+ * A simplified variant of the function generation template, for
+ * processing a single scanline (used to implement the pixman combine
+ * functions)
+ */
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        r0
+    DST_W       .req        r1
+    SRC         .req        r2
+    VX          .req        r3
+    UNIT_X      .req        ip
+    MASK        .req        lr
+    TMP1        .req        r4
+    TMP2        .req        r5
+    DST_R       .req        r6
+
+    .macro pixld_src x:vararg
+        pixld_s x
+    .endm
+
+    ldr         UNIT_X, [sp]
+    push        {r4-r6, lr}
+    .if mask_bpp != 0
+    ldr         MASK, [sp, #(16 + 4)]
+    .endif
+.else
+    /*
+     * Assign symbolic names to registers
+     */
+    W           .req        r0      /* width (is updated during processing) */
+    DST_W       .req        r1      /* destination buffer pointer for writes */
+    SRC         .req        r2      /* source buffer pointer */
+    DST_R       .req        ip      /* destination buffer pointer for reads */
+    MASK        .req        r3      /* mask pointer */
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
+    init
+    mov         DST_R, DST_W
+
+    cmp         W, #pixblock_size
+    blt         8f
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    subs        W, W, #pixblock_size
+    blt         7f
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         2f
+1:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+.else
+    bx          lr  /* exit */
+.endif
+8:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+
+.if use_nearest_scaling != 0
+    pop         {r4-r6, pc}  /* exit */
+
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+
+.else
+    bx          lr  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x
+.endm
+
+/* Default prologue/epilogue; nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores the
+ * d8-d15 registers (they need to be saved/restored by the callee
+ * according to the ABI). This is required if the code needs to use
+ * all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+    vpush       {d8-d15}
+.endm
+
+.macro default_cleanup_need_all_regs
+    vpop        {d8-d15}
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
+ * into a planar a8r8g8b8 format (with the a, r, g, b color components
+ * stored in the 64-bit registers out_a, out_r, out_g, out_b
+ * respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ *          value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8
+    vshrn.u16   out_g, in,    #3
+    vsli.u16    in,    in,    #5
+    vmov.u8     out_a, #255
+    vsri.u8     out_r, out_r, #5
+    vsri.u8     out_g, out_g, #6
+    vshrn.u16   out_b, in,    #2
+.endm
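+
+/*
+ * Scalar C sketch of the conversion above, for a single pixel p (the
+ * vsri steps replicate the top bits into the low bits so that the
+ * full 0..255 range is reached):
+ *
+ *     r = (p >> 11) & 0x1f;  r8 = (r << 3) | (r >> 2);
+ *     g = (p >> 5)  & 0x3f;  g8 = (g << 2) | (g >> 4);
+ *     b =  p        & 0x1f;  b8 = (b << 3) | (b >> 2);
+ *     a8 = 0xff;
+ */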
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8
+    vshrn.u16   out_g, in,    #3
+    vsli.u16    in,    in,    #5
+    vsri.u8     out_r, out_r, #5
+    vsri.u8     out_g, out_g, #6
+    vshrn.u16   out_b, in,    #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with the r, g, b color
+ * components in the 64-bit registers in_r, in_g, in_b respectively;
+ * alpha is discarded) into 8 r5g6b5 pixels packed in a 128-bit
+ * register (out). Requires two temporary 128-bit registers
+ * (tmp1, tmp2).
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+    vshll.u8    tmp1, in_g, #8
+    vshll.u8    out, in_r, #8
+    vshll.u8    tmp2, in_b, #8
+    vsri.u16    out, tmp1, #5
+    vsri.u16    out, tmp2, #11
+.endm
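+
+/*
+ * Scalar C sketch of the conversion above, with r, g, b the 8-bit
+ * color components of a single pixel:
+ *
+ *     out = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+ */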
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in the (out0, out1) register pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
+ * value of 'in' is lost.
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+    vshl.u16    out0, in,   #5  /* G top 6 bits */
+    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
+    vsri.u16    in,   in,   #5  /* R is ready in top bits */
+    vsri.u16    out0, out0, #6  /* G is ready in top bits */
+    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
+    vshr.u16    out1, in,   #8  /* R is in place */
+    vsri.u16    out0, tmp,  #8  /* G & B is in place */
+    vzip.u16    out0, out1      /* everything is in place */
+.endm
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
new file mode 100644 (file)
index 0000000..ca139de
--- /dev/null
@@ -0,0 +1,517 @@
+/*
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of ARM Ltd not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  ARM Ltd makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ian Rickards (ian.rickards@arm.com)
+ * Author:  Jonathan Morton (jonathan.morton@movial.com)
+ * Author:  Markku Vire (markku.vire@movial.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
+                                   uint8_t, 3, uint8_t, 3)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
+                                   uint8_t, 3, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
+                                   uint8_t, 3, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
+                                   uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888,
+                                   uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
+                                 uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
+                                 uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
+                                      uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
+                                     uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
+                                     uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
+                                     uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
+                                        uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
+                                        uint32_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
+                                        uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
+                                        uint16_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+                                           OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+                                           OVER, uint16_t, uint16_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+                                         uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+                                         uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+                                         uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
+                                         uint32_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
+                                            uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
+                                            uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC,
+                                            uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
+                                            uint32_t, uint32_t)
+
+void
+pixman_composite_src_n_8_asm_neon (int32_t   w,
+                                   int32_t   h,
+                                   uint8_t  *dst,
+                                   int32_t   dst_stride,
+                                   uint8_t   src);
+
+void
+pixman_composite_src_n_0565_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint16_t *dst,
+                                      int32_t   dst_stride,
+                                      uint16_t  src);
+
+void
+pixman_composite_src_n_8888_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint32_t *dst,
+                                      int32_t   dst_stride,
+                                      uint32_t  src);
+
+static pixman_bool_t
+pixman_fill_neon (uint32_t *bits,
+                  int       stride,
+                  int       bpp,
+                  int       x,
+                  int       y,
+                  int       width,
+                  int       height,
+                  uint32_t  _xor)
+{
+    /* stride is always a multiple of 32-bit units in pixman */
+    uint32_t byte_stride = stride * sizeof(uint32_t);
+
+    switch (bpp)
+    {
+    case 8:
+       pixman_composite_src_n_8_asm_neon (
+               width,
+               height,
+               (uint8_t *)(((char *) bits) + y * byte_stride + x),
+               byte_stride,
+               _xor & 0xff);
+       return TRUE;
+    case 16:
+       pixman_composite_src_n_0565_asm_neon (
+               width,
+               height,
+               (uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+               byte_stride / 2,
+               _xor & 0xffff);
+       return TRUE;
+    case 32:
+       pixman_composite_src_n_8888_asm_neon (
+               width,
+               height,
+               (uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+               byte_stride / 4,
+               _xor);
+       return TRUE;
+    default:
+       return FALSE;
+    }
+}
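+
+/*
+ * Usage sketch (hypothetical values): filling a 100x50 rectangle at
+ * (10, 20) of a 32bpp image with opaque red would be
+ *
+ *     pixman_fill_neon (bits, stride, 32, 10, 20, 100, 50, 0xffff0000);
+ *
+ * with 'stride' given in uint32_t units, as everywhere in pixman.
+ */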
+
+static pixman_bool_t
+pixman_blt_neon (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dest_x,
+                 int       dest_y,
+                 int       width,
+                 int       height)
+{
+    if (src_bpp != dst_bpp)
+       return FALSE;
+
+    switch (src_bpp)
+    {
+    case 16:
+       pixman_composite_src_0565_0565_asm_neon (
+               width, height,
+               (uint16_t *)(((char *) dst_bits) +
+               dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
+               (uint16_t *)(((char *) src_bits) +
+               src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+       return TRUE;
+    case 32:
+       pixman_composite_src_8888_8888_asm_neon (
+               width, height,
+               (uint32_t *)(((char *) dst_bits) +
+               dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+               (uint32_t *)(((char *) src_bits) +
+               src_y * src_stride * 4 + src_x * 4), src_stride);
+       return TRUE;
+    default:
+       return FALSE;
+    }
+}
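+
+/*
+ * Note: src_stride and dst_stride are in uint32_t units, so the '* 4'
+ * above converts them to bytes for the pointer arithmetic, while the
+ * '* 2' converts them to the uint16_t units expected by the 16bpp
+ * assembly routine.
+ */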
+
+static const pixman_fast_path_t arm_neon_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8,       neon_composite_src_n_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+arm_neon_blt (pixman_implementation_t *imp,
+              uint32_t *               src_bits,
+              uint32_t *               dst_bits,
+              int                      src_stride,
+              int                      dst_stride,
+              int                      src_bpp,
+              int                      dst_bpp,
+              int                      src_x,
+              int                      src_y,
+              int                      dest_x,
+              int                      dest_y,
+              int                      width,
+              int                      height)
+{
+    if (!pixman_blt_neon (
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+    {
+       return _pixman_implementation_blt (
+           imp->delegate,
+           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+           src_x, src_y, dest_x, dest_y, width, height);
+    }
+
+    return TRUE;
+}
+
+static pixman_bool_t
+arm_neon_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride,
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t xor)
+{
+    if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor))
+       return TRUE;
+
+    return _pixman_implementation_fill (
+       imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+#define BIND_COMBINE_U(name)                                             \
+void                                                                     \
+pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
+                                                  const uint32_t *dst,   \
+                                                  const uint32_t *src,   \
+                                                  const uint32_t *mask); \
+                                                                         \
+void                                                                     \
+pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \
+                                             const uint32_t *dst,        \
+                                             const uint32_t *src);       \
+                                                                         \
+static void                                                              \
+neon_combine_##name##_u (pixman_implementation_t *imp,                   \
+                         pixman_op_t              op,                    \
+                         uint32_t *               dest,                  \
+                         const uint32_t *         src,                   \
+                         const uint32_t *         mask,                  \
+                         int                      width)                 \
+{                                                                        \
+    if (mask)                                                            \
+       pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \
+                                                         src, mask);    \
+    else                                                                 \
+       pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \
+}
+
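+/*
+ * For example, BIND_COMBINE_U (over) below declares the scanline entry
+ * points pixman_composite_scanline_over_asm_neon and
+ * pixman_composite_scanline_over_mask_asm_neon, and defines the
+ * neon_combine_over_u wrapper that is registered as a combiner in
+ * _pixman_implementation_create_arm_neon () below.
+ */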
+BIND_COMBINE_U (over)
+BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+       _pixman_implementation_create (fallback, arm_neon_fast_paths);
+
+    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
+    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
+
+    imp->blt = arm_neon_blt;
+    imp->fill = arm_neon_fill;
+
+    return imp;
+}
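+
+/*
+ * A note on intended use (an assumption from the surrounding code, not
+ * spelled out here): _pixman_implementation_create_arm_neon () is
+ * expected to be called by the runtime CPU detection code
+ * (pixman-cpu.c) once NEON support has been detected, with the more
+ * generic implementation passed in as 'fallback'.
+ */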
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
new file mode 100644 (file)
index 0000000..8fe1b50
--- /dev/null
@@ -0,0 +1,439 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+       .text
+       .arch armv6
+       .object_arch armv4
+       .arm
+       .altmacro
+       .p2align 2
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+       .func fname
+       .global fname
+#ifdef __ELF__
+       .hidden fname
+       .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented-out
+ * functions in the 'pixman-arm-simd.c' file with the following
+ * optimization options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace the gcc-generated code with hand-tuned versions,
+ * because the code quality is not very good, and introduce symbolic
+ * register aliases for better readability and maintainability.
+ */
+
+pixman_asm_function pixman_composite_add_8_8_asm_armv6
+       push    {r4, r5, r6, r7, r8, r9, r10, r11}
+       mov     r10, r1
+       sub     sp, sp, #4
+       subs    r10, r10, #1
+       mov     r11, r0
+       mov     r8, r2
+       str     r3, [sp]
+       ldr     r7, [sp, #36]
+       bcc     0f
+6:     cmp     r11, #0
+       beq     1f
+       orr     r3, r8, r7
+       tst     r3, #3
+       beq     2f
+       mov     r1, r8
+       mov     r0, r7
+       mov     r12, r11
+       b       3f
+5:     tst     r3, #3
+       beq     4f
+3:     ldrb    r2, [r0], #1
+       subs    r12, r12, #1
+       ldrb    r3, [r1]
+       uqadd8  r3, r2, r3
+       strb    r3, [r1], #1
+       orr     r3, r1, r0
+       bne     5b
+1:     ldr     r3, [sp]
+       add     r8, r8, r3
+       ldr     r3, [sp, #40]
+       add     r7, r7, r3
+10:    subs    r10, r10, #1
+       bcs     6b
+0:     add     sp, sp, #4
+       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
+       bx      lr
+2:     mov     r12, r11
+       mov     r1, r8
+       mov     r0, r7
+4:     cmp     r12, #3
+       subgt   r6, r12, #4
+       movgt   r9, r12
+       lsrgt   r5, r6, #2
+       addgt   r3, r5, #1
+       movgt   r12, #0
+       lslgt   r4, r3, #2
+       ble     7f
+8:     ldr     r3, [r0, r12]
+       ldr     r2, [r1, r12]
+       uqadd8  r3, r3, r2
+       str     r3, [r1, r12]
+       add     r12, r12, #4
+       cmp     r12, r4
+       bne     8b
+       sub     r3, r9, #4
+       bic     r3, r3, #3
+       add     r3, r3, #4
+       subs    r12, r6, r5, lsl #2
+       add     r1, r1, r3
+       add     r0, r0, r3
+       beq     1b
+7:     mov     r4, #0
+9:     ldrb    r3, [r1, r4]
+       ldrb    r2, [r0, r4]
+       uqadd8  r3, r2, r3
+       strb    r3, [r1, r4]
+       add     r4, r4, #1
+       cmp     r4, r12
+       bne     9b
+       ldr     r3, [sp]
+       add     r8, r8, r3
+       ldr     r3, [sp, #40]
+       add     r7, r7, r3
+       b       10b
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
+       push    {r4, r5, r6, r7, r8, r9, r10, r11}
+       sub     sp, sp, #20
+       cmp     r1, #0
+       mov     r12, r2
+       str     r1, [sp, #12]
+       str     r0, [sp, #16]
+       ldr     r2, [sp, #52]
+       beq     0f
+       lsl     r3, r3, #2
+       str     r3, [sp]
+       ldr     r3, [sp, #56]
+       mov     r10, #0
+       lsl     r3, r3, #2
+       str     r3, [sp, #8]
+       mov     r11, r3
+       b       1f
+6:     ldr     r11, [sp, #8]
+1:     ldr     r9, [sp]
+       mov     r0, r12
+       add     r12, r12, r9
+       mov     r1, r2
+       str     r12, [sp, #4]
+       add     r2, r2, r11
+       ldr     r12, [sp, #16]
+       ldr     r3, =0x00800080
+       ldr     r9, =0xff00ff00
+       mov     r11, #255
+       cmp     r12, #0
+       beq     4f
+5:     ldr     r5, [r1], #4
+       ldr     r4, [r0]
+       sub     r8, r11, r5, lsr #24
+       uxtb16  r6, r4
+       uxtb16  r7, r4, ror #8
+       mla     r6, r6, r8, r3
+       mla     r7, r7, r8, r3
+       uxtab16 r6, r6, r6, ror #8
+       uxtab16 r7, r7, r7, ror #8
+       and     r7, r7, r9
+       uxtab16 r6, r7, r6, ror #8
+       uqadd8  r5, r6, r5
+       str     r5, [r0], #4
+       subs    r12, r12, #1
+       bne     5b
+4:     ldr     r3, [sp, #12]
+       add     r10, r10, #1
+       cmp     r10, r3
+       ldr     r12, [sp, #4]
+       bne     6b
+0:     add     sp, sp, #20
+       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
+       bx      lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
+       push    {r4, r5, r6, r7, r8, r9, r10, r11}
+       sub     sp, sp, #28
+       cmp     r1, #0
+       str     r1, [sp, #12]
+       ldrb    r1, [sp, #71]
+       mov     r12, r2
+       str     r0, [sp, #16]
+       ldr     r2, [sp, #60]
+       str     r1, [sp, #24]
+       beq     0f
+       lsl     r3, r3, #2
+       str     r3, [sp, #20]
+       ldr     r3, [sp, #64]
+       mov     r10, #0
+       lsl     r3, r3, #2
+       str     r3, [sp, #8]
+       mov     r11, r3
+       b       1f
+5:     ldr     r11, [sp, #8]
+1:     ldr     r4, [sp, #20]
+       mov     r0, r12
+       mov     r1, r2
+       add     r12, r12, r4
+       add     r2, r2, r11
+       str     r12, [sp]
+       str     r2, [sp, #4]
+       ldr     r12, [sp, #16]
+       ldr     r2, =0x00800080
+       ldr     r3, [sp, #24]
+       mov     r11, #255
+       cmp     r12, #0
+       beq     3f
+4:     ldr     r5, [r1], #4
+       ldr     r4, [r0]
+       uxtb16  r6, r5
+       uxtb16  r7, r5, ror #8
+       mla     r6, r6, r3, r2
+       mla     r7, r7, r3, r2
+       uxtab16 r6, r6, r6, ror #8
+       uxtab16 r7, r7, r7, ror #8
+       uxtb16  r6, r6, ror #8
+       uxtb16  r7, r7, ror #8
+       orr     r5, r6, r7, lsl #8
+       uxtb16  r6, r4
+       uxtb16  r7, r4, ror #8
+       sub     r8, r11, r5, lsr #24
+       mla     r6, r6, r8, r2
+       mla     r7, r7, r8, r2
+       uxtab16 r6, r6, r6, ror #8
+       uxtab16 r7, r7, r7, ror #8
+       uxtb16  r6, r6, ror #8
+       uxtb16  r7, r7, ror #8
+       orr     r6, r6, r7, lsl #8
+       uqadd8  r5, r6, r5
+       str     r5, [r0], #4
+       subs    r12, r12, #1
+       bne     4b
+3:     ldr     r1, [sp, #12]
+       add     r10, r10, #1
+       cmp     r10, r1
+       ldr     r12, [sp]
+       ldr     r2, [sp, #4]
+       bne     5b
+0:     add     sp, sp, #28
+       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
+       bx      lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
+       push    {r4, r5, r6, r7, r8, r9, r10, r11}
+       sub     sp, sp, #28
+       cmp     r1, #0
+       ldr     r9, [sp, #60]
+       str     r1, [sp, #12]
+       bic     r1, r9, #-16777216
+       str     r1, [sp, #20]
+       mov     r12, r2
+       lsr     r1, r9, #8
+       ldr     r2, [sp, #20]
+       bic     r1, r1, #-16777216
+       bic     r2, r2, #65280
+       bic     r1, r1, #65280
+       str     r2, [sp, #20]
+       str     r0, [sp, #16]
+       str     r1, [sp, #4]
+       ldr     r2, [sp, #68]
+       beq     0f
+       lsl     r3, r3, #2
+       str     r3, [sp, #24]
+       mov     r0, #0
+       b       1f
+5:     ldr     r3, [sp, #24]
+1:     ldr     r4, [sp, #72]
+       mov     r10, r12
+       mov     r1, r2
+       add     r12, r12, r3
+       add     r2, r2, r4
+       str     r12, [sp, #8]
+       str     r2, [sp]
+       ldr     r12, [sp, #16]
+       ldr     r11, =0x00800080
+       ldr     r2, [sp, #4]
+       ldr     r3, [sp, #20]
+       cmp     r12, #0
+       beq     3f
+4:     ldrb    r5, [r1], #1
+       ldr     r4, [r10]
+       mla     r6, r3, r5, r11
+       mla     r7, r2, r5, r11
+       uxtab16 r6, r6, r6, ror #8
+       uxtab16 r7, r7, r7, ror #8
+       uxtb16  r6, r6, ror #8
+       uxtb16  r7, r7, ror #8
+       orr     r5, r6, r7, lsl #8
+       uxtb16  r6, r4
+       uxtb16  r7, r4, ror #8
+       mvn     r8, r5
+       lsr     r8, r8, #24
+       mla     r6, r6, r8, r11
+       mla     r7, r7, r8, r11
+       uxtab16 r6, r6, r6, ror #8
+       uxtab16 r7, r7, r7, ror #8
+       uxtb16  r6, r6, ror #8
+       uxtb16  r7, r7, ror #8
+       orr     r6, r6, r7, lsl #8
+       uqadd8  r5, r6, r5
+       str     r5, [r10], #4
+       subs    r12, r12, #1
+       bne     4b
+3:     ldr     r4, [sp, #12]
+       add     r0, r0, #1
+       cmp     r0, r4
+       ldr     r12, [sp, #8]
+       ldr     r2, [sp]
+       bne     5b
+0:     add     sp, sp, #28
+       pop     {r4, r5, r6, r7, r8, r9, r10, r11}
+       bx      lr
+.endfunc
+
+/*
+ * Note: This code uses only armv5te instructions (not even armv6), but
+ *       is scheduled for the ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes with the STM instruction.
+ *
+ * The nearest scanline scaler macro template takes the following arguments:
+ *  fname                     - name of the function to generate
+ *  bpp_shift                 - (1 << bpp_shift) is the size of a pixel in bytes
+ *  t                         - type suffix for LDR/STR instructions
+ *  prefetch_distance         - prefetch in the source image by that many
+ *                              pixels ahead
+ *  prefetch_braking_distance - stop prefetching when that many pixels
+ *                              remain before the end of the scanline
+ */
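+/*
+ * For orientation, a hedged C model of the 32bpp variant generated below
+ * (the function and parameter names here are illustrative only, not part
+ * of pixman):
+ *
+ *     void scale_scanline_model (uint32_t       *dst,
+ *                                const uint32_t *src,
+ *                                int             w,
+ *                                uint32_t        vx,
+ *                                uint32_t        unit_x)
+ *     {
+ *         // vx and unit_x are 16.16 fixed-point coordinate and step
+ *         while (w--)
+ *         {
+ *             *dst++ = src[vx >> 16]; // nearest-neighbour source pick
+ *             vx += unit_x;           // advance in the source image
+ *         }
+ *     }
+ *
+ * The assembly below additionally folds the (vx >> 16) << bpp_shift byte
+ * offset into a single AND of VX with VXMASK, and issues PLD prefetches
+ * prefetch_distance pixels ahead of the current position.
+ */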
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
+                                      prefetch_distance,        \
+                                      prefetch_braking_distance
+
+pixman_asm_function fname
+       W       .req    r0
+       DST     .req    r1
+       SRC     .req    r2
+       VX      .req    r3
+       UNIT_X  .req    ip
+       TMP1    .req    r4
+       TMP2    .req    r5
+       VXMASK  .req    r6
+       PF_OFFS .req    r7
+
+       ldr     UNIT_X, [sp]
+       push    {r4, r5, r6, r7}
+       mvn     VXMASK, #((1 << bpp_shift) - 1)
+
+       /* define helper macro */
+       .macro  scale_2_pixels
+               ldr&t   TMP1, [SRC, TMP1]
+               and     TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
+               add     VX, VX, UNIT_X
+               str&t   TMP1, [DST], #(1 << bpp_shift)
+
+               ldr&t   TMP2, [SRC, TMP2]
+               and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
+               add     VX, VX, UNIT_X
+               str&t   TMP2, [DST], #(1 << bpp_shift)
+       .endm
+
+       /* now do the scaling */
+       and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
+       add     VX, VX, UNIT_X
+       subs    W, W, #(8 + prefetch_braking_distance)
+       blt     2f
+       /* calculate prefetch offset */
+       mov     PF_OFFS, #prefetch_distance
+       mla     PF_OFFS, UNIT_X, PF_OFFS, VX
+1:     /* main loop, process 8 pixels per iteration with prefetch */
+       subs    W, W, #8
+       add     PF_OFFS, UNIT_X, lsl #3
+       scale_2_pixels
+       scale_2_pixels
+       scale_2_pixels
+       scale_2_pixels
+       pld     [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+       bge     1b
+2:
+       subs    W, W, #(4 - 8 - prefetch_braking_distance)
+       blt     2f
+1:     /* process the remaining pixels */
+       scale_2_pixels
+       scale_2_pixels
+       subs    W, W, #4
+       bge     1b
+2:
+       tst     W, #2
+       beq     2f
+       scale_2_pixels
+2:
+       tst     W, #1
+       ldrne&t TMP1, [SRC, TMP1]
+       strne&t TMP1, [DST]
+       /* cleanup helper macro */
+       .purgem scale_2_pixels
+       .unreq  DST
+       .unreq  SRC
+       .unreq  W
+       .unreq  VX
+       .unreq  UNIT_X
+       .unreq  TMP1
+       .unreq  TMP2
+       .unreq  VXMASK
+       .unreq  PF_OFFS
+       /* return */
+       pop     {r4, r5, r6, r7}
+       bx      lr
+.endfunc
+.endm
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
new file mode 100644
index 0000000..3d19bfa
--- /dev/null
+++ b/pixman/pixman-arm-simd.c
@@ -0,0 +1,432 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+#include "pixman-inlines.h"
+
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8_8_asm_armv6 (int32_t  width,
+                                   int32_t  height,
+                                   uint8_t *dst_line,
+                                   int32_t  dst_stride,
+                                   uint8_t *src_line,
+                                   int32_t  src_stride)
+{
+    uint8_t *dst, *src;
+    int32_t w;
+    uint8_t s, d;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       /* Ensure both src and dst are properly aligned before doing 32-bit
+        * reads; we'll stay in this loop for the whole scanline if src and
+        * dst have differing alignments.
+        */
+       while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
+       {
+           s = *src;
+           d = *dst;
+           asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+           *dst = d;
+
+           dst++;
+           src++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           asm ("uqadd8 %0, %1, %2"
+                : "=r" (*(uint32_t*)dst)
+                : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
+           dst += 4;
+           src += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           s = *src;
+           d = *dst;
+           asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s));
+           *dst = d;
+
+           dst++;
+           src++;
+           w--;
+       }
+    }
+
+}
+
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
+                                           int32_t   height,
+                                           uint32_t *dst_line,
+                                           int32_t   dst_stride,
+                                           uint32_t *src_line,
+                                           int32_t   src_stride)
+{
+    uint32_t    *dst;
+    uint32_t    *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t upper_component_mask = 0xff00ff00;
+    uint32_t alpha_mask = 0xff;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+/* #define inner_branch */
+       asm volatile (
+           "cmp %[w], #0\n\t"
+           "beq 2f\n\t"
+           "1:\n\t"
+           /* load src */
+           "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+           /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+            * The 0x0 case also allows us to avoid an unnecessary data write,
+            * which is more valuable, so we only check for that.
+            */
+           "cmp r5, #0\n\t"
+           "beq 3f\n\t"
+
+           /* = 255 - alpha */
+           "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+           "ldr r4, [%[dest]] \n\t"
+
+#else
+           "ldr r4, [%[dest]] \n\t"
+
+           /* = 255 - alpha */
+           "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+#endif
+           "uxtb16 r6, r4\n\t"
+           "uxtb16 r7, r4, ror #8\n\t"
+
+           /* multiply by 257 and divide by 65536 */
+           "mla r6, r6, r8, %[component_half]\n\t"
+           "mla r7, r7, r8, %[component_half]\n\t"
+
+           "uxtab16 r6, r6, r6, ror #8\n\t"
+           "uxtab16 r7, r7, r7, ror #8\n\t"
+
+           /* recombine the 0xff00ff00 bytes of r6 and r7 */
+           "and r7, r7, %[upper_component_mask]\n\t"
+           "uxtab16 r6, r7, r6, ror #8\n\t"
+
+           "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+           "3:\n\t"
+
+#endif
+           "str r5, [%[dest]], #4\n\t"
+           /* increment counter and jmp to top */
+           "subs       %[w], %[w], #1\n\t"
+           "bne        1b\n\t"
+           "2:\n\t"
+           : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+           : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
+             [alpha_mask] "r" (alpha_mask)
+           : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
+           );
+    }
+}
+
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
+                                             int32_t   height,
+                                             uint32_t *dst_line,
+                                             int32_t   dst_stride,
+                                             uint32_t *src_line,
+                                             int32_t   src_stride,
+                                             uint32_t  mask)
+{
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
+    uint32_t component_half = 0x800080;
+    uint32_t alpha_mask = 0xff;
+
+    mask = (mask) >> 24;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+/* #define inner_branch */
+       asm volatile (
+           "cmp %[w], #0\n\t"
+           "beq 2f\n\t"
+           "1:\n\t"
+           /* load src */
+           "ldr r5, [%[src]], #4\n\t"
+#ifdef inner_branch
+           /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+            * The 0x0 case also allows us to avoid an unnecessary data write,
+            * which is more valuable, so we only check for that.
+            */
+           "cmp r5, #0\n\t"
+           "beq 3f\n\t"
+
+#endif
+           "ldr r4, [%[dest]] \n\t"
+
+           "uxtb16 r6, r5\n\t"
+           "uxtb16 r7, r5, ror #8\n\t"
+
+           /* multiply by the mask alpha (mask_alpha) then by 257 and divide by 65536 */
+           "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
+           "mla r7, r7, %[mask_alpha], %[component_half]\n\t"
+
+           "uxtab16 r6, r6, r6, ror #8\n\t"
+           "uxtab16 r7, r7, r7, ror #8\n\t"
+
+           "uxtb16 r6, r6, ror #8\n\t"
+           "uxtb16 r7, r7, ror #8\n\t"
+
+           /* recombine */
+           "orr r5, r6, r7, lsl #8\n\t"
+
+           "uxtb16 r6, r4\n\t"
+           "uxtb16 r7, r4, ror #8\n\t"
+
+           /* 255 - alpha */
+           "sub r8, %[alpha_mask], r5, lsr #24\n\t"
+
+           /* multiply by inverse alpha (r8) then by 257 and divide by 65536 */
+           "mla r6, r6, r8, %[component_half]\n\t"
+           "mla r7, r7, r8, %[component_half]\n\t"
+
+           "uxtab16 r6, r6, r6, ror #8\n\t"
+           "uxtab16 r7, r7, r7, ror #8\n\t"
+
+           "uxtb16 r6, r6, ror #8\n\t"
+           "uxtb16 r7, r7, ror #8\n\t"
+
+           /* recombine */
+           "orr r6, r6, r7, lsl #8\n\t"
+
+           "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+           "3:\n\t"
+
+#endif
+           "str r5, [%[dest]], #4\n\t"
+           /* increment counter and jmp to top */
+           "subs       %[w], %[w], #1\n\t"
+           "bne        1b\n\t"
+           "2:\n\t"
+           : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
+           : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
+             [alpha_mask] "r" (alpha_mask)
+           : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
+           );
+    }
+}
+
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
+                                          int32_t   height,
+                                          uint32_t *dst_line,
+                                          int32_t   dst_stride,
+                                          uint32_t  src,
+                                          int32_t   unused,
+                                          uint8_t  *mask_line,
+                                          int32_t   mask_stride)
+{
+    uint32_t  srca;
+    uint32_t *dst;
+    uint8_t  *mask;
+    int32_t w;
+
+    srca = src >> 24;
+
+    uint32_t component_mask = 0xff00ff;
+    uint32_t component_half = 0x800080;
+
+    uint32_t src_hi = (src >> 8) & component_mask;
+    uint32_t src_lo = src & component_mask;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+/* #define inner_branch */
+       asm volatile (
+           "cmp %[w], #0\n\t"
+           "beq 2f\n\t"
+           "1:\n\t"
+           /* load mask */
+           "ldrb r5, [%[mask]], #1\n\t"
+#ifdef inner_branch
+           /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
+            * The 0x0 case also allows us to avoid an unnecessary data write,
+            * which is more valuable, so we only check for that.
+            */
+           "cmp r5, #0\n\t"
+           "beq 3f\n\t"
+
+#endif
+           "ldr r4, [%[dest]] \n\t"
+
+           /* multiply by the mask byte (r5) then by 257 and divide by 65536 */
+           "mla r6, %[src_lo], r5, %[component_half]\n\t"
+           "mla r7, %[src_hi], r5, %[component_half]\n\t"
+
+           "uxtab16 r6, r6, r6, ror #8\n\t"
+           "uxtab16 r7, r7, r7, ror #8\n\t"
+
+           "uxtb16 r6, r6, ror #8\n\t"
+           "uxtb16 r7, r7, ror #8\n\t"
+
+           /* recombine */
+           "orr r5, r6, r7, lsl #8\n\t"
+
+           "uxtb16 r6, r4\n\t"
+           "uxtb16 r7, r4, ror #8\n\t"
+
+           /* we could simplify this to use 'sub' if we were
+            * willing to give up a register for alpha_mask
+            */
+           "mvn r8, r5\n\t"
+           "mov r8, r8, lsr #24\n\t"
+
+           /* multiply by inverse alpha (r8) then by 257 and divide by 65536 */
+           "mla r6, r6, r8, %[component_half]\n\t"
+           "mla r7, r7, r8, %[component_half]\n\t"
+
+           "uxtab16 r6, r6, r6, ror #8\n\t"
+           "uxtab16 r7, r7, r7, ror #8\n\t"
+
+           "uxtb16 r6, r6, ror #8\n\t"
+           "uxtb16 r7, r7, ror #8\n\t"
+
+           /* recombine */
+           "orr r6, r6, r7, lsl #8\n\t"
+
+           "uqadd8 r5, r6, r5\n\t"
+
+#ifdef inner_branch
+           "3:\n\t"
+
+#endif
+           "str r5, [%[dest]], #4\n\t"
+           /* increment counter and jmp to top */
+           "subs       %[w], %[w], #1\n\t"
+           "bne        1b\n\t"
+           "2:\n\t"
+           : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
+           : [component_half] "r" (component_half),
+             [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
+           : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
+    }
+}
+
+#endif
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+                                        uint32_t, uint32_t)
+
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+
+    return imp;
+}
diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
new file mode 100644
index 0000000..99c0dfe
--- /dev/null
+++ b/pixman/pixman-bits-image.c
@@ -0,0 +1,1511 @@
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+/*
+ * By default, just evaluate the image at 32bpp and expand.  Individual image
+ * types can plug in a better scanline getter if they want to. For example
+ * we  could produce smoother gradients by evaluating them at higher color
+ * depth, but that's a project for the future.
+ */
+static void
+_pixman_image_get_scanline_generic_64 (pixman_image_t * image,
+                                       int              x,
+                                       int              y,
+                                       int              width,
+                                       uint32_t *       buffer,
+                                       const uint32_t * mask)
+{
+    uint32_t *mask8 = NULL;
+
+    /* Contract the mask image, if one exists, so that the 32-bit fetch
+     * function can use it.
+     */
+    if (mask)
+    {
+       mask8 = pixman_malloc_ab (width, sizeof(uint32_t));
+       if (!mask8)
+           return;
+
+       pixman_contract (mask8, (uint64_t *)mask, width);
+    }
+
+    /* Fetch the source image into the first half of buffer. */
+    image->bits.get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8);
+
+    /* Expand from 32bpp to 64bpp in place. */
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width);
+
+    free (mask8);
+}
+
+/* Fetch functions */
+
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+                     int x, int y, pixman_bool_t check_bounds)
+{
+    if (check_bounds &&
+       (x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+       return 0;
+    }
+
+    return image->fetch_pixel_32 (image, x, y);
+}
+
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+                                 int x, int y, pixman_bool_t check_bounds);
+
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t   *image,
+                               pixman_fixed_t  x,
+                               pixman_fixed_t  y,
+                               get_pixel_t     get_pixel)
+{
+    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+       repeat (image->common.repeat, &x0, image->width);
+       repeat (image->common.repeat, &y0, image->height);
+
+       return get_pixel (image, x0, y0, FALSE);
+    }
+    else
+    {
+       return get_pixel (image, x0, y0, TRUE);
+    }
+}
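+/* A worked note on the rounding above: subtracting pixman_fixed_e (the
+ * smallest representable fixed-point increment) before truncating makes
+ * a coordinate that lands exactly on a pixel boundary round down; e.g.
+ * x = 1.0 samples pixel 0 (center at 0.5) rather than pixel 1 (center
+ * at 1.5), breaking the tie between the two equally near centers.
+ */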
+
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t   *image,
+                                pixman_fixed_t  x,
+                                pixman_fixed_t  y,
+                                get_pixel_t     get_pixel)
+{
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int x1, y1, x2, y2;
+    uint32_t tl, tr, bl, br;
+    int32_t distx, disty;
+
+    x1 = x - pixman_fixed_1 / 2;
+    y1 = y - pixman_fixed_1 / 2;
+
+    distx = (x1 >> 8) & 0xff;
+    disty = (y1 >> 8) & 0xff;
+
+    x1 = pixman_fixed_to_int (x1);
+    y1 = pixman_fixed_to_int (y1);
+    x2 = x1 + 1;
+    y2 = y1 + 1;
+
+    if (repeat_mode != PIXMAN_REPEAT_NONE)
+    {
+       repeat (repeat_mode, &x1, width);
+       repeat (repeat_mode, &y1, height);
+       repeat (repeat_mode, &x2, width);
+       repeat (repeat_mode, &y2, height);
+
+       tl = get_pixel (image, x1, y1, FALSE);
+       bl = get_pixel (image, x1, y2, FALSE);
+       tr = get_pixel (image, x2, y1, FALSE);
+       br = get_pixel (image, x2, y2, FALSE);
+    }
+    else
+    {
+       tl = get_pixel (image, x1, y1, TRUE);
+       tr = get_pixel (image, x2, y1, TRUE);
+       bl = get_pixel (image, x1, y2, TRUE);
+       br = get_pixel (image, x2, y2, TRUE);
+    }
+
+    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
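+/* A worked example of the weight extraction above (numbers illustrative):
+ * with x = 2.75 in 16.16 fixed point (0x2c000), x1 = x - 0x8000 = 0x24000,
+ * so pixman_fixed_to_int (x1) = 2 and distx = (x1 >> 8) & 0xff = 0x40,
+ * i.e. a weight of 64/256 = 0.25 toward the right-hand pixels, matching
+ * the distance of x from the center of pixel 2 at 2.5.
+ */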
+
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+                                         int              offset,
+                                         int              line,
+                                         int              width,
+                                         uint32_t *       buffer,
+                                         const uint32_t * mask)
+{
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+       return;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = (y >> 8) & 0xff;
+
+    /* Load the pointers to the first and second lines from the source
+     * image that the bilinear code must read.
+     *
+     * The main trick in this code is checking whether either of those
+     * lines lies outside of the image.
+     *
+     * When a line is found to be outside, its pointer is redirected to
+     * a dummy area filled with zeros, and the increment variables used
+     * inside the loop are set to zero so that the pointer is guaranteed
+     * never to change.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+       top_row = zero;
+       x_top = 0;
+       ux_top = 0;
+    }
+    else
+    {
+       top_row = bits->bits + y1 * bits->rowstride;
+       x_top = x;
+       ux_top = ux;
+    }
+
+    if (y2 < 0 || y2 >= bits->height)
+    {
+       bottom_row = zero;
+       x_bottom = 0;
+       ux_bottom = 0;
+    }
+    else
+    {
+       bottom_row = bits->bits + y2 * bits->rowstride;
+       x_bottom = x;
+       ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mask in
+     * each loop iteration, verify this only once and prepare the
+     * variables so that the code inside the loop stays smaller.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If we have a mask, prepare the variables needed to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+       memset (buffer, 0, width * sizeof (uint32_t));
+       return;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+       if (top_row == zero)
+       {
+           top_mask = 0;
+           bottom_mask = 0xff000000;
+       }
+       else if (bottom_row == zero)
+       {
+           top_mask = 0xff000000;
+           bottom_mask = 0;
+       }
+       else
+       {
+           top_mask = 0xff000000;
+           bottom_mask = 0xff000000;
+       }
+    }
+    else
+    {
+       top_mask = 0;
+       bottom_mask = 0;
+    }
+
+    end = buffer + width;
+
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+       *buffer++ = 0;
+       x += ux;
+       x_top += ux_top;
+       x_bottom += ux_bottom;
+       mask += mask_inc;
+    }
+
+    /* Left edge */
+    while (buffer < end && x < 0)
+    {
+       uint32_t tr, br;
+       int32_t distx;
+
+       tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+       br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+       distx = (x >> 8) & 0xff;
+
+       *buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+       x += ux;
+       x_top += ux_top;
+       x_bottom += ux_bottom;
+       mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+       if (*mask)
+       {
+           uint32_t tl, tr, bl, br;
+           int32_t distx;
+
+           tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+           tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+           bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+           br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+           distx = (x >> 8) & 0xff;
+
+           *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+       }
+
+       buffer++;
+       x += ux;
+       x_top += ux_top;
+       x_bottom += ux_bottom;
+       mask += mask_inc;
+    }
+
+    /* Right edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+       if (*mask)
+       {
+           uint32_t tl, bl;
+           int32_t distx;
+
+           tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+           bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+           distx = (x >> 8) & 0xff;
+
+           *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+       }
+
+       buffer++;
+       x += ux;
+       x_top += ux_top;
+       x_bottom += ux_bottom;
+       mask += mask_inc;
+    }
+
+    /* Zero fill to the right of the image */
+    while (buffer < end)
+       *buffer++ = 0;
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t   *image,
+                                   pixman_fixed_t  x,
+                                   pixman_fixed_t  y,
+                                   get_pixel_t     get_pixel)
+{
+    pixman_fixed_t *params = image->common.filter_params;
+    int x_off = (params[0] - pixman_fixed_1) >> 1;
+    int y_off = (params[1] - pixman_fixed_1) >> 1;
+    int32_t cwidth = pixman_fixed_to_int (params[0]);
+    int32_t cheight = pixman_fixed_to_int (params[1]);
+    int32_t srtot, sgtot, sbtot, satot;
+    int32_t i, j, x1, x2, y1, y2;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+
+    params += 2;
+
+    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+    x2 = x1 + cwidth;
+    y2 = y1 + cheight;
+
+    srtot = sgtot = sbtot = satot = 0;
+
+    for (i = y1; i < y2; ++i)
+    {
+       for (j = x1; j < x2; ++j)
+       {
+           int rx = j;
+           int ry = i;
+
+           pixman_fixed_t f = *params;
+
+           if (f)
+           {
+               uint32_t pixel;
+
+               if (repeat_mode != PIXMAN_REPEAT_NONE)
+               {
+                   repeat (repeat_mode, &rx, width);
+                   repeat (repeat_mode, &ry, height);
+
+                   pixel = get_pixel (image, rx, ry, FALSE);
+               }
+               else
+               {
+                   pixel = get_pixel (image, rx, ry, TRUE);
+               }
+
+               srtot += RED_8 (pixel) * f;
+               sgtot += GREEN_8 (pixel) * f;
+               sbtot += BLUE_8 (pixel) * f;
+               satot += ALPHA_8 (pixel) * f;
+           }
+
+           params++;
+       }
+    }
+
+    satot >>= 16;
+    srtot >>= 16;
+    sgtot >>= 16;
+    sbtot >>= 16;
+
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
+
+    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+}
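+/* A hedged usage sketch (not code from this file): the filter_params
+ * layout consumed above is { width, height, weights[width * height] },
+ * all pixman_fixed_t, with the weights normally summing to pixman_fixed_1
+ * so the >> 16 above leaves overall brightness unchanged. A 3x3 box blur
+ * could be set up as:
+ *
+ *     pixman_fixed_t params[2 + 9];
+ *     int i;
+ *
+ *     params[0] = pixman_int_to_fixed (3);    // kernel width
+ *     params[1] = pixman_int_to_fixed (3);    // kernel height
+ *     for (i = 0; i < 9; i++)
+ *         params[2 + i] = pixman_fixed_1 / 9; // weights sum to ~1.0
+ *
+ *     pixman_image_set_filter (image, PIXMAN_FILTER_CONVOLUTION,
+ *                              params, 2 + 9);
+ */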
+
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+                                pixman_fixed_t x,
+                                pixman_fixed_t y,
+                                get_pixel_t    get_pixel)
+{
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+       return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+       break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+       return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+       break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+       return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+       break;
+
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+static void
+bits_image_fetch_affine_no_alpha (pixman_image_t * image,
+                                 int              offset,
+                                 int              line,
+                                 int              width,
+                                 uint32_t *       buffer,
+                                 const uint32_t * mask)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+       if (!pixman_transform_point_3d (image->common.transform, &v))
+           return;
+
+       ux = image->common.transform->matrix[0][0];
+       uy = image->common.transform->matrix[1][0];
+    }
+    else
+    {
+       ux = pixman_fixed_1;
+       uy = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+       if (!mask || mask[i])
+       {
+           buffer[i] = bits_image_fetch_pixel_filtered (
+               &image->bits, x, y, fetch_pixel_no_alpha);
+       }
+
+       x += ux;
+       y += uy;
+    }
+}
+
+/* General fetcher */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+    uint32_t pixel;
+
+    if (check_bounds &&
+       (x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+       return 0;
+    }
+
+    pixel = image->fetch_pixel_32 (image, x, y);
+
+    if (image->common.alpha_map)
+    {
+       uint32_t pixel_a;
+
+       x -= image->common.alpha_origin_x;
+       y -= image->common.alpha_origin_y;
+
+       if (x < 0 || x >= image->common.alpha_map->width ||
+           y < 0 || y >= image->common.alpha_map->height)
+       {
+           pixel_a = 0;
+       }
+       else
+       {
+           pixel_a = image->common.alpha_map->fetch_pixel_32 (
+               image->common.alpha_map, x, y);
+
+           pixel_a = ALPHA_8 (pixel_a);
+       }
+
+       pixel &= 0x00ffffff;
+       pixel |= (pixel_a << 24);
+    }
+
+    return pixel;
+}
+
+static void
+bits_image_fetch_general (pixman_image_t * image,
+                         int              offset,
+                         int              line,
+                         int              width,
+                         uint32_t *       buffer,
+                         const uint32_t * mask)
+{
+    pixman_fixed_t x, y, w;
+    pixman_fixed_t ux, uy, uw;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+       if (!pixman_transform_point_3d (image->common.transform, &v))
+           return;
+
+       ux = image->common.transform->matrix[0][0];
+       uy = image->common.transform->matrix[1][0];
+       uw = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+       ux = pixman_fixed_1;
+       uy = 0;
+       uw = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+    w = v.vector[2];
+
+    for (i = 0; i < width; ++i)
+    {
+       pixman_fixed_t x0, y0;
+
+       if (!mask || mask[i])
+       {
+           if (w != 0)
+           {
+               x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+               y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+           }
+           else
+           {
+               x0 = 0;
+               y0 = 0;
+           }
+
+           buffer[i] = bits_image_fetch_pixel_filtered (
+               &image->bits, x0, y0, fetch_pixel_general);
+       }
+
+       x += ux;
+       y += uy;
+       w += uw;
+    }
+}
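+/* A worked note on the projective step above: x, y and w are 16.16
+ * homogeneous coordinates, so the 16.16 quotient x / w is computed as
+ * ((pixman_fixed_48_16_t)x << 16) / w; e.g. x = 3.0 (0x30000) and
+ * w = 2.0 (0x20000) give 0x300000000 / 0x20000 = 0x18000, i.e. 1.5.
+ */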
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+                                 int              offset,
+                                 int              line,
+                                 int              width,
+                                 uint32_t *       buffer,
+                                 const uint32_t * mask,
+
+                                 convert_pixel_t       convert_pixel,
+                                 pixman_format_code_t  format,
+                                 pixman_repeat_t       repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+       return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+       int x1, y1, x2, y2;
+       uint32_t tl, tr, bl, br;
+       int32_t distx, disty;
+       int width = image->bits.width;
+       int height = image->bits.height;
+       const uint8_t *row1;
+       const uint8_t *row2;
+
+       if (mask && !mask[i])
+           goto next;
+
+       x1 = x - pixman_fixed_1 / 2;
+       y1 = y - pixman_fixed_1 / 2;
+
+       distx = (x1 >> 8) & 0xff;
+       disty = (y1 >> 8) & 0xff;
+
+       y1 = pixman_fixed_to_int (y1);
+       y2 = y1 + 1;
+       x1 = pixman_fixed_to_int (x1);
+       x2 = x1 + 1;
+
+       if (repeat_mode != PIXMAN_REPEAT_NONE)
+       {
+           uint32_t mask;
+
+           mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+           repeat (repeat_mode, &x1, width);
+           repeat (repeat_mode, &y1, height);
+           repeat (repeat_mode, &x2, width);
+           repeat (repeat_mode, &y2, height);
+
+           row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+           row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+           tl = convert_pixel (row1, x1) | mask;
+           tr = convert_pixel (row1, x2) | mask;
+           bl = convert_pixel (row2, x1) | mask;
+           br = convert_pixel (row2, x2) | mask;
+       }
+       else
+       {
+           uint32_t mask1, mask2;
+           int bpp;
+
+           /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+            * which means that if you use it in expressions, those
+            * expressions become unsigned themselves. Since the
+            * variables below can be negative in some cases, that
+            * would lead to crashes on 64-bit architectures.
+            *
+            * So this line makes sure bpp is signed.
+            */
+           bpp = PIXMAN_FORMAT_BPP (format);
+
+           if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+           {
+               buffer[i] = 0;
+               goto next;
+           }
+
+           if (y2 == 0)
+           {
+               row1 = zero;
+               mask1 = 0;
+           }
+           else
+           {
+               row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+               row1 += bpp / 8 * x1;
+
+               mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+           }
+
+           if (y1 == height - 1)
+           {
+               row2 = zero;
+               mask2 = 0;
+           }
+           else
+           {
+               row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+               row2 += bpp / 8 * x1;
+
+               mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+           }
+
+           if (x2 == 0)
+           {
+               tl = 0;
+               bl = 0;
+           }
+           else
+           {
+               tl = convert_pixel (row1, 0) | mask1;
+               bl = convert_pixel (row2, 0) | mask2;
+           }
+
+           if (x1 == width - 1)
+           {
+               tr = 0;
+               br = 0;
+           }
+           else
+           {
+               tr = convert_pixel (row1, 1) | mask1;
+               br = convert_pixel (row2, 1) | mask2;
+           }
+       }
+
+       buffer[i] = bilinear_interpolation (
+           tl, tr, bl, br, distx, disty);
+
+    next:
+       x += ux;
+       y += uy;
+    }
+}
+
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+                                int              offset,
+                                int              line,
+                                int              width,
+                                uint32_t *       buffer,
+                                const uint32_t * mask,
+                                
+                                convert_pixel_t        convert_pixel,
+                                pixman_format_code_t   format,
+                                pixman_repeat_t        repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+       return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+       int width, height, x0, y0;
+       const uint8_t *row;
+
+       if (mask && !mask[i])
+           goto next;
+       
+       width = image->bits.width;
+       height = image->bits.height;
+       x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+       y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+       if (repeat_mode == PIXMAN_REPEAT_NONE &&
+           (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
+       {
+           buffer[i] = 0;
+       }
+       else
+       {
+           uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+           if (repeat_mode != PIXMAN_REPEAT_NONE)
+           {
+               repeat (repeat_mode, &x0, width);
+               repeat (repeat_mode, &y0, height);
+           }
+
+           row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+
+           buffer[i] = convert_pixel (row, x0) | mask;
+       }
+
+    next:
+       x += ux;
+       y += uy;
+    }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    return *(row + x) << 24;
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
+}
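+/* A worked example for the r5g6b5 case, assuming the usual
+ * bit-replication expansion in CONVERT_0565_TO_0888: pure green 0x07e0
+ * has a 6-bit green field of 0x3f, which expands to
+ * g8 = (0x3f << 2) | (0x3f >> 4) = 0xff, so the pixel converts to
+ * 0x0000ff00 (the fetchers then OR in an alpha mask for formats
+ * without an alpha channel).
+ */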
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)               \
+    static void                                                                \
+    bits_image_fetch_bilinear_affine_ ## name (pixman_image_t *image,  \
+                                              int              offset, \
+                                              int              line,   \
+                                              int              width,  \
+                                              uint32_t *       buffer, \
+                                              const uint32_t * mask)   \
+    {                                                                  \
+       bits_image_fetch_bilinear_affine (image, offset, line,          \
+                                         width, buffer, mask,          \
+                                         convert_ ## format,           \
+                                         PIXMAN_ ## format,            \
+                                         repeat_mode);                 \
+    }
+
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)                        \
+    static void                                                                \
+    bits_image_fetch_nearest_affine_ ## name (pixman_image_t *image,   \
+                                             int              offset,  \
+                                             int              line,    \
+                                             int              width,   \
+                                             uint32_t *       buffer,  \
+                                             const uint32_t * mask)    \
+    {                                                                  \
+       bits_image_fetch_nearest_affine (image, offset, line,           \
+                                        width, buffer, mask,           \
+                                        convert_ ## format,            \
+                                        PIXMAN_ ## format,             \
+                                        repeat_mode);                  \
+    }
+
+#define MAKE_FETCHERS(name, format, repeat_mode)                       \
+    MAKE_NEAREST_FETCHER (name, format, repeat_mode)                   \
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+
+MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8,      a8,       PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8,       a8,       PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
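+/* For clarity: each MAKE_FETCHERS line above expands to one nearest and
+ * one bilinear fetcher. E.g. the first line defines
+ * bits_image_fetch_nearest_affine_pad_a8r8g8b8 and
+ * bits_image_fetch_bilinear_affine_pad_a8r8g8b8, both reading
+ * PIXMAN_a8r8g8b8 pixels through convert_a8r8g8b8 with
+ * PIXMAN_REPEAT_PAD compiled in.
+ */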
+
+static void
+replicate_pixel_32 (bits_image_t *   bits,
+                   int              x,
+                   int              y,
+                   int              width,
+                   uint32_t *       buffer)
+{
+    uint32_t color;
+    uint32_t *end;
+
+    color = bits->fetch_pixel_32 (bits, x, y);
+
+    end = buffer + width;
+    while (buffer < end)
+       *(buffer++) = color;
+}
+
+static void
+replicate_pixel_64 (bits_image_t *   bits,
+                   int              x,
+                   int              y,
+                   int              width,
+                   uint32_t *       b)
+{
+    uint64_t color;
+    uint64_t *buffer = (uint64_t *)b;
+    uint64_t *end;
+
+    color = bits->fetch_pixel_64 (bits, x, y);
+
+    end = buffer + width;
+    while (buffer < end)
+       *(buffer++) = color;
+}
+
+static void
+bits_image_fetch_solid_32 (pixman_image_t * image,
+                           int              x,
+                           int              y,
+                           int              width,
+                           uint32_t *       buffer,
+                           const uint32_t * mask)
+{
+    replicate_pixel_32 (&image->bits, 0, 0, width, buffer);
+}
+
+static void
+bits_image_fetch_solid_64 (pixman_image_t * image,
+                           int              x,
+                           int              y,
+                           int              width,
+                           uint32_t *       b,
+                           const uint32_t * unused)
+{
+    replicate_pixel_64 (&image->bits, 0, 0, width, b);
+}
+
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+                                            pixman_bool_t wide,
+                                            int           x,
+                                            int           y,
+                                            int           width,
+                                            uint32_t *    buffer)
+{
+    uint32_t w;
+
+    if (y < 0 || y >= image->height)
+    {
+       memset (buffer, 0, width * (wide? 8 : 4));
+       return;
+    }
+
+    if (x < 0)
+    {
+       w = MIN (width, -x);
+
+       memset (buffer, 0, w * (wide ? 8 : 4));
+
+       width -= w;
+       buffer += w * (wide? 2 : 1);
+       x += w;
+    }
+
+    if (x < image->width)
+    {
+       w = MIN (width, image->width - x);
+
+       if (wide)
+           image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+       else
+           image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+       width -= w;
+       buffer += w * (wide? 2 : 1);
+       x += w;
+    }
+
+    memset (buffer, 0, width * (wide ? 8 : 4));
+}
+
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+                                              pixman_bool_t wide,
+                                              int           x,
+                                              int           y,
+                                              int           width,
+                                              uint32_t *    buffer)
+{
+    uint32_t w;
+
+    while (y < 0)
+       y += image->height;
+
+    while (y >= image->height)
+       y -= image->height;
+
+    if (image->width == 1)
+    {
+       if (wide)
+           replicate_pixel_64 (image, 0, y, width, buffer);
+       else
+           replicate_pixel_32 (image, 0, y, width, buffer);
+
+       return;
+    }
+
+    while (width)
+    {
+       while (x < 0)
+           x += image->width;
+       while (x >= image->width)
+           x -= image->width;
+
+       w = MIN (width, image->width - x);
+
+       if (wide)
+           image->fetch_scanline_64 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+       else
+           image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+
+       buffer += w * (wide? 2 : 1);
+       x += w;
+       width -= w;
+    }
+}
+
+static void
+bits_image_fetch_untransformed_32 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * mask)
+{
+    if (image->common.repeat == PIXMAN_REPEAT_NONE)
+    {
+       bits_image_fetch_untransformed_repeat_none (
+           &image->bits, FALSE, x, y, width, buffer);
+    }
+    else
+    {
+       bits_image_fetch_untransformed_repeat_normal (
+           &image->bits, FALSE, x, y, width, buffer);
+    }
+}
+
+static void
+bits_image_fetch_untransformed_64 (pixman_image_t * image,
+                                   int              x,
+                                   int              y,
+                                   int              width,
+                                   uint32_t *       buffer,
+                                   const uint32_t * unused)
+{
+    if (image->common.repeat == PIXMAN_REPEAT_NONE)
+    {
+       bits_image_fetch_untransformed_repeat_none (
+           &image->bits, TRUE, x, y, width, buffer);
+    }
+    else
+    {
+       bits_image_fetch_untransformed_repeat_normal (
+           &image->bits, TRUE, x, y, width, buffer);
+    }
+}
+
+typedef struct
+{
+    pixman_format_code_t       format;
+    uint32_t                   flags;
+    fetch_scanline_t           fetch_32;
+    fetch_scanline_t           fetch_64;
+} fetcher_info_t;
+
+static const fetcher_info_t fetcher_info[] =
+{
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP,
+      bits_image_fetch_solid_32,
+      bits_image_fetch_solid_64
+    },
+
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP                  |
+       FAST_PATH_ID_TRANSFORM                  |
+       FAST_PATH_NO_CONVOLUTION_FILTER         |
+       FAST_PATH_NO_PAD_REPEAT                 |
+       FAST_PATH_NO_REFLECT_REPEAT),
+      bits_image_fetch_untransformed_32,
+      bits_image_fetch_untransformed_64
+    },
+
+#define FAST_BILINEAR_FLAGS                                            \
+    (FAST_PATH_NO_ALPHA_MAP            |                               \
+     FAST_PATH_NO_ACCESSORS            |                               \
+     FAST_PATH_HAS_TRANSFORM           |                               \
+     FAST_PATH_AFFINE_TRANSFORM                |                               \
+     FAST_PATH_X_UNIT_POSITIVE         |                               \
+     FAST_PATH_Y_UNIT_ZERO             |                               \
+     FAST_PATH_NONE_REPEAT             |                               \
+     FAST_PATH_BILINEAR_FILTER)
+
+    { PIXMAN_a8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    { PIXMAN_x8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      bits_image_fetch_bilinear_no_repeat_8888,
+      _pixman_image_get_scanline_generic_64
+    },
+
+#define GENERAL_BILINEAR_FLAGS                                         \
+    (FAST_PATH_NO_ALPHA_MAP            |                               \
+     FAST_PATH_NO_ACCESSORS            |                               \
+     FAST_PATH_HAS_TRANSFORM           |                               \
+     FAST_PATH_AFFINE_TRANSFORM                |                               \
+     FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS                                          \
+    (FAST_PATH_NO_ALPHA_MAP            |                               \
+     FAST_PATH_NO_ACCESSORS            |                               \
+     FAST_PATH_HAS_TRANSFORM           |                               \
+     FAST_PATH_AFFINE_TRANSFORM                |                               \
+     FAST_PATH_NEAREST_FILTER)
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)                        \
+    { PIXMAN_ ## format,                                               \
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,                \
+      bits_image_fetch_bilinear_affine_ ## name,                       \
+      _pixman_image_get_scanline_generic_64                            \
+    },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)                 \
+    { PIXMAN_ ## format,                                               \
+      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,         \
+      bits_image_fetch_nearest_affine_ ## name,                        \
+      _pixman_image_get_scanline_generic_64                            \
+    },
+
+#define AFFINE_FAST_PATHS(name, format, repeat)                                \
+    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)                    \
+    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+
+    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+    AFFINE_FAST_PATHS (none_a8, a8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+    /* Affine, no alpha */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+      bits_image_fetch_affine_no_alpha,
+      _pixman_image_get_scanline_generic_64
+    },
+
+    /* General */
+    { PIXMAN_any, 0, bits_image_fetch_general, _pixman_image_get_scanline_generic_64 },
+
+    { PIXMAN_null },
+};
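+
+/* bits_image_property_changed () below scans this table top to bottom and
+ * picks the first entry whose format matches and whose flag requirements
+ * are a subset of the image's flags, so the more specific fast paths must
+ * come before the general PIXMAN_any fallback at the end.
+ */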
+
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+    uint32_t flags = image->common.flags;
+    pixman_format_code_t format = image->common.extended_format_code;
+    const fetcher_info_t *info;
+
+    _pixman_bits_image_setup_accessors (&image->bits);
+
+    info = fetcher_info;
+    while (info->format != PIXMAN_null)
+    {
+       if ((info->format == format || info->format == PIXMAN_any)      &&
+           (info->flags & flags) == info->flags)
+       {
+           image->bits.get_scanline_32 = info->fetch_32;
+           image->bits.get_scanline_64 = info->fetch_64;
+           break;
+       }
+
+       info++;
+    }
+}
+
+static uint32_t *
+src_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->image->bits.get_scanline_32 (
+       iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
+
+    return iter->buffer;
+}
+
+static uint32_t *
+src_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->image->bits.get_scanline_64 (
+       iter->image, iter->x, iter->y++, iter->width, iter->buffer, mask);
+
+    return iter->buffer;
+}
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+       iter->get_scanline = src_get_scanline_narrow;
+    else
+       iter->get_scanline = src_get_scanline_wide;
+}
+
+static uint32_t *
+dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *     buffer = iter->buffer;
+
+    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
+    if (image->common.alpha_map)
+    {
+       x -= image->common.alpha_origin_x;
+       y -= image->common.alpha_origin_y;
+
+       image->common.alpha_map->fetch_scanline_32 (
+           (pixman_image_t *)image->common.alpha_map,
+           x, y, width, buffer, mask);
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *     buffer = iter->buffer;
+
+    image->fetch_scanline_64 (
+       (pixman_image_t *)image, x, y, width, buffer, mask);
+    if (image->common.alpha_map)
+    {
+       x -= image->common.alpha_origin_x;
+       y -= image->common.alpha_origin_y;
+
+       image->common.alpha_map->fetch_scanline_64 (
+           (pixman_image_t *)image->common.alpha_map, x, y, width, buffer, mask);
+    }
+
+    return iter->buffer;
+}
+
+static void
+dest_write_back_narrow (pixman_iter_t *iter)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    const uint32_t *buffer = iter->buffer;
+
+    image->store_scanline_32 (image, x, y, width, buffer);
+
+    if (image->common.alpha_map)
+    {
+       x -= image->common.alpha_origin_x;
+       y -= image->common.alpha_origin_y;
+
+       image->common.alpha_map->store_scanline_32 (
+           image->common.alpha_map, x, y, width, buffer);
+    }
+
+    iter->y++;
+}
+
+static void
+dest_write_back_wide (pixman_iter_t *iter)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    const uint32_t *buffer = iter->buffer;
+
+    image->store_scanline_64 (image, x, y, width, buffer);
+
+    if (image->common.alpha_map)
+    {
+       x -= image->common.alpha_origin_x;
+       y -= image->common.alpha_origin_y;
+
+       image->common.alpha_map->store_scanline_64 (
+           image->common.alpha_map, x, y, width, buffer);
+    }
+
+    iter->y++;
+}
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+    {
+       if ((iter->flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+           (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+       {
+           iter->get_scanline = _pixman_iter_get_scanline_noop;
+       }
+       else
+       {
+           iter->get_scanline = dest_get_scanline_narrow;
+       }
+       
+       iter->write_back = dest_write_back_narrow;
+    }
+    else
+    {
+       iter->get_scanline = dest_get_scanline_wide;
+       iter->write_back = dest_write_back_wide;
+    }
+}
+
+static uint32_t *
+create_bits (pixman_format_code_t format,
+             int                  width,
+             int                  height,
+             int *               rowstride_bytes)
+{
+    int stride;
+    size_t buf_size;
+    int bpp;
+
+    /* what follows is a long-winded way, avoiding any possibility of integer
+     * overflows, of saying:
+     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+     */
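+    /* Worked example: for PIXMAN_a8r8g8b8 (bpp = 32) and width = 100,
+     * stride = ((100 * 32 + 0x1f) >> 5) * sizeof (uint32_t) = 100 * 4 = 400
+     * bytes, i.e. each row is rounded up to a whole number of uint32_t's.
+     */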
+
+    bpp = PIXMAN_FORMAT_BPP (format);
+    if (_pixman_multiply_overflows_int (width, bpp))
+       return NULL;
+
+    stride = width * bpp;
+    if (_pixman_addition_overflows_int (stride, 0x1f))
+       return NULL;
+
+    stride += 0x1f;
+    stride >>= 5;
+
+    stride *= sizeof (uint32_t);
+
+    if (_pixman_multiply_overflows_size (height, stride))
+       return NULL;
+
+    buf_size = height * stride;
+
+    if (rowstride_bytes)
+       *rowstride_bytes = stride;
+
+    return calloc (buf_size, 1);
+}
+
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t *     image,
+                         pixman_format_code_t format,
+                         int                  width,
+                         int                  height,
+                         uint32_t *           bits,
+                         int                  rowstride)
+{
+    uint32_t *free_me = NULL;
+
+    if (!bits && width && height)
+    {
+       int rowstride_bytes;
+
+       free_me = bits = create_bits (format, width, height, &rowstride_bytes);
+
+       if (!bits)
+           return FALSE;
+
+       rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+    }
+
+    _pixman_image_init (image);
+
+    image->type = BITS;
+    image->bits.format = format;
+    image->bits.width = width;
+    image->bits.height = height;
+    image->bits.bits = bits;
+    image->bits.free_me = free_me;
+    image->bits.read_func = NULL;
+    image->bits.write_func = NULL;
+    image->bits.rowstride = rowstride;
+    image->bits.indexed = NULL;
+
+    image->common.property_changed = bits_image_property_changed;
+
+    _pixman_image_reset_clip_region (image);
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+                          int                  width,
+                          int                  height,
+                          uint32_t *           bits,
+                          int                  rowstride_bytes)
+{
+    pixman_image_t *image;
+
+    /* rowstride must be a whole number of uint32_t's */
+    return_val_if_fail (
+       bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+       return NULL;
+
+    if (!_pixman_bits_image_init (image, format, width, height, bits,
+                                 rowstride_bytes / (int) sizeof (uint32_t)))
+    {
+       free (image);
+       return NULL;
+    }
+
+    return image;
+}
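+
+/* Example usage (illustrative only): passing bits == NULL and
+ * rowstride_bytes == 0 makes pixman allocate zero-initialized storage
+ * itself (via create_bits () above) and release it when the image is
+ * destroyed:
+ *
+ *     pixman_image_t *img =
+ *         pixman_image_create_bits (PIXMAN_a8r8g8b8, 64, 64, NULL, 0);
+ *     ...use the image...
+ *     pixman_image_unref (img);
+ */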
diff --git a/pixman/pixman-combine.c.template b/pixman/pixman-combine.c.template
new file mode 100644 (file)
index 0000000..c17bcea
--- /dev/null
@@ -0,0 +1,2461 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "pixman-private.h"
+
+#include "pixman-combine.h"
+
+/*** per channel helper functions ***/
+
+static void
+combine_mask_ca (comp4_t *src, comp4_t *mask)
+{
+    comp4_t a = *mask;
+
+    comp4_t x;
+    comp2_t xa;
+
+    if (!a)
+    {
+       *(src) = 0;
+       return;
+    }
+
+    x = *(src);
+    if (a == ~0)
+    {
+       x = x >> A_SHIFT;
+       x |= x << G_SHIFT;
+       x |= x << R_SHIFT;
+       *(mask) = x;
+       return;
+    }
+
+    xa = x >> A_SHIFT;
+    UNcx4_MUL_UNcx4 (x, a);
+    *(src) = x;
+    
+    UNcx4_MUL_UNc (a, xa);
+    *(mask) = a;
+}
+
+static void
+combine_mask_value_ca (comp4_t *src, const comp4_t *mask)
+{
+    comp4_t a = *mask;
+    comp4_t x;
+
+    if (!a)
+    {
+       *(src) = 0;
+       return;
+    }
+
+    if (a == ~0)
+       return;
+
+    x = *(src);
+    UNcx4_MUL_UNcx4 (x, a);
+    *(src) = x;
+}
+
+static void
+combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask)
+{
+    comp4_t a = *(mask);
+    comp4_t x;
+
+    if (!a)
+       return;
+
+    x = *(src) >> A_SHIFT;
+    if (x == MASK)
+       return;
+
+    if (a == ~0)
+    {
+       x |= x << G_SHIFT;
+       x |= x << R_SHIFT;
+       *(mask) = x;
+       return;
+    }
+
+    UNcx4_MUL_UNc (a, x);
+    *(mask) = a;
+}
+
+/*
+ * There are two ways of handling alpha -- either as a single unified value or
+ * a separate value for each component, hence each macro must have two
+ * versions.  The unified-alpha version has '_u' at the end of its name,
+ * the component-alpha version has '_ca'.  Similarly, functions which deal
+ * with this difference come in two versions using the same convention.
+ */
+
+/*
+ * All of the composing functions
+ */
+
+static force_inline comp4_t
+combine_mask (const comp4_t *src, const comp4_t *mask, int i)
+{
+    comp4_t s, m;
+
+    if (mask)
+    {
+       m = *(mask + i) >> A_SHIFT;
+
+       if (!m)
+           return 0;
+    }
+
+    s = *(src + i);
+
+    if (mask)
+       UNcx4_MUL_UNc (s, m);
+
+    return s;
+}
+
+static void
+combine_clear (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    memset (dest, 0, width * sizeof(comp4_t));
+}
+
+static void
+combine_dst (pixman_implementation_t *imp,
+            pixman_op_t              op,
+            comp4_t *                dest,
+            const comp4_t *          src,
+            const comp4_t *          mask,
+            int                      width)
+{
+    return;
+}
+
+static void
+combine_src_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    if (!mask)
+       memcpy (dest, src, width * sizeof (comp4_t));
+    else
+    {
+       for (i = 0; i < width; ++i)
+       {
+           comp4_t s = combine_mask (src, mask, i);
+
+           *(dest + i) = s;
+       }
+    }
+}
+
+/* if the Src is opaque, call combine_src_u */
+static void
+combine_over_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t ia = ALPHA_c (~s);
+
+       UNcx4_MUL_UNc_ADD_UNcx4 (d, ia, s);
+       *(dest + i) = d;
+    }
+}
+
+/* if the Dst is opaque, this is a noop */
+static void
+combine_over_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t ia = ALPHA_c (~*(dest + i));
+       UNcx4_MUL_UNc_ADD_UNcx4 (s, ia, d);
+       *(dest + i) = s;
+    }
+}
+
+/* if the Dst is opaque, call combine_src_u */
+static void
+combine_in_u (pixman_implementation_t *imp,
+              pixman_op_t              op,
+              comp4_t *                dest,
+              const comp4_t *          src,
+              const comp4_t *          mask,
+              int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t a = ALPHA_c (*(dest + i));
+       UNcx4_MUL_UNc (s, a);
+       *(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, this is a noop */
+static void
+combine_in_reverse_u (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      comp4_t *                dest,
+                      const comp4_t *          src,
+                      const comp4_t *          mask,
+                      int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t a = ALPHA_c (s);
+       UNcx4_MUL_UNc (d, a);
+       *(dest + i) = d;
+    }
+}
+
+/* if the Dst is opaque, call combine_clear */
+static void
+combine_out_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t a = ALPHA_c (~*(dest + i));
+       UNcx4_MUL_UNc (s, a);
+       *(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, call combine_clear */
+static void
+combine_out_reverse_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t a = ALPHA_c (~s);
+       UNcx4_MUL_UNc (d, a);
+       *(dest + i) = d;
+    }
+}
+
+/* if the Src is opaque, call combine_in_u */
+/* if the Dst is opaque, call combine_over_u */
+/* if both the Src and Dst are opaque, call combine_src_u */
+static void
+combine_atop_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t dest_a = ALPHA_c (d);
+       comp4_t src_ia = ALPHA_c (~s);
+
+       UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_a, d, src_ia);
+       *(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, call combine_over_reverse_u */
+/* if the Dst is opaque, call combine_in_reverse_u */
+/* if both the Src and Dst are opaque, call combine_dst_u */
+static void
+combine_atop_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t src_a = ALPHA_c (s);
+       comp4_t dest_ia = ALPHA_c (~d);
+
+       UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_a);
+       *(dest + i) = s;
+    }
+}
+
+/* if the Src is opaque, call combine_over_u */
+/* if the Dst is opaque, call combine_over_reverse_u */
+/* if both the Src and Dst are opaque, call combine_clear */
+static void
+combine_xor_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t src_ia = ALPHA_c (~s);
+       comp4_t dest_ia = ALPHA_c (~d);
+
+       UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_ia);
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_add_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       UNcx4_ADD_UNcx4 (d, s);
+       *(dest + i) = d;
+    }
+}
+
+/* if the Src is opaque, call combine_add_u */
+/* if the Dst is opaque, call combine_add_u */
+/* if both the Src and Dst are opaque, call combine_add_u */
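+/* saturate scales the source by min (1, (1 - da) / sa) before adding it,
+ * so that the resulting alpha never exceeds one and the per-channel
+ * additions below do not overflow.
+ */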
+static void
+combine_saturate_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    comp4_t *                dest,
+                    const comp4_t *          src,
+                    const comp4_t *          mask,
+                    int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp2_t sa, da;
+
+       sa = s >> A_SHIFT;
+       da = ~d >> A_SHIFT;
+       if (sa > da)
+       {
+           sa = DIV_UNc (da, sa);
+           UNcx4_MUL_UNc (s, sa);
+       }
+       UNcx4_ADD_UNcx4 (d, s);
+       *(dest + i) = d;
+    }
+}
+
+/*
+ * PDF blend modes:
+ * The following blend modes have been taken from the PDF ISO 32000
+ * specification, which at this point in time is available from
+ * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
+ * The relevant chapters are 11.3.5 and 11.3.6.
+ * The formula for computing the final pixel color given in 11.3.6 is:
+ * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
+ * with B() being the blend function.
+ * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
+ *
+ * These blend modes should match the SVG filter draft specification, as
+ * it has been designed to mirror ISO 32000. Note that at the current point
+ * no released draft exists that shows this, as the formulas have not been
+ * updated yet after the release of ISO 32000.
+ *
+ * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
+ * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
+ * argument. Note that this implementation operates on premultiplied colors,
+ * while the PDF specification does not. Therefore the code uses the formula
+ * Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
+ */
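+
+/* As a sanity check of the premultiplied formula: for OVER, B(Cb, Cs) = Cs,
+ * so the blend term becomes ad.as.Cs = ad.Sca and
+ *
+ *   Cra = (1 - as).Dca + (1 - ad).Sca + ad.Sca = (1 - as).Dca + Sca
+ *
+ * which is the familiar premultiplied OVER operator.
+ */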
+
+/*
+ * Multiply
+ * B(Dca, ad, Sca, as) = Dca.Sca
+ */
+
+static void
+combine_multiply_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    comp4_t *                dest,
+                    const comp4_t *          src,
+                    const comp4_t *          mask,
+                    int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t ss = s;
+       comp4_t src_ia = ALPHA_c (~s);
+       comp4_t dest_ia = ALPHA_c (~d);
+
+       UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (ss, dest_ia, d, src_ia);
+       UNcx4_MUL_UNcx4 (d, s);
+       UNcx4_ADD_UNcx4 (d, ss);
+
+       *(dest + i) = d;
+    }
+}
+
+static void
+combine_multiply_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     comp4_t *                dest,
+                     const comp4_t *          src,
+                     const comp4_t *          mask,
+                     int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t m = *(mask + i);
+       comp4_t s = *(src + i);
+       comp4_t d = *(dest + i);
+       comp4_t r = d;
+       comp4_t dest_ia = ALPHA_c (~d);
+
+       combine_mask_value_ca (&s, &m);
+
+       UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (r, ~m, s, dest_ia);
+       UNcx4_MUL_UNcx4 (d, s);
+       UNcx4_ADD_UNcx4 (r, d);
+
+       *(dest + i) = r;
+    }
+}
+
+#define PDF_SEPARABLE_BLEND_MODE(name)                                 \
+    static void                                                                \
+    combine_ ## name ## _u (pixman_implementation_t *imp,              \
+                           pixman_op_t              op,                \
+                            comp4_t *                dest,             \
+                           const comp4_t *          src,               \
+                           const comp4_t *          mask,              \
+                           int                      width)             \
+    {                                                                  \
+       int i;                                                          \
+       for (i = 0; i < width; ++i) {                                   \
+           comp4_t s = combine_mask (src, mask, i);                    \
+           comp4_t d = *(dest + i);                                    \
+           comp1_t sa = ALPHA_c (s);                                   \
+           comp1_t isa = ~sa;                                          \
+           comp1_t da = ALPHA_c (d);                                   \
+           comp1_t ida = ~da;                                          \
+           comp4_t result;                                             \
+                                                                       \
+           result = d;                                                 \
+           UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);      \
+                                                                       \
+           *(dest + i) = result +                                      \
+               (DIV_ONE_UNc (sa * da) << A_SHIFT) +                    \
+               (blend_ ## name (RED_c (d), da, RED_c (s), sa) << R_SHIFT) + \
+               (blend_ ## name (GREEN_c (d), da, GREEN_c (s), sa) << G_SHIFT) + \
+               (blend_ ## name (BLUE_c (d), da, BLUE_c (s), sa));      \
+       }                                                               \
+    }                                                                  \
+                                                                       \
+    static void                                                                \
+    combine_ ## name ## _ca (pixman_implementation_t *imp,             \
+                            pixman_op_t              op,               \
+                             comp4_t *                dest,            \
+                            const comp4_t *          src,              \
+                            const comp4_t *          mask,             \
+                            int                     width)             \
+    {                                                                  \
+       int i;                                                          \
+       for (i = 0; i < width; ++i) {                                   \
+           comp4_t m = *(mask + i);                                    \
+           comp4_t s = *(src + i);                                     \
+           comp4_t d = *(dest + i);                                    \
+           comp1_t da = ALPHA_c (d);                                   \
+           comp1_t ida = ~da;                                          \
+           comp4_t result;                                             \
+                                                                       \
+           combine_mask_value_ca (&s, &m);                             \
+                                                                       \
+           result = d;                                                 \
+           UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (result, ~m, s, ida);     \
+                                                                       \
+           result +=                                                   \
+               (DIV_ONE_UNc (ALPHA_c (m) * da) << A_SHIFT) +           \
+               (blend_ ## name (RED_c (d), da, RED_c (s), RED_c (m)) << R_SHIFT) + \
+               (blend_ ## name (GREEN_c (d), da, GREEN_c (s), GREEN_c (m)) << G_SHIFT) + \
+               (blend_ ## name (BLUE_c (d), da, BLUE_c (s), BLUE_c (m))); \
+                                                                       \
+           *(dest + i) = result;                                       \
+       }                                                               \
+    }
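+
+/* Each PDF_SEPARABLE_BLEND_MODE (name) invocation below expands to two
+ * combiners, combine_<name>_u (unified alpha) and combine_<name>_ca
+ * (component alpha), both built on the per-channel blend_<name> helper
+ * defined immediately before the invocation.
+ */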
+
+/*
+ * Screen
+ * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca
+ */
+static inline comp4_t
+blend_screen (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    return DIV_ONE_UNc (sca * da + dca * sa - sca * dca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (screen)
+
+/*
+ * Overlay
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Dca < Da
+ *     2.Sca.Dca
+ *   otherwise
+ *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_overlay (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t rca;
+
+    if (2 * dca < da)
+       rca = 2 * sca * dca;
+    else
+       rca = sa * da - 2 * (da - dca) * (sa - sca);
+    return DIV_ONE_UNc (rca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (overlay)
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ */
+static inline comp4_t
+blend_darken (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t s, d;
+
+    s = sca * da;
+    d = dca * sa;
+    return DIV_ONE_UNc (s > d ? d : s);
+}
+
+PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ */
+static inline comp4_t
+blend_lighten (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t s, d;
+
+    s = sca * da;
+    d = dca * sa;
+    return DIV_ONE_UNc (s > d ? s : d);
+}
+
+PDF_SEPARABLE_BLEND_MODE (lighten)
+
+/*
+ * Color dodge
+ * B(Dca, Da, Sca, Sa) =
+ *   if Dca == 0
+ *     0
+ *   if Sca == Sa
+ *     Sa.Da
+ *   otherwise
+ *     Sa.Da. min (1, Dca / Da / (1 - Sca/Sa))
+ */
+static inline comp4_t
+blend_color_dodge (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (sca >= sa)
+    {
+       return dca == 0 ? 0 : DIV_ONE_UNc (sa * da);
+    }
+    else
+    {
+       comp4_t rca = dca * sa / (sa - sca);
+       return DIV_ONE_UNc (sa * MIN (rca, da));
+    }
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_dodge)
+
+/*
+ * Color burn
+ * B(Dca, Da, Sca, Sa) =
+ *   if Dca == Da
+ *     Sa.Da
+ *   if Sca == 0
+ *     0
+ *   otherwise
+ *     Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca))
+ */
+static inline comp4_t
+blend_color_burn (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (sca == 0)
+    {
+       return dca < da ? 0 : DIV_ONE_UNc (sa * da);
+    }
+    else
+    {
+       comp4_t rca = (da - dca) * sa / sca;
+       return DIV_ONE_UNc (sa * (MAX (rca, da) - rca));
+    }
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_burn)
+
+/*
+ * Hard light
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Sca < Sa
+ *     2.Sca.Dca
+ *   otherwise
+ *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_hard_light (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (2 * sca < sa)
+       return DIV_ONE_UNc (2 * sca * dca);
+    else
+       return DIV_ONE_UNc (sa * da - 2 * (da - dca) * (sa - sca));
+}
+
+PDF_SEPARABLE_BLEND_MODE (hard_light)
+
+/*
+ * Soft light
+ * B(Dca, Da, Sca, Sa) =
+ *   if (2.Sca <= Sa)
+ *     Dca.(Sa - (1 - Dca/Da).(2.Sca - Sa))
+ *   otherwise if 4.Dca <= Da
+ *     Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3))
+ *   otherwise
+ *     (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa))
+ */
+static inline comp4_t
+blend_soft_light (comp4_t dca_org,
+                 comp4_t da_org,
+                 comp4_t sca_org,
+                 comp4_t sa_org)
+{
+    double dca = dca_org * (1.0 / MASK);
+    double da = da_org * (1.0 / MASK);
+    double sca = sca_org * (1.0 / MASK);
+    double sa = sa_org * (1.0 / MASK);
+    double rca;
+
+    if (2 * sca < sa)
+    {
+       if (da == 0)
+           rca = dca * sa;
+       else
+           rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
+    }
+    else if (da == 0)
+    {
+       rca = 0;
+    }
+    else if (4 * dca <= da)
+    {
+       rca = dca * sa +
+           (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
+    }
+    else
+    {
+       rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
+    }
+    return rca * MASK + 0.5;
+}
+
+PDF_SEPARABLE_BLEND_MODE (soft_light)
+
+/*
+ * Difference
+ * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
+ */
+static inline comp4_t
+blend_difference (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t dcasa = dca * sa;
+    comp4_t scada = sca * da;
+
+    if (scada < dcasa)
+       return DIV_ONE_UNc (dcasa - scada);
+    else
+       return DIV_ONE_UNc (scada - dcasa);
+}
+
+PDF_SEPARABLE_BLEND_MODE (difference)
+
+/*
+ * Exclusion
+ * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
+ */
+
+/* This can be made faster by writing it directly and not using
+ * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
+
+static inline comp4_t
+blend_exclusion (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    return DIV_ONE_UNc (sca * da + dca * sa - 2 * dca * sca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (exclusion)
+
+#undef PDF_SEPARABLE_BLEND_MODE
+
+/*
+ * PDF non-separable blend modes are implemented using the following functions
+ * to operate in HSL space, with Cmax, Cmid, Cmin referring to the max, mid
+ * and min value of the red, green and blue components.
+ *
+ * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
+ *
+ * clip_color (C):
+ *   l = LUM (C)
+ *   min = Cmin
+ *   max = Cmax
+ *   if min < 0.0
+ *     C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) )
+ *   if max > 1.0
+ *     C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) )
+ *   return C
+ *
+ * set_lum (C, l):
+ *   d = l – LUM (C)
+ *   C += d
+ *   return clip_color (C)
+ *
+ * SAT (C) = CH_MAX (C) - CH_MIN (C)
+ *
+ * set_sat (C, s):
+ *  if Cmax > Cmin
+ *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *    Cmax = s
+ *  else
+ *    Cmid = Cmax = 0.0
+ *  Cmin = 0.0
+ *  return C
+ */
+
+/* For premultiplied colors, we need to know what happens when C is
+ * multiplied by a real number. LUM and SAT are linear:
+ *
+ *    LUM (r × C) = r × LUM (C)              SAT (r × C) = r × SAT (C)
+ *
+ * If we extend clip_color with an extra argument a and change
+ *
+ *        if max > 1.0
+ *
+ * into
+ *
+ *        if max > a
+ *
+ * then clip_color is also linear:
+ *
+ *    r × clip_color (C, a) = clip_color (r × C, r × a)
+ *
+ * for positive r.
+ *
+ * Similarly, we can extend set_lum with an extra argument that is just passed
+ * on to clip_color:
+ *
+ *   r * set_lum ( C, l, a)
+ *
+ *   = r × clip_color ( C + l - LUM (C), a)
+ *
+ *   = clip_color ( r * C + r × l - r * LUM (C), r * a)
+ *
+ *   = set_lum ( r * C, r * l, r * a)
+ *
+ * Finally, set_sat:
+ *
+ *    x * set_sat (C, s) = set_sat (x * C, x * s)
+ *
+ * The above holds for all non-zero x, because the x'es in the fraction for
+ * C_mid cancel out. Specifically, it holds for x = r:
+ *
+ *    r * set_sat (C, s) = set_sat (r * C, r * s)
+ *
+ */
+
+/* So, for the non-separable PDF blend modes, we have (using s, d for
+ * non-premultiplied colors, and S, D for premultiplied):
+ *
+ *   Color:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
+ *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
+ *
+ *
+ *   Luminosity:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
+ *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
+ *
+ *
+ *   Saturation:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
+ *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
+ *                                        a_s * LUM (D), a_s * a_d)
+ *   = set_lum (set_sat (a_s * D, a_d * SAT (S)), a_s * LUM (D), a_s * a_d)
+ *
+ *   Hue:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
+ *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
+ *
+ */
+
+#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2]))
+#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ? c[1] : c[2]))
+#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100)
+#define SAT(c) (CH_MAX (c) - CH_MIN (c))
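+
+/* The integer weights 30/59/11 approximate the 0.3/0.59/0.11 luminance
+ * coefficients quoted above; e.g. for opaque white, LUM (c) with all
+ * channels equal to MASK gives (30 + 59 + 11) * MASK / 100 = MASK.
+ */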
+
+#define PDF_NON_SEPARABLE_BLEND_MODE(name)                             \
+    static void                                                                \
+    combine_ ## name ## _u (pixman_implementation_t *imp,              \
+                           pixman_op_t op,                             \
+                            comp4_t *dest,                             \
+                           const comp4_t *src,                         \
+                           const comp4_t *mask,                        \
+                           int width)                                  \
+    {                                                                  \
+       int i;                                                          \
+       for (i = 0; i < width; ++i)                                     \
+       {                                                               \
+           comp4_t s = combine_mask (src, mask, i);                    \
+           comp4_t d = *(dest + i);                                    \
+           comp1_t sa = ALPHA_c (s);                                   \
+           comp1_t isa = ~sa;                                          \
+           comp1_t da = ALPHA_c (d);                                   \
+           comp1_t ida = ~da;                                          \
+           comp4_t result;                                             \
+           comp4_t sc[3], dc[3], c[3];                                 \
+                                                                       \
+           result = d;                                                 \
+           UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida);      \
+           dc[0] = RED_c (d);                                          \
+           sc[0] = RED_c (s);                                          \
+           dc[1] = GREEN_c (d);                                        \
+           sc[1] = GREEN_c (s);                                        \
+           dc[2] = BLUE_c (d);                                         \
+           sc[2] = BLUE_c (s);                                         \
+           blend_ ## name (c, dc, da, sc, sa);                         \
+                                                                       \
+           *(dest + i) = result +                                      \
+               (DIV_ONE_UNc (sa * da) << A_SHIFT) +                    \
+               (DIV_ONE_UNc (c[0]) << R_SHIFT) +                       \
+               (DIV_ONE_UNc (c[1]) << G_SHIFT) +                       \
+               (DIV_ONE_UNc (c[2]));                                   \
+       }                                                               \
+    }
+
+static void
+set_lum (comp4_t dest[3], comp4_t src[3], comp4_t sa, comp4_t lum)
+{
+    double a, l, min, max;
+    double tmp[3];
+
+    a = sa * (1.0 / MASK);
+
+    l = lum * (1.0 / MASK);
+    tmp[0] = src[0] * (1.0 / MASK);
+    tmp[1] = src[1] * (1.0 / MASK);
+    tmp[2] = src[2] * (1.0 / MASK);
+
+    l = l - LUM (tmp);
+    tmp[0] += l;
+    tmp[1] += l;
+    tmp[2] += l;
+
+    /* clip_color */
+    l = LUM (tmp);
+    min = CH_MIN (tmp);
+    max = CH_MAX (tmp);
+
+    if (min < 0)
+    {
+       if (l - min == 0.0)
+       {
+           tmp[0] = 0;
+           tmp[1] = 0;
+           tmp[2] = 0;
+       }
+       else
+       {
+           tmp[0] = l + (tmp[0] - l) * l / (l - min);
+           tmp[1] = l + (tmp[1] - l) * l / (l - min);
+           tmp[2] = l + (tmp[2] - l) * l / (l - min);
+       }
+    }
+    if (max > a)
+    {
+       if (max - l == 0.0)
+       {
+           tmp[0] = a;
+           tmp[1] = a;
+           tmp[2] = a;
+       }
+       else
+       {
+           tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l);
+           tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l);
+           tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l);
+       }
+    }
+
+    dest[0] = tmp[0] * MASK + 0.5;
+    dest[1] = tmp[1] * MASK + 0.5;
+    dest[2] = tmp[2] * MASK + 0.5;
+}
+
+static void
+set_sat (comp4_t dest[3], comp4_t src[3], comp4_t sat)
+{
+    int id[3];
+    comp4_t min, max;
+
+    if (src[0] > src[1])
+    {
+       if (src[0] > src[2])
+       {
+           id[0] = 0;
+           if (src[1] > src[2])
+           {
+               id[1] = 1;
+               id[2] = 2;
+           }
+           else
+           {
+               id[1] = 2;
+               id[2] = 1;
+           }
+       }
+       else
+       {
+           id[0] = 2;
+           id[1] = 0;
+           id[2] = 1;
+       }
+    }
+    else
+    {
+       if (src[0] > src[2])
+       {
+           id[0] = 1;
+           id[1] = 0;
+           id[2] = 2;
+       }
+       else
+       {
+           id[2] = 0;
+           if (src[1] > src[2])
+           {
+               id[0] = 1;
+               id[1] = 2;
+           }
+           else
+           {
+               id[0] = 2;
+               id[1] = 1;
+           }
+       }
+    }
+
+    max = dest[id[0]];
+    min = dest[id[2]];
+    if (max > min)
+    {
+       dest[id[1]] = (dest[id[1]] - min) * sat / (max - min);
+       dest[id[0]] = sat;
+       dest[id[2]] = 0;
+    }
+    else
+    {
+       dest[0] = dest[1] = dest[2] = 0;
+    }
+}
+
+/*
+ * Hue:
+ * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
+ */
+static inline void
+blend_hsl_hue (comp4_t c[3],
+               comp4_t dc[3],
+               comp4_t da,
+               comp4_t sc[3],
+               comp4_t sa)
+{
+    c[0] = sc[0] * da;
+    c[1] = sc[1] * da;
+    c[2] = sc[2] * da;
+    set_sat (c, c, SAT (dc) * sa);
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
+
+/*
+ * Saturation:
+ * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
+ */
+static inline void
+blend_hsl_saturation (comp4_t c[3],
+                      comp4_t dc[3],
+                      comp4_t da,
+                      comp4_t sc[3],
+                      comp4_t sa)
+{
+    c[0] = dc[0] * sa;
+    c[1] = dc[1] * sa;
+    c[2] = dc[2] * sa;
+    set_sat (c, c, SAT (sc) * da);
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
+
+/*
+ * Color:
+ * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
+ */
+static inline void
+blend_hsl_color (comp4_t c[3],
+                 comp4_t dc[3],
+                 comp4_t da,
+                 comp4_t sc[3],
+                 comp4_t sa)
+{
+    c[0] = sc[0] * da;
+    c[1] = sc[1] * da;
+    c[2] = sc[2] * da;
+    set_lum (c, c, sa * da, LUM (dc) * sa);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
+
+/*
+ * Luminosity:
+ * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
+ */
+static inline void
+blend_hsl_luminosity (comp4_t c[3],
+                      comp4_t dc[3],
+                      comp4_t da,
+                      comp4_t sc[3],
+                      comp4_t sa)
+{
+    c[0] = dc[0] * sa;
+    c[1] = dc[1] * sa;
+    c[2] = dc[2] * sa;
+    set_lum (c, c, sa * da, LUM (sc) * da);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
+
+#undef SAT
+#undef LUM
+#undef CH_MAX
+#undef CH_MIN
+#undef PDF_NON_SEPARABLE_BLEND_MODE
+
+/* All of the disjoint/conjoint composing functions
+ *
+ * The four entries in the first column indicate what source contributions
+ * come from each of the four areas of the picture -- areas covered by neither
+ * A nor B, areas covered only by A, areas covered only by B and finally
+ * areas covered by both A and B.
+ * 
+ * Disjoint                    Conjoint
+ * Fa          Fb              Fa              Fb
+ * (0,0,0,0)   0               0               0               0
+ * (0,A,0,A)   1               0               1               0
+ * (0,0,B,B)   0               1               0               1
+ * (0,A,B,A)   1               min((1-a)/b,1)  1               max(1-a/b,0)
+ * (0,A,B,B)   min((1-b)/a,1)  1               max(1-b/a,0)    1
+ * (0,0,0,A)   max(1-(1-b)/a,0) 0              min(1,b/a)      0
+ * (0,0,0,B)   0               max(1-(1-a)/b,0) 0              min(a/b,1)
+ * (0,A,0,0)   min(1,(1-b)/a)  0               max(1-b/a,0)    0
+ * (0,0,B,0)   0               min(1,(1-a)/b)  0               max(1-a/b,0)
+ * (0,0,B,A)   max(1-(1-b)/a,0) min(1,(1-a)/b)  min(1,b/a)     max(1-a/b,0)
+ * (0,A,0,B)   min(1,(1-b)/a)  max(1-(1-a)/b,0) max(1-b/a,0)   min(1,a/b)
+ * (0,A,B,0)   min(1,(1-b)/a)  min(1,(1-a)/b)  max(1-b/a,0)    max(1-a/b,0)
+ *
+ * See  http://marc.info/?l=xfree-render&m=99792000027857&w=2  for more
+ * information about these operators.
+ */
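+
+/* For example, disjoint OVER is the (0,A,B,A) row: Fa = 1 and
+ * Fb = min ((1 - a) / b, 1).  combine_disjoint_over_u below computes Fb
+ * via combine_disjoint_out_part (da, sa) and then forms
+ * dest = src + Fb . dest.
+ */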
+
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN  2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN  8
+
+#define COMBINE_CLEAR   0
+#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* portion covered by a but not b */
+static comp1_t
+combine_disjoint_out_part (comp1_t a, comp1_t b)
+{
+    /* min (1, (1-b) / a) */
+
+    b = ~b;                 /* 1 - b */
+    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
+       return MASK;        /* 1 */
+    return DIV_UNc (b, a);     /* (1-b) / a */
+}
+
+/* portion covered by both a and b */
+static comp1_t
+combine_disjoint_in_part (comp1_t a, comp1_t b)
+{
+    /* max (1-(1-b)/a,0) */
+    /*  = - min ((1-b)/a - 1, 0) */
+    /*  = 1 - min (1, (1-b)/a) */
+
+    b = ~b;                 /* 1 - b */
+    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
+       return 0;           /* 1 - 1 */
+    return ~DIV_UNc(b, a);    /* 1 - (1-b) / a */
+}
+
+/* portion covered by a but not b */
+static comp1_t
+combine_conjoint_out_part (comp1_t a, comp1_t b)
+{
+    /* max (1-b/a,0) */
+    /* = 1-min(b/a,1) */
+
+    /* min (1, (1-b) / a) */
+
+    if (b >= a)             /* b >= a -> b/a >= 1 */
+       return 0x00;        /* 0 */
+    return ~DIV_UNc(b, a);    /* 1 - b/a */
+}
+
+/* portion covered by both a and b */
+static comp1_t
+combine_conjoint_in_part (comp1_t a, comp1_t b)
+{
+    /* min (1,b/a) */
+
+    if (b >= a)             /* b >= a -> b/a >= 1 */
+       return MASK;        /* 1 */
+    return DIV_UNc (b, a);     /* b/a */
+}
+
+#define GET_COMP(v, i)   ((comp2_t) (comp1_t) ((v) >> i))
+
+#define ADD(x, y, i, t)                                                        \
+    ((t) = GET_COMP (x, i) + GET_COMP (y, i),                          \
+     (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
+
+#define GENERIC(x, y, i, ax, ay, t, u, v)                              \
+    ((t) = (MUL_UNc (GET_COMP (y, i), ay, (u)) +                       \
+            MUL_UNc (GET_COMP (x, i), ax, (v))),                       \
+     (comp4_t) ((comp1_t) ((t) |                                       \
+                           (0 - ((t) >> G_SHIFT)))) << (i))
+
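+/* In the macros above, GET_COMP extracts one channel, ADD is a saturating
+ * per-channel addition, and GENERIC forms Fa.x + Fb.y for one channel; the
+ * (0 - ((t) >> G_SHIFT)) term expands any carry out of the channel into an
+ * all-ones mask, saturating the result at MASK.
+ */
+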
+static void
+combine_disjoint_general_u (comp4_t *      dest,
+                            const comp4_t *src,
+                            const comp4_t *mask,
+                            int            width,
+                            comp1_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t m, n, o, p;
+       comp2_t Fa, Fb, t, u, v;
+       comp1_t sa = s >> A_SHIFT;
+       comp1_t da = d >> A_SHIFT;
+
+       switch (combine & COMBINE_A)
+       {
+       default:
+           Fa = 0;
+           break;
+
+       case COMBINE_A_OUT:
+           Fa = combine_disjoint_out_part (sa, da);
+           break;
+
+       case COMBINE_A_IN:
+           Fa = combine_disjoint_in_part (sa, da);
+           break;
+
+       case COMBINE_A:
+           Fa = MASK;
+           break;
+       }
+
+       switch (combine & COMBINE_B)
+       {
+       default:
+           Fb = 0;
+           break;
+
+       case COMBINE_B_OUT:
+           Fb = combine_disjoint_out_part (da, sa);
+           break;
+
+       case COMBINE_B_IN:
+           Fb = combine_disjoint_in_part (da, sa);
+           break;
+
+       case COMBINE_B:
+           Fb = MASK;
+           break;
+       }
+       m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
+       n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+       o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+       p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+       s = m | n | o | p;
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_disjoint_over_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp2_t a = s >> A_SHIFT;
+
+       if (s != 0x00)
+       {
+           comp4_t d = *(dest + i);
+           a = combine_disjoint_out_part (d >> A_SHIFT, a);
+           UNcx4_MUL_UNc_ADD_UNcx4 (d, a, s);
+
+           *(dest + i) = d;
+       }
+    }
+}
+
+static void
+combine_disjoint_in_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               comp4_t *                dest,
+                               const comp4_t *          src,
+                               const comp4_t *          mask,
+                               int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_disjoint_out_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_disjoint_atop_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_disjoint_xor_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+combine_conjoint_general_u (comp4_t *      dest,
+                            const comp4_t *src,
+                            const comp4_t *mask,
+                            int            width,
+                            comp1_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = combine_mask (src, mask, i);
+       comp4_t d = *(dest + i);
+       comp4_t m, n, o, p;
+       comp2_t Fa, Fb, t, u, v;
+       comp1_t sa = s >> A_SHIFT;
+       comp1_t da = d >> A_SHIFT;
+
+       switch (combine & COMBINE_A)
+       {
+       default:
+           Fa = 0;
+           break;
+
+       case COMBINE_A_OUT:
+           Fa = combine_conjoint_out_part (sa, da);
+           break;
+
+       case COMBINE_A_IN:
+           Fa = combine_conjoint_in_part (sa, da);
+           break;
+
+       case COMBINE_A:
+           Fa = MASK;
+           break;
+       }
+
+       switch (combine & COMBINE_B)
+       {
+       default:
+           Fb = 0;
+           break;
+
+       case COMBINE_B_OUT:
+           Fb = combine_conjoint_out_part (da, sa);
+           break;
+
+       case COMBINE_B_IN:
+           Fb = combine_conjoint_in_part (da, sa);
+           break;
+
+       case COMBINE_B:
+           Fb = MASK;
+           break;
+       }
+
+       m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
+       n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+       o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+       p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+
+       s = m | n | o | p;
+
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_conjoint_over_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+combine_conjoint_in_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               comp4_t *                dest,
+                               const comp4_t *          src,
+                               const comp4_t *          mask,
+                               int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_conjoint_out_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/************************************************************************/
+/*********************** Per Channel functions **************************/
+/************************************************************************/
+
+static void
+combine_clear_ca (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  comp4_t *                dest,
+                  const comp4_t *          src,
+                  const comp4_t *          mask,
+                  int                      width)
+{
+    memset (dest, 0, width * sizeof(comp4_t));
+}
+
+static void
+combine_src_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+
+       combine_mask_value_ca (&s, &m);
+
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_over_ca (pixman_implementation_t *imp,
+                 pixman_op_t              op,
+                 comp4_t *                dest,
+                 const comp4_t *          src,
+                 const comp4_t *          mask,
+                 int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t a;
+
+       combine_mask_ca (&s, &m);
+
+       a = ~m;
+       if (a)
+       {
+           comp4_t d = *(dest + i);
+           UNcx4_MUL_UNcx4_ADD_UNcx4 (d, a, s);
+           s = d;
+       }
+
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_over_reverse_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t d = *(dest + i);
+       comp4_t a = ~d >> A_SHIFT;
+
+       if (a)
+       {
+           comp4_t s = *(src + i);
+           comp4_t m = *(mask + i);
+
+           UNcx4_MUL_UNcx4 (s, m);
+           UNcx4_MUL_UNc_ADD_UNcx4 (s, a, d);
+
+           *(dest + i) = s;
+       }
+    }
+}
+
+static void
+combine_in_ca (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               comp4_t *                dest,
+               const comp4_t *          src,
+               const comp4_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t d = *(dest + i);
+       comp2_t a = d >> A_SHIFT;
+       comp4_t s = 0;
+
+       if (a)
+       {
+           comp4_t m = *(mask + i);
+
+           s = *(src + i);
+           combine_mask_value_ca (&s, &m);
+
+           if (a != MASK)
+               UNcx4_MUL_UNc (s, a);
+       }
+
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_in_reverse_ca (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       comp4_t *                dest,
+                       const comp4_t *          src,
+                       const comp4_t *          mask,
+                       int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t a;
+
+       combine_mask_alpha_ca (&s, &m);
+
+       a = m;
+       if (a != ~0)
+       {
+           comp4_t d = 0;
+
+           if (a)
+           {
+               d = *(dest + i);
+               UNcx4_MUL_UNcx4 (d, a);
+           }
+
+           *(dest + i) = d;
+       }
+    }
+}
+
+static void
+combine_out_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t d = *(dest + i);
+       comp2_t a = ~d >> A_SHIFT;
+       comp4_t s = 0;
+
+       if (a)
+       {
+           comp4_t m = *(mask + i);
+
+           s = *(src + i);
+           combine_mask_value_ca (&s, &m);
+
+           if (a != MASK)
+               UNcx4_MUL_UNc (s, a);
+       }
+
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_out_reverse_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t a;
+
+       combine_mask_alpha_ca (&s, &m);
+
+       a = ~m;
+       if (a != ~0)
+       {
+           comp4_t d = 0;
+
+           if (a)
+           {
+               d = *(dest + i);
+               UNcx4_MUL_UNcx4 (d, a);
+           }
+
+           *(dest + i) = d;
+       }
+    }
+}
+
+static void
+combine_atop_ca (pixman_implementation_t *imp,
+                 pixman_op_t              op,
+                 comp4_t *                dest,
+                 const comp4_t *          src,
+                 const comp4_t *          mask,
+                 int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t d = *(dest + i);
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t ad;
+       comp2_t as = d >> A_SHIFT;
+
+       combine_mask_ca (&s, &m);
+
+       ad = ~m;
+
+       UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
+
+       *(dest + i) = d;
+    }
+}
+
+static void
+combine_atop_reverse_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t d = *(dest + i);
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t ad;
+       comp2_t as = ~d >> A_SHIFT;
+
+       combine_mask_ca (&s, &m);
+
+       ad = m;
+
+       UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
+
+       *(dest + i) = d;
+    }
+}
+
+static void
+combine_xor_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t d = *(dest + i);
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t ad;
+       comp2_t as = ~d >> A_SHIFT;
+
+       combine_mask_ca (&s, &m);
+
+       ad = ~m;
+
+       UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as);
+
+       *(dest + i) = d;
+    }
+}
+
+static void
+combine_add_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                comp4_t *                dest,
+                const comp4_t *          src,
+                const comp4_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s = *(src + i);
+       comp4_t m = *(mask + i);
+       comp4_t d = *(dest + i);
+
+       combine_mask_value_ca (&s, &m);
+
+       UNcx4_ADD_UNcx4 (d, s);
+
+       *(dest + i) = d;
+    }
+}
+
+static void
+combine_saturate_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     comp4_t *                dest,
+                     const comp4_t *          src,
+                     const comp4_t *          mask,
+                     int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s, d;
+       comp2_t sa, sr, sg, sb, da;
+       comp2_t t, u, v;
+       comp4_t m, n, o, p;
+
+       d = *(dest + i);
+       s = *(src + i);
+       m = *(mask + i);
+
+       combine_mask_ca (&s, &m);
+
+       sa = (m >> A_SHIFT);
+       sr = (m >> R_SHIFT) & MASK;
+       sg = (m >> G_SHIFT) & MASK;
+       sb =  m             & MASK;
+       da = ~d >> A_SHIFT;
+
+       if (sb <= da)
+           m = ADD (s, d, 0, t);
+       else
+           m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v);
+
+       if (sg <= da)
+           n = ADD (s, d, G_SHIFT, t);
+       else
+           n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v);
+
+       if (sr <= da)
+           o = ADD (s, d, R_SHIFT, t);
+       else
+           o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v);
+
+       if (sa <= da)
+           p = ADD (s, d, A_SHIFT, t);
+       else
+           p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v);
+
+       *(dest + i) = m | n | o | p;
+    }
+}
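+
+/* Editorial note: here da = MASK - dest alpha is the remaining headroom.
+ * A channel is added directly while its masked source alpha fits under
+ * da; otherwise the source channel is scaled by roughly da / sa_c
+ * ((da << G_SHIFT) / sa_c in fixed point) so the sum lands at the
+ * saturation point instead of wrapping.
+ */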
+
+static void
+combine_disjoint_general_ca (comp4_t *      dest,
+                             const comp4_t *src,
+                             const comp4_t *mask,
+                             int            width,
+                             comp1_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s, d;
+       comp4_t m, n, o, p;
+       comp4_t Fa, Fb;
+       comp2_t t, u, v;
+       comp4_t sa;
+       comp1_t da;
+
+       s = *(src + i);
+       m = *(mask + i);
+       d = *(dest + i);
+       da = d >> A_SHIFT;
+
+       combine_mask_ca (&s, &m);
+
+       sa = m;
+
+       switch (combine & COMBINE_A)
+       {
+       default:
+           Fa = 0;
+           break;
+
+       case COMBINE_A_OUT:
+           m = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> 0), da);
+           n = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+           o = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+           p = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+           Fa = m | n | o | p;
+           break;
+
+       case COMBINE_A_IN:
+           m = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> 0), da);
+           n = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+           o = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+           p = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+           Fa = m | n | o | p;
+           break;
+
+       case COMBINE_A:
+           Fa = ~0;
+           break;
+       }
+
+       switch (combine & COMBINE_B)
+       {
+       default:
+           Fb = 0;
+           break;
+
+       case COMBINE_B_OUT:
+           m = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> 0));
+           n = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+           o = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+           p = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+           Fb = m | n | o | p;
+           break;
+
+       case COMBINE_B_IN:
+           m = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> 0));
+           n = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+           o = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+           p = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+           Fb = m | n | o | p;
+           break;
+
+       case COMBINE_B:
+           Fb = ~0;
+           break;
+       }
+       m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
+       n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+       o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+       p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+
+       s = m | n | o | p;
+
+       *(dest + i) = s;
+    }
+}
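+
+/* Editorial note: in this component-alpha variant sa is the full
+ * four-channel mask rather than a single alpha value, so the Fa / Fb
+ * factors are computed channel by channel (the m/n/o/p shifts above),
+ * packed back into one comp4_t, and unpacked again with GET_COMP ()
+ * when fed to GENERIC ().
+ */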
+
+static void
+combine_disjoint_over_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_disjoint_in_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_disjoint_out_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_disjoint_atop_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_disjoint_xor_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+combine_conjoint_general_ca (comp4_t *      dest,
+                             const comp4_t *src,
+                             const comp4_t *mask,
+                             int            width,
+                             comp1_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+       comp4_t s, d;
+       comp4_t m, n, o, p;
+       comp4_t Fa, Fb;
+       comp2_t t, u, v;
+       comp4_t sa;
+       comp1_t da;
+
+       s = *(src + i);
+       m = *(mask + i);
+       d = *(dest + i);
+       da = d >> A_SHIFT;
+
+       combine_mask_ca (&s, &m);
+
+       sa = m;
+
+       switch (combine & COMBINE_A)
+       {
+       default:
+           Fa = 0;
+           break;
+
+       case COMBINE_A_OUT:
+           m = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> 0), da);
+           n = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+           o = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+           p = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+           Fa = m | n | o | p;
+           break;
+
+       case COMBINE_A_IN:
+           m = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> 0), da);
+           n = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT;
+           o = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT;
+           p = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT;
+           Fa = m | n | o | p;
+           break;
+
+       case COMBINE_A:
+           Fa = ~0;
+           break;
+       }
+
+       switch (combine & COMBINE_B)
+       {
+       default:
+           Fb = 0;
+           break;
+
+       case COMBINE_B_OUT:
+           m = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> 0));
+           n = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+           o = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+           p = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+           Fb = m | n | o | p;
+           break;
+
+       case COMBINE_B_IN:
+           m = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> 0));
+           n = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT;
+           o = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT;
+           p = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT;
+           Fb = m | n | o | p;
+           break;
+
+       case COMBINE_B:
+           Fb = ~0;
+           break;
+       }
+       m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
+       n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+       o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+       p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+
+       s = m | n | o | p;
+
+       *(dest + i) = s;
+    }
+}
+
+static void
+combine_conjoint_over_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+combine_conjoint_in_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        comp4_t *                dest,
+                        const comp4_t *          src,
+                        const comp4_t *          mask,
+                        int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                comp4_t *                dest,
+                                const comp4_t *          src,
+                                const comp4_t *          mask,
+                                int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_conjoint_out_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 comp4_t *                dest,
+                                 const comp4_t *          src,
+                                 const comp4_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          comp4_t *                dest,
+                          const comp4_t *          src,
+                          const comp4_t *          mask,
+                          int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  comp4_t *                dest,
+                                  const comp4_t *          src,
+                                  const comp4_t *          mask,
+                                  int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         comp4_t *                dest,
+                         const comp4_t *          src,
+                         const comp4_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+void
+_pixman_setup_combiner_functions_width (pixman_implementation_t *imp)
+{
+    /* Unified alpha */
+    imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_OVER] = combine_over_u;
+    imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
+    imp->combine_width[PIXMAN_OP_IN] = combine_in_u;
+    imp->combine_width[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_OUT] = combine_out_u;
+    imp->combine_width[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_ATOP] = combine_atop_u;
+    imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_XOR] = combine_xor_u;
+    imp->combine_width[PIXMAN_OP_ADD] = combine_add_u;
+    imp->combine_width[PIXMAN_OP_SATURATE] = combine_saturate_u;
+
+    /* Disjoint, unified */
+    imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
+    imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u;
+    imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u;
+
+    imp->combine_width[PIXMAN_OP_MULTIPLY] = combine_multiply_u;
+    imp->combine_width[PIXMAN_OP_SCREEN] = combine_screen_u;
+    imp->combine_width[PIXMAN_OP_OVERLAY] = combine_overlay_u;
+    imp->combine_width[PIXMAN_OP_DARKEN] = combine_darken_u;
+    imp->combine_width[PIXMAN_OP_LIGHTEN] = combine_lighten_u;
+    imp->combine_width[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u;
+    imp->combine_width[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u;
+    imp->combine_width[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u;
+    imp->combine_width[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u;
+    imp->combine_width[PIXMAN_OP_DIFFERENCE] = combine_difference_u;
+    imp->combine_width[PIXMAN_OP_EXCLUSION] = combine_exclusion_u;
+    imp->combine_width[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u;
+    imp->combine_width[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u;
+    imp->combine_width[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u;
+    imp->combine_width[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u;
+
+    /* Component alpha combiners */
+    imp->combine_width_ca[PIXMAN_OP_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_SRC] = combine_src_ca;
+    /* dest */
+    imp->combine_width_ca[PIXMAN_OP_OVER] = combine_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_IN] = combine_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_OUT] = combine_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_ATOP] = combine_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_XOR] = combine_xor_ca;
+    imp->combine_width_ca[PIXMAN_OP_ADD] = combine_add_ca;
+    imp->combine_width_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca;
+
+    /* Disjoint CA */
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca;
+
+    /* Conjoint CA */
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca;
+    imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca;
+
+    imp->combine_width_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca;
+    imp->combine_width_ca[PIXMAN_OP_SCREEN] = combine_screen_ca;
+    imp->combine_width_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca;
+    imp->combine_width_ca[PIXMAN_OP_DARKEN] = combine_darken_ca;
+    imp->combine_width_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca;
+    imp->combine_width_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca;
+    imp->combine_width_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca;
+    imp->combine_width_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca;
+    imp->combine_width_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca;
+    imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
+    imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
+
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
+}
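+
+/* Editorial note: this file is a template; make-combine.pl is expected
+ * to rewrite the `width' suffix (and the comp1_t/comp2_t/comp4_t types)
+ * when generating the concrete 8-bit and 16-bit combiner sources, so
+ * the installed symbol is not literally *_functions_width.
+ */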
+
diff --git a/pixman/pixman-combine.h.template b/pixman/pixman-combine.h.template
new file mode 100644 (file)
index 0000000..67ed309
--- /dev/null
@@ -0,0 +1,226 @@
+
+#define COMPONENT_SIZE
+#define MASK
+#define ONE_HALF
+
+#define A_SHIFT
+#define R_SHIFT
+#define G_SHIFT
+#define A_MASK
+#define R_MASK
+#define G_MASK
+
+#define RB_MASK
+#define AG_MASK
+#define RB_ONE_HALF
+#define RB_MASK_PLUS_ONE
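+
+/* Editorial note: the empty definitions above are placeholders that
+ * make-combine.pl presumably fills with concrete values (masks, shifts
+ * and rounding constants for 8-bit or 16-bit components) when
+ * instantiating this template.
+ */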
+
+#define ALPHA_c(x) ((x) >> A_SHIFT)
+#define RED_c(x) (((x) >> R_SHIFT) & MASK)
+#define GREEN_c(x) (((x) >> G_SHIFT) & MASK)
+#define BLUE_c(x) ((x) & MASK)
+
+/*
+ * Helper macros.
+ */
+
+#define MUL_UNc(a, b, t)                                               \
+    ((t) = (a) * (b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT ))
+
+#define DIV_UNc(a, b)                                                  \
+    (((comp2_t) (a) * MASK) / (b))
+
+#define ADD_UNc(x, y, t)                                    \
+    ((t) = (x) + (y),                                       \
+     (comp4_t) (comp1_t) ((t) | (0 - ((t) >> G_SHIFT))))
+
+#define DIV_ONE_UNc(x)                                                 \
+    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
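+
+/* Worked example (editorial), assuming the 8-bit instantiation where
+ * G_SHIFT = 8, MASK = 0xff and ONE_HALF = 0x80: MUL_UNc (0x80, 0x80, t)
+ * computes t = 0x4000 + 0x80 = 0x4080, then ((0x40 + 0x4080) >> 8) =
+ * 0x40, i.e. round (128 * 128 / 255) = 64 using shifts instead of a
+ * division.
+ */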
+
+/*
+ * The macros below use bit tricks to process two color components
+ * at the same time within a single comp4_t.
+ */
+
+/*
+ * x_rb = (x_rb * a) / 255
+ */
+#define UNc_rb_MUL_UNc(x, a, t)                                                \
+    do                                                                 \
+    {                                                                  \
+       t  = ((x) & RB_MASK) * (a);                                     \
+       t += RB_ONE_HALF;                                               \
+       x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;                \
+       x &= RB_MASK;                                                   \
+    } while (0)
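+
+/* Editorial sketch, assuming 8-bit components: RB_MASK would be
+ * 0x00ff00ff, so x & RB_MASK holds red and blue in separate 16-bit
+ * lanes of one comp4_t.  Multiplying by an 8-bit `a' keeps each product
+ * inside its lane, and the shift/add pair above then applies the
+ * rounded divide-by-255 to both lanes at once.
+ */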
+
+/*
+ * x_rb = min (x_rb + y_rb, 255)
+ */
+#define UNc_rb_ADD_UNc_rb(x, y, t)                                     \
+    do                                                                 \
+    {                                                                  \
+       t = ((x) + (y));                                                \
+       t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);             \
+       x = (t & RB_MASK);                                              \
+    } while (0)
+
+/*
+ * x_rb = (x_rb * a_rb) / 255
+ */
+#define UNc_rb_MUL_UNc_rb(x, a, t)                                     \
+    do                                                                 \
+    {                                                                  \
+       t  = (x & MASK) * (a & MASK);                                   \
+       t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);                    \
+       t += RB_ONE_HALF;                                               \
+       t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;                \
+       x = t & RB_MASK;                                                \
+    } while (0)
+
+/*
+ * x_c = (x_c * a) / 255
+ */
+#define UNcx4_MUL_UNc(x, a)                                            \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, t__;                                        \
+                                                                       \
+       r1__ = (x);                                                     \
+       UNc_rb_MUL_UNc (r1__, (a), t__);                                \
+                                                                       \
+       r2__ = (x) >> G_SHIFT;                                          \
+       UNc_rb_MUL_UNc (r2__, (a), t__);                                \
+                                                                       \
+       (x) = r1__ | (r2__ << G_SHIFT);                                 \
+    } while (0)
+
+/*
+ * x_c = (x_c * a) / 255 + y_c
+ */
+#define UNcx4_MUL_UNc_ADD_UNcx4(x, a, y)                               \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, r3__, t__;                                  \
+                                                                       \
+       r1__ = (x);                                                     \
+       r2__ = (y) & RB_MASK;                                           \
+       UNc_rb_MUL_UNc (r1__, (a), t__);                                \
+       UNc_rb_ADD_UNc_rb (r1__, r2__, t__);                            \
+                                                                       \
+       r2__ = (x) >> G_SHIFT;                                          \
+       r3__ = ((y) >> G_SHIFT) & RB_MASK;                              \
+       UNc_rb_MUL_UNc (r2__, (a), t__);                                \
+       UNc_rb_ADD_UNc_rb (r2__, r3__, t__);                            \
+                                                                       \
+       (x) = r1__ | (r2__ << G_SHIFT);                                 \
+    } while (0)
+
+/*
+ * x_c = (x_c * a + y_c * b) / 255
+ */
+#define UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc(x, a, y, b)                    \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, r3__, t__;                                  \
+                                                                       \
+       r1__ = (x);                                                     \
+       r2__ = (y);                                                     \
+       UNc_rb_MUL_UNc (r1__, (a), t__);                                \
+       UNc_rb_MUL_UNc (r2__, (b), t__);                                \
+       UNc_rb_ADD_UNc_rb (r1__, r2__, t__);                            \
+                                                                       \
+       r2__ = ((x) >> G_SHIFT);                                        \
+       r3__ = ((y) >> G_SHIFT);                                        \
+       UNc_rb_MUL_UNc (r2__, (a), t__);                                \
+       UNc_rb_MUL_UNc (r3__, (b), t__);                                \
+       UNc_rb_ADD_UNc_rb (r2__, r3__, t__);                            \
+                                                                       \
+       (x) = r1__ | (r2__ << G_SHIFT);                                 \
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c) / 255
+ */
+#define UNcx4_MUL_UNcx4(x, a)                                          \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, r3__, t__;                                  \
+                                                                       \
+       r1__ = (x);                                                     \
+       r2__ = (a);                                                     \
+       UNc_rb_MUL_UNc_rb (r1__, r2__, t__);                            \
+                                                                       \
+       r2__ = (x) >> G_SHIFT;                                          \
+       r3__ = (a) >> G_SHIFT;                                          \
+       UNc_rb_MUL_UNc_rb (r2__, r3__, t__);                            \
+                                                                       \
+       (x) = r1__ | (r2__ << G_SHIFT);                                 \
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c) / 255 + y_c
+ */
+#define UNcx4_MUL_UNcx4_ADD_UNcx4(x, a, y)                             \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, r3__, t__;                                  \
+                                                                       \
+       r1__ = (x);                                                     \
+       r2__ = (a);                                                     \
+       UNc_rb_MUL_UNc_rb (r1__, r2__, t__);                            \
+       r2__ = (y) & RB_MASK;                                           \
+       UNc_rb_ADD_UNc_rb (r1__, r2__, t__);                            \
+                                                                       \
+       r2__ = ((x) >> G_SHIFT);                                        \
+       r3__ = ((a) >> G_SHIFT);                                        \
+       UNc_rb_MUL_UNc_rb (r2__, r3__, t__);                            \
+       r3__ = ((y) >> G_SHIFT) & RB_MASK;                              \
+       UNc_rb_ADD_UNc_rb (r2__, r3__, t__);                            \
+                                                                       \
+       (x) = r1__ | (r2__ << G_SHIFT);                                 \
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c + y_c * b) / 255
+ */
+#define UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc(x, a, y, b)                  \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, r3__, t__;                                  \
+                                                                       \
+       r1__ = (x);                                                     \
+       r2__ = (a);                                                     \
+       UNc_rb_MUL_UNc_rb (r1__, r2__, t__);                            \
+       r2__ = (y);                                                     \
+       UNc_rb_MUL_UNc (r2__, (b), t__);                                \
+       UNc_rb_ADD_UNc_rb (r1__, r2__, t__);                            \
+                                                                       \
+       r2__ = (x) >> G_SHIFT;                                          \
+       r3__ = (a) >> G_SHIFT;                                          \
+       UNc_rb_MUL_UNc_rb (r2__, r3__, t__);                            \
+       r3__ = (y) >> G_SHIFT;                                          \
+       UNc_rb_MUL_UNc (r3__, (b), t__);                                \
+       UNc_rb_ADD_UNc_rb (r2__, r3__, t__);                            \
+                                                                       \
+       x = r1__ | (r2__ << G_SHIFT);                                   \
+    } while (0)
+
+/*
+ * x_c = min (x_c + y_c, 255)
+ */
+#define UNcx4_ADD_UNcx4(x, y)                                          \
+    do                                                                 \
+    {                                                                  \
+       comp4_t r1__, r2__, r3__, t__;                                  \
+                                                                       \
+       r1__ = (x) & RB_MASK;                                           \
+       r2__ = (y) & RB_MASK;                                           \
+       UNc_rb_ADD_UNc_rb (r1__, r2__, t__);                            \
+                                                                       \
+       r2__ = ((x) >> G_SHIFT) & RB_MASK;                              \
+       r3__ = ((y) >> G_SHIFT) & RB_MASK;                              \
+       UNc_rb_ADD_UNc_rb (r2__, r3__, t__);                            \
+                                                                       \
+       x = r1__ | (r2__ << G_SHIFT);                                   \
+    } while (0)
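+
+/* Editorial usage sketch: a typical consumer of these macros is an OVER
+ * combiner, along the lines of
+ *
+ *     comp4_t s = *src, d = *dest;
+ *     comp4_t ia = ALPHA_c (~s);            (MASK - source alpha)
+ *     UNcx4_MUL_UNc_ADD_UNcx4 (d, ia, s);   (d = d * ia / MASK + s)
+ *     *dest = d;
+ *
+ * which is how pixman-combine.c.template composites premultiplied
+ * pixels.
+ */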
diff --git a/pixman/pixman-compiler.h b/pixman/pixman-compiler.h
new file mode 100644 (file)
index 0000000..fe2a613
--- /dev/null
@@ -0,0 +1,209 @@
+/* Pixman uses some non-standard compiler features. This file ensures
+ * that they are available.
+ *
+ * The features are:
+ *
+ *    FUNC           must be defined to expand to the current function
+ *    PIXMAN_EXPORT  should be defined to whatever is required to
+ *                   export functions from a shared library
+ *    limits         integer limits (INT16_MAX and friends) must be
+ *                   defined
+ *    inline         must be defined
+ *    force_inline   must be defined
+ */
+#if defined (__GNUC__)
+#  define FUNC     ((const char*) (__PRETTY_FUNCTION__))
+#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#  define FUNC     ((const char*) (__func__))
+#else
+#  define FUNC     ((const char*) ("???"))
+#endif
+
+#if defined (__GNUC__)
+#  define MAYBE_UNUSED  __attribute__((unused))
+#else
+#  define MAYBE_UNUSED
+#endif
+
+#ifndef INT16_MIN
+# define INT16_MIN              (-32767-1)
+#endif
+
+#ifndef INT16_MAX
+# define INT16_MAX              (32767)
+#endif
+
+#ifndef INT32_MIN
+# define INT32_MIN              (-2147483647-1)
+#endif
+
+#ifndef INT32_MAX
+# define INT32_MAX              (2147483647)
+#endif
+
+#ifndef UINT32_MIN
+# define UINT32_MIN             (0)
+#endif
+
+#ifndef UINT32_MAX
+# define UINT32_MAX             (4294967295U)
+#endif
+
+#ifndef M_PI
+# define M_PI                  3.14159265358979323846
+#endif
+
+#ifdef _MSC_VER
+/* 'inline' is available only in C++ in MSVC */
+#   define inline __inline
+#   define force_inline __forceinline
+#   define noinline __declspec(noinline)
+#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+#   define inline __inline__
+#   define force_inline __inline__ __attribute__ ((__always_inline__))
+#   define noinline __attribute__((noinline))
+#else
+#   ifndef force_inline
+#      define force_inline inline
+#   endif
+#   ifndef noinline
+#      define noinline
+#   endif
+#endif
+
+/* GCC visibility */
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
+#   define PIXMAN_EXPORT __attribute__ ((visibility("default")))
+/* Sun Studio 8 visibility */
+#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#   define PIXMAN_EXPORT __global
+#else
+#   define PIXMAN_EXPORT
+#endif
+
+/* TLS */
+#if defined(PIXMAN_NO_TLS)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)                      \
+    static type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)                               \
+    (&name)
+
+#elif defined(TOOLCHAIN_SUPPORTS__THREAD)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)                      \
+    static __thread type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)                               \
+    (&name)
+
+#elif defined(__MINGW32__)
+
+#   define _NO_W32_PSEUDO_MODIFIERS
+#   include <windows.h>
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)                      \
+    static volatile int tls_ ## name ## _initialized = 0;              \
+    static void *tls_ ## name ## _mutex = NULL;                                \
+    static unsigned tls_ ## name ## _index;                            \
+                                                                       \
+    static type *                                                      \
+    tls_ ## name ## _alloc (void)                                      \
+    {                                                                  \
+        type *value = calloc (1, sizeof (type));                       \
+        if (value)                                                     \
+            TlsSetValue (tls_ ## name ## _index, value);               \
+        return value;                                                  \
+    }                                                                  \
+                                                                       \
+    static force_inline type *                                         \
+    tls_ ## name ## _get (void)                                                \
+    {                                                                  \
+       type *value;                                                    \
+       if (!tls_ ## name ## _initialized)                              \
+       {                                                               \
+           if (!tls_ ## name ## _mutex)                                \
+           {                                                           \
+               void *mutex = CreateMutexA (NULL, 0, NULL);             \
+               if (InterlockedCompareExchangePointer (                 \
+                       &tls_ ## name ## _mutex, mutex, NULL) != NULL)  \
+               {                                                       \
+                   CloseHandle (mutex);                                \
+               }                                                       \
+           }                                                           \
+           WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF);   \
+           if (!tls_ ## name ## _initialized)                          \
+           {                                                           \
+               tls_ ## name ## _index = TlsAlloc ();                   \
+               tls_ ## name ## _initialized = 1;                       \
+           }                                                           \
+           ReleaseMutex (tls_ ## name ## _mutex);                      \
+       }                                                               \
+       if (tls_ ## name ## _index == 0xFFFFFFFF)                       \
+           return NULL;                                                \
+       value = TlsGetValue (tls_ ## name ## _index);                   \
+       if (!value)                                                     \
+           value = tls_ ## name ## _alloc ();                          \
+       return value;                                                   \
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)                               \
+    tls_ ## name ## _get ()
+
+#elif defined(_MSC_VER)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)                      \
+    static __declspec(thread) type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)                               \
+    (&name)
+
+#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+
+#include <pthread.h>
+
+#  define PIXMAN_DEFINE_THREAD_LOCAL(type, name)                       \
+    static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
+    static pthread_key_t tls_ ## name ## _key;                         \
+                                                                       \
+    static void                                                                \
+    tls_ ## name ## _destroy_value (void *value)                       \
+    {                                                                  \
+       free (value);                                                   \
+    }                                                                  \
+                                                                       \
+    static void                                                                \
+    tls_ ## name ## _make_key (void)                                   \
+    {                                                                  \
+       pthread_key_create (&tls_ ## name ## _key,                      \
+                           tls_ ## name ## _destroy_value);            \
+    }                                                                  \
+                                                                       \
+    static type *                                                      \
+    tls_ ## name ## _alloc (void)                                      \
+    {                                                                  \
+       type *value = calloc (1, sizeof (type));                        \
+       if (value)                                                      \
+           pthread_setspecific (tls_ ## name ## _key, value);          \
+       return value;                                                   \
+    }                                                                  \
+                                                                       \
+    static force_inline type *                                         \
+    tls_ ## name ## _get (void)                                                \
+    {                                                                  \
+       type *value = NULL;                                             \
+       if (pthread_once (&tls_ ## name ## _once_control,               \
+                         tls_ ## name ## _make_key) == 0)              \
+       {                                                               \
+           value = pthread_getspecific (tls_ ## name ## _key);         \
+           if (!value)                                                 \
+               value = tls_ ## name ## _alloc ();                      \
+       }                                                               \
+       return value;                                                   \
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)                               \
+    tls_ ## name ## _get ()
+
+#else
+
+#    error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support."
+
+#endif
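+
+/* Editorial usage sketch: whichever branch above is selected, the two
+ * macros are used identically, e.g.
+ *
+ *     PIXMAN_DEFINE_THREAD_LOCAL (unsigned int, rng_state);
+ *
+ *     static unsigned int *
+ *     get_rng_state (void)
+ *     {
+ *         return PIXMAN_GET_THREAD_LOCAL (rng_state);
+ *     }
+ *
+ * The name rng_state is illustrative only; note that the MinGW and
+ * pthread variants can return NULL on allocation failure.
+ */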
diff --git a/pixman/pixman-conical-gradient.c b/pixman/pixman-conical-gradient.c
new file mode 100644 (file)
index 0000000..791d4f3
--- /dev/null
@@ -0,0 +1,211 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static force_inline double
+coordinates_to_parameter (double x, double y, double angle)
+{
+    double t;
+
+    t = atan2 (y, x) + angle;
+
+    while (t < 0)
+       t += 2 * M_PI;
+
+    while (t >= 2 * M_PI)
+       t -= 2 * M_PI;
+
+    return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and
+                                     * make rotation CCW
+                                     */
+}
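+
+/* Worked example: with angle == 0, the point (x, y) = (0, 1) yields
+ * atan2 (1, 0) == M_PI / 2, so t == M_PI / 2 and the function returns
+ * 1 - 0.25 == 0.75, i.e. three quarters of the way around the
+ * counterclockwise sweep.
+ */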
+
+static uint32_t *
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    conical_gradient_t *conical = (conical_gradient_t *)image;
+    uint32_t       *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_bool_t affine = TRUE;
+    double cx = 1.;
+    double cy = 0.;
+    double cz = 0.;
+    double rx = x + 0.5;
+    double ry = y + 0.5;
+    double rz = 1.;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+       pixman_vector_t v;
+
+       /* reference point is the center of the pixel */
+       v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+       v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+       v.vector[2] = pixman_fixed_1;
+
+       if (!pixman_transform_point_3d (image->common.transform, &v))
+           return iter->buffer;
+
+       cx = image->common.transform->matrix[0][0] / 65536.;
+       cy = image->common.transform->matrix[1][0] / 65536.;
+       cz = image->common.transform->matrix[2][0] / 65536.;
+
+       rx = v.vector[0] / 65536.;
+       ry = v.vector[1] / 65536.;
+       rz = v.vector[2] / 65536.;
+
+       affine =
+           image->common.transform->matrix[2][0] == 0 &&
+           v.vector[2] == pixman_fixed_1;
+    }
+
+    if (affine)
+    {
+       rx -= conical->center.x / 65536.;
+       ry -= conical->center.y / 65536.;
+
+       while (buffer < end)
+       {
+           if (!mask || *mask++)
+           {
+               double t = coordinates_to_parameter (rx, ry, conical->angle);
+
+               *buffer = _pixman_gradient_walker_pixel (
+                   &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+           }
+
+           ++buffer;
+
+           rx += cx;
+           ry += cy;
+       }
+    }
+    else
+    {
+       while (buffer < end)
+       {
+           double x, y;
+
+           if (!mask || *mask++)
+           {
+               double t;
+
+               if (rz != 0)
+               {
+                   x = rx / rz;
+                   y = ry / rz;
+               }
+               else
+               {
+                   x = y = 0.;
+               }
+
+               x -= conical->center.x / 65536.;
+               y -= conical->center.y / 65536.;
+
+               t = coordinates_to_parameter (x, y, conical->angle);
+
+               *buffer = _pixman_gradient_walker_pixel (
+                   &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+           }
+
+           ++buffer;
+
+           rx += cx;
+           ry += cy;
+           rz += cz;
+       }
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
+
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+       iter->get_scanline = conical_get_scanline_narrow;
+    else
+       iter->get_scanline = conical_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_conical_gradient (pixman_point_fixed_t *        center,
+                                      pixman_fixed_t                angle,
+                                      const pixman_gradient_stop_t *stops,
+                                      int                           n_stops)
+{
+    pixman_image_t *image = _pixman_image_allocate ();
+    conical_gradient_t *conical;
+
+    if (!image)
+       return NULL;
+
+    conical = &image->conical;
+
+    if (!_pixman_init_gradient (&conical->common, stops, n_stops))
+    {
+       free (image);
+       return NULL;
+    }
+
+    angle = MOD (angle, pixman_int_to_fixed (360));
+
+    image->type = CONICAL;
+
+    conical->center = *center;
+    conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI;
+
+    return image;
+}
+
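+/* Usage sketch (illustrative values only, not part of this file):
+ *
+ *     pixman_point_fixed_t center = { pixman_int_to_fixed (50),
+ *                                     pixman_int_to_fixed (50) };
+ *     pixman_gradient_stop_t stops[2] = {
+ *         { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } },
+ *         { pixman_int_to_fixed (1), { 0x0000, 0x0000, 0xffff, 0xffff } }
+ *     };
+ *     pixman_image_t *g = pixman_image_create_conical_gradient (
+ *         &center, pixman_double_to_fixed (45.0), stops, 2);
+ *
+ * The angle is a pixman_fixed_t in degrees; it is reduced mod 360 and
+ * converted to radians by the constructor above.
+ */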
diff --git a/pixman/pixman-cpu.c b/pixman/pixman-cpu.c
new file mode 100644 (file)
index 0000000..dff27d1
--- /dev/null
@@ -0,0 +1,631 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#if defined(USE_ARM_SIMD) && defined(_MSC_VER)
+/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */
+#include <windows.h>
+#endif
+
+#include "pixman-private.h"
+
+#ifdef USE_VMX
+
+/* The CPU detection code needs to be in a file not compiled with
+ * "-maltivec -mabi=altivec", as gcc would otherwise try to save
+ * vector registers across function calls, causing SIGILL on CPUs
+ * without Altivec/VMX.
+ */
+static pixman_bool_t initialized = FALSE;
+static volatile pixman_bool_t have_vmx = TRUE;
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    if (!initialized)
+    {
+       size_t length = sizeof(have_vmx);
+       int error =
+           sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0);
+
+       if (error)
+           have_vmx = FALSE;
+
+       initialized = TRUE;
+    }
+    return have_vmx;
+}
+
+#elif defined (__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    if (!initialized)
+    {
+       int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+       size_t length = sizeof(have_vmx);
+       int error =
+           sysctl (mib, 2, &have_vmx, &length, NULL, 0);
+
+       if (error != 0)
+           have_vmx = FALSE;
+
+       initialized = TRUE;
+    }
+    return have_vmx;
+}
+
+#elif defined (__linux__)
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <linux/auxvec.h>
+#include <asm/cputable.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    if (!initialized)
+    {
+       char fname[64];
+       unsigned long buf[64];
+       ssize_t count = 0;
+       pid_t pid;
+       int fd, i;
+
+       pid = getpid ();
+       snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid);
+
+       fd = open (fname, O_RDONLY);
+       if (fd >= 0)
+       {
+           for (i = 0; i <= (count / sizeof(unsigned long)); i += 2)
+           {
+               /* Read more if buf is empty... */
+               if (i == (count / sizeof(unsigned long)))
+               {
+                   count = read (fd, buf, sizeof(buf));
+                   if (count <= 0)
+                       break;
+                   i = 0;
+               }
+
+               if (buf[i] == AT_HWCAP)
+               {
+                   have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC);
+                   initialized = TRUE;
+                   break;
+               }
+               else if (buf[i] == AT_NULL)
+               {
+                   break;
+               }
+           }
+           close (fd);
+       }
+    }
+    if (!initialized)
+    {
+       /* Something went wrong. Assume 'no' rather than playing
+          fragile tricks with catching SIGILL. */
+       have_vmx = FALSE;
+       initialized = TRUE;
+    }
+
+    return have_vmx;
+}
+
+#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */
+#include <signal.h>
+#include <setjmp.h>
+
+static jmp_buf jump_env;
+
+static void
+vmx_test (int        sig,
+         siginfo_t *si,
+         void *     unused)
+{
+    longjmp (jump_env, 1);
+}
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    struct sigaction sa, osa;
+    int jmp_result;
+
+    if (!initialized)
+    {
+       sa.sa_flags = SA_SIGINFO;
+       sigemptyset (&sa.sa_mask);
+       sa.sa_sigaction = vmx_test;
+       sigaction (SIGILL, &sa, &osa);
+       jmp_result = setjmp (jump_env);
+       if (jmp_result == 0)
+       {
+           asm volatile ( "vor 0, 0, 0" );
+       }
+       sigaction (SIGILL, &osa, NULL);
+       have_vmx = (jmp_result == 0);
+       initialized = TRUE;
+    }
+    return have_vmx;
+}
+
+#endif /* __APPLE__ */
+#endif /* USE_VMX */
+
+#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON) || defined(USE_ARM_IWMMXT)
+
+#if defined(_MSC_VER)
+
+#if defined(USE_ARM_SIMD)
+extern int pixman_msvc_try_arm_simd_op (void);
+
+pixman_bool_t
+pixman_have_arm_simd (void)
+{
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t have_arm_simd = FALSE;
+
+    if (!initialized)
+    {
+       __try {
+           pixman_msvc_try_arm_simd_op ();
+           have_arm_simd = TRUE;
+       } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) {
+           have_arm_simd = FALSE;
+       }
+       initialized = TRUE;
+    }
+
+    return have_arm_simd;
+}
+
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+extern int pixman_msvc_try_arm_neon_op (void);
+
+pixman_bool_t
+pixman_have_arm_neon (void)
+{
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t have_arm_neon = FALSE;
+
+    if (!initialized)
+    {
+       __try
+       {
+           pixman_msvc_try_arm_neon_op ();
+           have_arm_neon = TRUE;
+       }
+       __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
+       {
+           have_arm_neon = FALSE;
+       }
+       initialized = TRUE;
+    }
+
+    return have_arm_neon;
+}
+
+#endif /* USE_ARM_NEON */
+
+#elif defined (__linux__) /* linux ELF */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <elf.h>
+
+static pixman_bool_t arm_has_v7 = FALSE;
+static pixman_bool_t arm_has_v6 = FALSE;
+static pixman_bool_t arm_has_vfp = FALSE;
+static pixman_bool_t arm_has_neon = FALSE;
+static pixman_bool_t arm_has_iwmmxt = FALSE;
+static pixman_bool_t arm_tests_initialized = FALSE;
+
+static void
+pixman_arm_read_auxv (void)
+{
+    int fd;
+    Elf32_auxv_t aux;
+
+    fd = open ("/proc/self/auxv", O_RDONLY);
+    if (fd >= 0)
+    {
+       while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t))
+       {
+           if (aux.a_type == AT_HWCAP)
+           {
+               uint32_t hwcap = aux.a_un.a_val;
+               /* Hardcode these bit values (HWCAP_VFP = 64,
+                * HWCAP_IWMMXT = 512, HWCAP_NEON = 4096) to avoid
+                * depending on specific versions of the hwcap header.
+                */
+               arm_has_vfp = (hwcap & 64) != 0;
+               arm_has_iwmmxt = (hwcap & 512) != 0;
+               /* HWCAP_NEON is only defined as of kernel 2.6.29 */
+               arm_has_neon = (hwcap & 4096) != 0;
+           }
+           else if (aux.a_type == AT_PLATFORM)
+           {
+               const char *plat = (const char*) aux.a_un.a_val;
+               if (strncmp (plat, "v7l", 3) == 0)
+               {
+                   arm_has_v7 = TRUE;
+                   arm_has_v6 = TRUE;
+               }
+               else if (strncmp (plat, "v6l", 3) == 0)
+               {
+                   arm_has_v6 = TRUE;
+               }
+           }
+       }
+       close (fd);
+    }
+
+    arm_tests_initialized = TRUE;
+}
+
+#if defined(USE_ARM_SIMD)
+pixman_bool_t
+pixman_have_arm_simd (void)
+{
+    if (!arm_tests_initialized)
+       pixman_arm_read_auxv ();
+
+    return arm_has_v6;
+}
+
+#endif /* USE_ARM_SIMD */
+
+#if defined(USE_ARM_NEON)
+pixman_bool_t
+pixman_have_arm_neon (void)
+{
+    if (!arm_tests_initialized)
+       pixman_arm_read_auxv ();
+
+    return arm_has_neon;
+}
+
+#endif /* USE_ARM_NEON */
+
+#if defined(USE_ARM_IWMMXT)
+pixman_bool_t
+pixman_have_arm_iwmmxt (void)
+{
+    if (!arm_tests_initialized)
+       pixman_arm_read_auxv ();
+
+    return arm_has_iwmmxt;
+}
+
+#endif /* USE_ARM_IWMMXT */
+
+#else /* linux ELF */
+
+#define pixman_have_arm_simd() FALSE
+#define pixman_have_arm_neon() FALSE
+#define pixman_have_arm_iwmmxt() FALSE
+
+#endif
+
+#endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */
+
+#if defined(USE_X86_MMX) || defined(USE_SSE2)
+/* The CPU detection code needs to be in a file not compiled with
+ * "-mmmx -msse", as gcc would otherwise generate CMOV instructions,
+ * which would raise SIGILL on old CPUs that don't support them.
+ */
+#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64)
+
+#ifdef HAVE_GETISAX
+#include <sys/auxv.h>
+#endif
+
+typedef enum
+{
+    NO_FEATURES = 0,
+    MMX = 0x1,
+    MMX_EXTENSIONS = 0x2,
+    SSE = 0x6,
+    SSE2 = 0x8,
+    CMOV = 0x10
+} cpu_features_t;
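+
+/* The overlapping values appear deliberate: SSE is 0x6 = 0x4 | 0x2,
+ * i.e. it includes the MMX_EXTENSIONS bit, reflecting that SSE-capable
+ * CPUs also implement the AMD MMX extensions.  This lets the checks
+ * below test a set of prerequisites with a single mask comparison.
+ */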
+
+
+static unsigned int
+detect_cpu_features (void)
+{
+    unsigned int features = 0;
+    unsigned int result = 0;
+
+#ifdef HAVE_GETISAX
+    if (getisax (&result, 1))
+    {
+       if (result & AV_386_CMOV)
+           features |= CMOV;
+       if (result & AV_386_MMX)
+           features |= MMX;
+       if (result & AV_386_AMD_MMX)
+           features |= MMX_EXTENSIONS;
+       if (result & AV_386_SSE)
+           features |= SSE;
+       if (result & AV_386_SSE2)
+           features |= SSE2;
+    }
+#else
+    char vendor[13];
+#ifdef _MSC_VER
+    int vendor0 = 0, vendor1, vendor2;
+#endif
+    vendor[0] = 0;
+    vendor[12] = 0;
+
+#ifdef __GNUC__
+    /* See p. 118 of the AMD64 Architecture Programmer's Manual, Vol. 3 */
+    /* We need to be careful about the handling of %ebx and
+     * %esp here. We can't declare either one as clobbered
+     * since they are special registers (%ebx is the "PIC
+     * register" holding an offset to global data, %esp the
+     * stack pointer), so we need to make sure they have their
+     * original values when we access the output operands.
+     */
+    __asm__ (
+        "pushf\n"
+        "pop %%eax\n"
+        "mov %%eax, %%ecx\n"
+        "xor $0x00200000, %%eax\n"
+        "push %%eax\n"
+        "popf\n"
+        "pushf\n"
+        "pop %%eax\n"
+        "mov $0x0, %%edx\n"
+        "xor %%ecx, %%eax\n"
+        "jz 1f\n"
+
+        "mov $0x00000000, %%eax\n"
+        "push %%ebx\n"
+        "cpuid\n"
+        "mov %%ebx, %%eax\n"
+        "pop %%ebx\n"
+        "mov %%eax, %1\n"
+        "mov %%edx, %2\n"
+        "mov %%ecx, %3\n"
+        "mov $0x00000001, %%eax\n"
+        "push %%ebx\n"
+        "cpuid\n"
+        "pop %%ebx\n"
+        "1:\n"
+        "mov %%edx, %0\n"
+       : "=r" (result),
+        "=m" (vendor[0]),
+        "=m" (vendor[4]),
+        "=m" (vendor[8])
+       :
+       : "%eax", "%ecx", "%edx"
+        );
+
+#elif defined (_MSC_VER)
+
+    _asm {
+       pushfd
+       pop eax
+       mov ecx, eax
+       xor eax, 00200000h
+       push eax
+       popfd
+       pushfd
+       pop eax
+       mov edx, 0
+       xor eax, ecx
+       jz nocpuid
+
+       mov eax, 0
+       push ebx
+       cpuid
+       mov eax, ebx
+       pop ebx
+       mov vendor0, eax
+       mov vendor1, edx
+       mov vendor2, ecx
+       mov eax, 1
+       push ebx
+       cpuid
+       pop ebx
+    nocpuid:
+       mov result, edx
+    }
+    memmove (vendor + 0, &vendor0, 4);
+    memmove (vendor + 4, &vendor1, 4);
+    memmove (vendor + 8, &vendor2, 4);
+
+#else
+#   error unsupported compiler
+#endif
+
+    features = 0;
+    if (result)
+    {
+       /* result now contains the standard feature bits */
+       if (result & (1 << 15))
+           features |= CMOV;
+       if (result & (1 << 23))
+           features |= MMX;
+       if (result & (1 << 25))
+           features |= SSE;
+       if (result & (1 << 26))
+           features |= SSE2;
+       if ((features & MMX) && !(features & SSE) &&
+           (strcmp (vendor, "AuthenticAMD") == 0 ||
+            strcmp (vendor, "Geode by NSC") == 0))
+       {
+           /* check for AMD MMX extensions */
+#ifdef __GNUC__
+           __asm__ (
+               "       push %%ebx\n"
+               "       mov $0x80000000, %%eax\n"
+               "       cpuid\n"
+               "       xor %%edx, %%edx\n"
+               "       cmp $0x1, %%eax\n"
+               "       jge 2f\n"
+               "       mov $0x80000001, %%eax\n"
+               "       cpuid\n"
+               "2:\n"
+               "       pop %%ebx\n"
+               "       mov %%edx, %0\n"
+               : "=r" (result)
+               :
+               : "%eax", "%ecx", "%edx"
+               );
+#elif defined _MSC_VER
+           _asm {
+               push ebx
+               mov eax, 80000000h
+               cpuid
+               xor edx, edx
+               cmp eax, 1
+               jge notamd
+               mov eax, 80000001h
+               cpuid
+           notamd:
+               pop ebx
+               mov result, edx
+           }
+#endif
+           if (result & (1 << 22))
+               features |= MMX_EXTENSIONS;
+       }
+    }
+#endif /* HAVE_GETISAX */
+
+    return features;
+}
+
+static pixman_bool_t
+pixman_have_mmx (void)
+{
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t mmx_present;
+
+    if (!initialized)
+    {
+       unsigned int features = detect_cpu_features ();
+       mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS);
+       initialized = TRUE;
+    }
+
+    return mmx_present;
+}
+
+#ifdef USE_SSE2
+static pixman_bool_t
+pixman_have_sse2 (void)
+{
+    static pixman_bool_t initialized = FALSE;
+    static pixman_bool_t sse2_present;
+
+    if (!initialized)
+    {
+       unsigned int features = detect_cpu_features ();
+       sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2);
+       initialized = TRUE;
+    }
+
+    return sse2_present;
+}
+
+#endif
+
+#else /* __amd64__ */
+#ifdef USE_X86_MMX
+#define pixman_have_mmx() TRUE
+#endif
+#ifdef USE_SSE2
+#define pixman_have_sse2() TRUE
+#endif
+#endif /* __amd64__ */
+#endif
+
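+/* The chain below is built from the most generic layer outward: each
+ * _create_* call wraps the previous implementation as its fallback
+ * delegate, so at composite time the most recently created layer
+ * (noop) is consulted first, falling back toward the general
+ * implementation.
+ */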
+pixman_implementation_t *
+_pixman_choose_implementation (void)
+{
+    pixman_implementation_t *imp;
+
+    imp = _pixman_implementation_create_general();
+    imp = _pixman_implementation_create_fast_path (imp);
+    
+#ifdef USE_X86_MMX
+    if (pixman_have_mmx ())
+       imp = _pixman_implementation_create_mmx (imp);
+#endif
+
+#ifdef USE_SSE2
+    if (pixman_have_sse2 ())
+       imp = _pixman_implementation_create_sse2 (imp);
+#endif
+
+#ifdef USE_ARM_SIMD
+    if (pixman_have_arm_simd ())
+       imp = _pixman_implementation_create_arm_simd (imp);
+#endif
+
+#ifdef USE_ARM_IWMMXT
+    if (pixman_have_arm_iwmmxt ())
+       imp = _pixman_implementation_create_mmx (imp);
+#endif
+
+#ifdef USE_ARM_NEON
+    if (pixman_have_arm_neon ())
+       imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
+#ifdef USE_VMX
+    if (pixman_have_vmx ())
+       imp = _pixman_implementation_create_vmx (imp);
+#endif
+
+    imp = _pixman_implementation_create_noop (imp);
+    
+    return imp;
+}
+
diff --git a/pixman/pixman-edge-accessors.c b/pixman/pixman-edge-accessors.c
new file mode 100644 (file)
index 0000000..ea3a31e
--- /dev/null
@@ -0,0 +1,4 @@
+
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-edge.c"
diff --git a/pixman/pixman-edge-imp.h b/pixman/pixman-edge-imp.h
new file mode 100644 (file)
index 0000000..a4698ed
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef rasterize_span
+#endif
+
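+/* This header is a template: the includer defines N_BITS and
+ * RASTERIZE_EDGES before #include-ing it, producing one rasterizer
+ * per alpha depth (pixman-edge.c instantiates it for 1- and 4-bit
+ * alpha).
+ */
+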
+static void
+RASTERIZE_EDGES (pixman_image_t  *image,
+               pixman_edge_t   *l,
+               pixman_edge_t   *r,
+               pixman_fixed_t          t,
+               pixman_fixed_t          b)
+{
+    pixman_fixed_t  y = t;
+    uint32_t  *line;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)
+    {
+       pixman_fixed_t  lx;
+       pixman_fixed_t      rx;
+       int     lxi;
+       int rxi;
+
+       lx = l->x;
+       rx = r->x;
+#if N_BITS == 1
+       /* For the non-antialiased case, round the coordinates up, in effect
+        * sampling just slightly to the left of the pixel. This is so that
+        * when the sample point lies exactly on the line, we round towards
+        * north-west.
+        *
+        * (The AA case does a similar adjustment in RENDER_SAMPLES_X)
+        */
+       lx += X_FRAC_FIRST(1) - pixman_fixed_e;
+       rx += X_FRAC_FIRST(1) - pixman_fixed_e;
+#endif
+       /* clip X */
+       if (lx < 0)
+           lx = 0;
+       if (pixman_fixed_to_int (rx) >= width)
+#if N_BITS == 1
+           rx = pixman_int_to_fixed (width);
+#else
+           /* Use the last pixel of the scanline, covered 100%.
+            * We can't use the first pixel following the scanline,
+            * because accessing it could result in a buffer overrun.
+            */
+           rx = pixman_int_to_fixed (width) - 1;
+#endif
+
+       /* Skip empty (or backwards) sections */
+       if (rx > lx)
+       {
+
+           /* Find pixel bounds for span */
+           lxi = pixman_fixed_to_int (lx);
+           rxi = pixman_fixed_to_int (rx);
+
+#if N_BITS == 1
+           {
+
+#define LEFT_MASK(x)                                                   \
+               (((x) & 0x1f) ?                                         \
+                SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0)
+#define RIGHT_MASK(x)                                                  \
+               (((32 - (x)) & 0x1f) ?                                  \
+                SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0)
+               
+#define MASK_BITS(x,w,l,n,r) {                                         \
+                   n = (w);                                            \
+                   r = RIGHT_MASK ((x) + n);                           \
+                   l = LEFT_MASK (x);                                  \
+                   if (l) {                                            \
+                       n -= 32 - ((x) & 0x1f);                         \
+                       if (n < 0) {                                    \
+                           n = 0;                                      \
+                           l &= r;                                     \
+                           r = 0;                                      \
+                       }                                               \
+                   }                                                   \
+                   n >>= 5;                                            \
+               }
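+               /* MASK_BITS splits the bit span [x, x + w) into a
+                * partial leading word l, n full 32-bit words and a
+                * partial trailing word r; if the span fits in a
+                * single word, l is intersected with r and both r
+                * and n are cleared.
+                */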
+               
+               uint32_t  *a = line;
+               uint32_t  startmask;
+               uint32_t  endmask;
+               int         nmiddle;
+               int         width = rxi - lxi;
+               int         x = lxi;
+               
+               a += x >> 5;
+               x &= 0x1f;
+               
+               MASK_BITS (x, width, startmask, nmiddle, endmask);
+
+               if (startmask) {
+                   WRITE(image, a, READ(image, a) | startmask);
+                   a++;
+               }
+               while (nmiddle--)
+                   WRITE(image, a++, 0xffffffff);
+               if (endmask)
+                   WRITE(image, a, READ(image, a) | endmask);
+           }
+#else
+           {
+               DEFINE_ALPHA(line,lxi);
+               int         lxs;
+               int     rxs;
+
+               /* Sample coverage for edge pixels */
+               lxs = RENDER_SAMPLES_X (lx, N_BITS);
+               rxs = RENDER_SAMPLES_X (rx, N_BITS);
+
+               /* Add coverage across row */
+               if (lxi == rxi)
+               {
+                   ADD_ALPHA (rxs - lxs);
+               }
+               else
+               {
+                   int xi;
+
+                   ADD_ALPHA (N_X_FRAC(N_BITS) - lxs);
+                   STEP_ALPHA;
+                   for (xi = lxi + 1; xi < rxi; xi++)
+                   {
+                       ADD_ALPHA (N_X_FRAC(N_BITS));
+                       STEP_ALPHA;
+                   }
+                   ADD_ALPHA (rxs);
+               }
+           }
+#endif
+       }
+
+       if (y == b)
+           break;
+
+#if N_BITS > 1
+       if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS))
+       {
+           RENDER_EDGE_STEP_SMALL (l);
+           RENDER_EDGE_STEP_SMALL (r);
+           y += STEP_Y_SMALL(N_BITS);
+       }
+       else
+#endif
+       {
+           RENDER_EDGE_STEP_BIG (l);
+           RENDER_EDGE_STEP_BIG (r);
+           y += STEP_Y_BIG(N_BITS);
+           line += stride;
+       }
+    }
+}
+
+#undef rasterize_span
diff --git a/pixman/pixman-edge.c b/pixman/pixman-edge.c
new file mode 100644 (file)
index 0000000..8d498ab
--- /dev/null
@@ -0,0 +1,384 @@
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
+/*
+ * Step across a small sample grid gap
+ */
+#define RENDER_EDGE_STEP_SMALL(edge)                                   \
+    {                                                                  \
+       edge->x += edge->stepx_small;                                   \
+       edge->e += edge->dx_small;                                      \
+       if (edge->e > 0)                                                \
+       {                                                               \
+           edge->e -= edge->dy;                                        \
+           edge->x += edge->signdx;                                    \
+       }                                                               \
+    }
+
+/*
+ * Step across a large sample grid gap
+ */
+#define RENDER_EDGE_STEP_BIG(edge)                                     \
+    {                                                                  \
+       edge->x += edge->stepx_big;                                     \
+       edge->e += edge->dx_big;                                        \
+       if (edge->e > 0)                                                \
+       {                                                               \
+           edge->e -= edge->dy;                                        \
+           edge->x += edge->signdx;                                    \
+       }                                                               \
+    }
+
+#ifdef PIXMAN_FB_ACCESSORS
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors
+#else
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_no_accessors
+#endif
+
+/*
+ * 4 bit alpha
+ */
+
+#define N_BITS  4
+#define RASTERIZE_EDGES rasterize_edges_4
+
+#ifndef WORDS_BIGENDIAN
+#define SHIFT_4(o)      ((o) << 2)
+#else
+#define SHIFT_4(o)      ((1 - (o)) << 2)
+#endif
+
+#define GET_4(x, o)      (((x) >> SHIFT_4 (o)) & 0xf)
+#define PUT_4(x, o, v)                                                 \
+    (((x) & ~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o)))
+
+#define DEFINE_ALPHA(line, x)                                          \
+    uint8_t   *__ap = (uint8_t *) line + ((x) >> 1);                   \
+    int __ao = (x) & 1
+
+#define STEP_ALPHA      ((__ap += __ao), (__ao ^= 1))
+
+#define ADD_ALPHA(a)                                                   \
+    {                                                                  \
+        uint8_t __o = READ (image, __ap);                              \
+        uint8_t __a = (a) + GET_4 (__o, __ao);                         \
+        WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \
+    }
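+
+/* In ADD_ALPHA, (__a | (0 - ((__a) >> 4))) saturates the 4-bit sum:
+ * if __a overflowed past 0xf, (__a >> 4) is nonzero, the subtraction
+ * yields an all-ones mask, and PUT_4 then stores the clamped value
+ * 0xf.
+ */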
+
+#include "pixman-edge-imp.h"
+
+#undef ADD_ALPHA
+#undef STEP_ALPHA
+#undef DEFINE_ALPHA
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+
+/*
+ * 1 bit alpha
+ */
+
+#define N_BITS 1
+#define RASTERIZE_EDGES rasterize_edges_1
+
+#include "pixman-edge-imp.h"
+
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+/*
+ * 8 bit alpha
+ */
+
+static force_inline uint8_t
+clip255 (int x)
+{
+    if (x > 255)
+       return 255;
+
+    return x;
+}
+
+#define ADD_SATURATE_8(buf, val, length)                               \
+    do                                                                 \
+    {                                                                  \
+        int i__ = (length);                                            \
+        uint8_t *buf__ = (buf);                                                \
+        int val__ = (val);                                             \
+                                                                       \
+        while (i__--)                                                  \
+        {                                                              \
+            WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \
+            (buf__)++;                                                 \
+       }                                                               \
+    } while (0)
+
+/*
+ * We want to detect the case where we add the same value to a long
+ * span of pixels.  The triangles on the end are filled in while we
+ * count how many sub-pixel scanlines contribute to the middle section.
+ *
+ *                 +--------------------------+
+ *  fill_height =|   \                      /
+ *                     +------------------+
+ *                      |================|
+ *                   fill_start       fill_end
+ */
+static void
+rasterize_edges_8 (pixman_image_t *image,
+                   pixman_edge_t * l,
+                   pixman_edge_t * r,
+                   pixman_fixed_t  t,
+                   pixman_fixed_t  b)
+{
+    pixman_fixed_t y = t;
+    uint32_t  *line;
+    int fill_start = -1, fill_end = -1;
+    int fill_size = 0;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)
+    {
+        uint8_t *ap = (uint8_t *) line;
+        pixman_fixed_t lx, rx;
+        int lxi, rxi;
+
+        /* clip X */
+        lx = l->x;
+        if (lx < 0)
+           lx = 0;
+
+        rx = r->x;
+
+        if (pixman_fixed_to_int (rx) >= width)
+       {
+           /* Use the last pixel of the scanline, covered 100%.
+            * We can't use the first pixel following the scanline,
+            * because accessing it could result in a buffer overrun.
+            */
+           rx = pixman_int_to_fixed (width) - 1;
+       }
+
+        /* Skip empty (or backwards) sections */
+        if (rx > lx)
+        {
+            int lxs, rxs;
+
+            /* Find pixel bounds for span. */
+            lxi = pixman_fixed_to_int (lx);
+            rxi = pixman_fixed_to_int (rx);
+
+            /* Sample coverage for edge pixels */
+            lxs = RENDER_SAMPLES_X (lx, 8);
+            rxs = RENDER_SAMPLES_X (rx, 8);
+
+            /* Add coverage across row */
+            if (lxi == rxi)
+            {
+                WRITE (image, ap + lxi,
+                      clip255 (READ (image, ap + lxi) + rxs - lxs));
+           }
+            else
+            {
+                WRITE (image, ap + lxi,
+                      clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs));
+
+                /* Move forward so that lxi/rxi is the pixel span */
+                lxi++;
+
+                /* Don't bother trying to optimize the fill unless
+                * the span is longer than 4 pixels. */
+                if (rxi - lxi > 4)
+                {
+                    if (fill_start < 0)
+                    {
+                        fill_start = lxi;
+                        fill_end = rxi;
+                        fill_size++;
+                   }
+                    else
+                    {
+                        if (lxi >= fill_end || rxi < fill_start)
+                        {
+                            /* We're beyond what we saved, just fill it */
+                            ADD_SATURATE_8 (ap + fill_start,
+                                            fill_size * N_X_FRAC (8),
+                                            fill_end - fill_start);
+                            fill_start = lxi;
+                            fill_end = rxi;
+                            fill_size = 1;
+                       }
+                        else
+                        {
+                            /* Update fill_start */
+                            if (lxi > fill_start)
+                            {
+                                ADD_SATURATE_8 (ap + fill_start,
+                                                fill_size * N_X_FRAC (8),
+                                                lxi - fill_start);
+                                fill_start = lxi;
+                           }
+                            else if (lxi < fill_start)
+                            {
+                                ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8),
+                                                fill_start - lxi);
+                           }
+
+                            /* Update fill_end */
+                            if (rxi < fill_end)
+                            {
+                                ADD_SATURATE_8 (ap + rxi,
+                                                fill_size * N_X_FRAC (8),
+                                                fill_end - rxi);
+                                fill_end = rxi;
+                           }
+                            else if (fill_end < rxi)
+                            {
+                                ADD_SATURATE_8 (ap + fill_end,
+                                                N_X_FRAC (8),
+                                                rxi - fill_end);
+                           }
+                            fill_size++;
+                       }
+                   }
+               }
+                else
+                {
+                    ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi);
+               }
+
+                WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs));
+           }
+       }
+
+        if (y == b)
+        {
+            /* We're done, make sure we clean up any remaining fill. */
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+                                   0xff, fill_end - fill_start);
+               }
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+               }
+           }
+            break;
+       }
+
+        if (pixman_fixed_frac (y) != Y_FRAC_LAST (8))
+        {
+            RENDER_EDGE_STEP_SMALL (l);
+            RENDER_EDGE_STEP_SMALL (r);
+            y += STEP_Y_SMALL (8);
+       }
+        else
+        {
+            RENDER_EDGE_STEP_BIG (l);
+            RENDER_EDGE_STEP_BIG (r);
+            y += STEP_Y_BIG (8);
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+                                   0xff, fill_end - fill_start);
+               }
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+               }
+               
+                fill_start = fill_end = -1;
+                fill_size = 0;
+           }
+           
+            line += stride;
+       }
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+static
+#endif
+void
+PIXMAN_RASTERIZE_EDGES (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    switch (PIXMAN_FORMAT_BPP (image->bits.format))
+    {
+    case 1:
+       rasterize_edges_1 (image, l, r, t, b);
+       break;
+
+    case 4:
+       rasterize_edges_4 (image, l, r, t, b);
+       break;
+
+    case 8:
+       rasterize_edges_8 (image, l, r, t, b);
+       break;
+
+    default:
+        break;
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+
+PIXMAN_EXPORT void
+pixman_rasterize_edges (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    return_if_fail (image->type == BITS);
+    
+    if (image->bits.read_func || image->bits.write_func)
+       pixman_rasterize_edges_accessors (image, l, r, t, b);
+    else
+       pixman_rasterize_edges_no_accessors (image, l, r, t, b);
+}
+
+#endif
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
new file mode 100644 (file)
index 0000000..038dcf7
--- /dev/null
@@ -0,0 +1,2166 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
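+/* fetch_24 and store_24 access a 3-byte (0888) pixel as one 16-bit
+ * and one 8-bit memory operation, choosing the split from the low
+ * address bit so that the 16-bit access is always 2-byte aligned.
+ */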
+static force_inline uint32_t
+fetch_24 (uint8_t *a)
+{
+    if (((unsigned long)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+       return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+       return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+       return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+       return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+    }
+}
+
+static force_inline void
+store_24 (uint8_t *a,
+          uint32_t v)
+{
+    if (((unsigned long)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+       *a = (uint8_t) (v >> 16);
+       *(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+       *a = (uint8_t) (v);
+       *(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+       *(uint16_t *)a = (uint16_t)(v >> 8);
+       *(a + 2) = (uint8_t)v;
+#else
+       *(uint16_t *)a = (uint16_t)v;
+       *(a + 2) = (uint8_t)(v >> 16);
+#endif
+    }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+      uint32_t dest)
+{
+    uint32_t a = ~src >> 24;
+
+    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
+
+    return dest;
+}
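+
+/* "over" is the Porter-Duff OVER operator on premultiplied a8r8g8b8:
+ * per channel, dest becomes src + dest * (255 - src_alpha) / 255 (the
+ * bytewise multiply-add is done by UN8x4_MUL_UN8_ADD_UN8x4).  For a
+ * fully opaque source, a == 0 and dest is replaced by src unchanged.
+ */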
+
+static uint32_t
+in (uint32_t x,
+    uint8_t  y)
+{
+    uint16_t a = y;
+
+    UN8x4_MUL_UN8 (x, a);
+
+    return x;
+}
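+
+/* "in" is the IN operator with an 8-bit mask: all four channels of x
+ * are scaled by y / 255.
+ */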
+
+/*
+ * Naming convention:
+ *
+ *  op_src_mask_dest
+ */
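+/* For example, fast_composite_over_n_8_0565 applies OVER with a solid
+ * ("n") source and an a8 mask to an r5g6b5 destination.  An "x888"
+ * component is an 8888 format whose alpha byte is ignored, and a
+ * "_ca" suffix means the mask is applied per component.
+ */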
+static void
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line;
+    uint32_t    *dst, *dst_line;
+    uint8_t     *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    uint8_t m;
+    uint32_t s, d;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       src = src_line;
+       src_line += src_stride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+
+       w = width;
+       while (w--)
+       {
+           m = *mask++;
+           if (m)
+           {
+               s = *src | 0xff000000;
+
+               if (m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   d = in (s, m);
+                   *dst = over (d, *dst);
+               }
+           }
+           src++;
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint16_t t;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    if (srca == 0xff)
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+
+           while (w--)
+           {
+               m = *mask++;
+
+               if (m == 0)
+                   *dst = 0;
+               else if (m != 0xff)
+                   *dst = MUL_UN8 (m, *dst, t);
+
+               dst++;
+           }
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+
+           while (w--)
+           {
+               m = *mask++;
+               m = MUL_UN8 (m, srca, t);
+
+               if (m == 0)
+                   *dst = 0;
+               else if (m != 0xff)
+                   *dst = MUL_UN8 (m, *dst, t);
+
+               dst++;
+           }
+       }
+    }
+}
+
+static void
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+
+           if (s == 0)
+               *dst = 0;
+           else if (s != 0xff)
+               *dst = MUL_UN8 (s, *dst, t);
+
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           m = *mask++;
+           if (m == 0xff)
+           {
+               if (srca == 0xff)
+                   *dst = src;
+               else
+                   *dst = over (src, *dst);
+           }
+           else if (m)
+           {
+               d = in (src, m);
+               *dst = over (d, *dst);
+           }
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+                                  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           ma = *mask++;
+
+           if (ma)
+           {
+               d = *dst;
+               s = src;
+
+               UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
+
+               *dst = s;
+           }
+
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           ma = *mask++;
+           if (ma == 0xffffffff)
+           {
+               if (srca == 0xff)
+                   *dst = src;
+               else
+                   *dst = over (src, *dst);
+           }
+           else if (ma)
+           {
+               d = *dst;
+               s = src;
+
+               UN8x4_MUL_UN8x4 (s, ma);
+               UN8x4_MUL_UN8 (ma, srca);
+               ma = ~ma;
+               UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+               *dst = d;
+           }
+
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           m = *mask++;
+           if (m == 0xff)
+           {
+               if (srca == 0xff)
+               {
+                   d = src;
+               }
+               else
+               {
+                   d = fetch_24 (dst);
+                   d = over (src, d);
+               }
+               store_24 (dst, d);
+           }
+           else if (m)
+           {
+               d = over (in (src, m), fetch_24 (dst));
+               store_24 (dst, d);
+           }
+           dst += 3;
+       }
+    }
+}
+
+static void
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           m = *mask++;
+           if (m == 0xff)
+           {
+               if (srca == 0xff)
+               {
+                   d = src;
+               }
+               else
+               {
+                   d = *dst;
+                   d = over (src, CONVERT_0565_TO_0888 (d));
+               }
+               *dst = CONVERT_8888_TO_0565 (d);
+           }
+           else if (m)
+           {
+               d = *dst;
+               d = over (in (src, m), CONVERT_0565_TO_0888 (d));
+               *dst = CONVERT_8888_TO_0565 (d);
+           }
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  src, srca, s;
+    uint16_t  src16;
+    uint16_t *dst_line, *dst;
+    uint32_t  d;
+    uint32_t *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    src16 = CONVERT_8888_TO_0565 (src);
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           ma = *mask++;
+           if (ma == 0xffffffff)
+           {
+               if (srca == 0xff)
+               {
+                   *dst = src16;
+               }
+               else
+               {
+                   d = *dst;
+                   d = over (src, CONVERT_0565_TO_0888 (d));
+                   *dst = CONVERT_8888_TO_0565 (d);
+               }
+           }
+           else if (ma)
+           {
+               d = *dst;
+               d = CONVERT_0565_TO_0888 (d);
+
+               s = src;
+
+               UN8x4_MUL_UN8x4 (s, ma);
+               UN8x4_MUL_UN8 (ma, srca);
+               ma = ~ma;
+               UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+               *dst = CONVERT_8888_TO_0565 (d);
+           }
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           a = s >> 24;
+           if (a == 0xff)
+               *dst = s;
+           else if (s)
+               *dst = over (s, *dst);
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
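+       /* x888 sources have undefined alpha; force it to opaque so the
+        * result is a valid 8888 pixel.
+        */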
+       while (w--)
+           *dst++ = (*src++) | 0xff000000;
+    }
+}
+
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           a = s >> 24;
+           if (a)
+           {
+               if (a == 0xff)
+                   d = s;
+               else
+                   d = over (s, fetch_24 (dst));
+
+               store_24 (dst, d);
+           }
+           dst += 3;
+       }
+    }
+}
+#endif
+
+static void
+fast_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           a = s >> 24;
+           if (s)
+           {
+               if (a == 0xff)
+               {
+                   d = s;
+               }
+               else
+               {
+                   d = *dst;
+                   d = over (s, CONVERT_0565_TO_0888 (d));
+               }
+               *dst = CONVERT_8888_TO_0565 (d);
+           }
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           *dst = CONVERT_8888_TO_0565 (s);
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           if (s)
+           {
+               if (s != 0xff)
+               {
+                   d = *dst;
+                   t = d + s;
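+                   /* Saturating add: if t overflows 0xff, t >> 8 is 1 and
+                    * (0 - (t >> 8)) is all ones, so the OR clamps the
+                    * result to 0xff (e.g. 0xf0 + 0x20 -> 0x110 -> 0xff).
+                    */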
+                   s = t | (0 - (t >> 8));
+               }
+               *dst = s;
+           }
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           if (s)
+           {
+               if (s != 0xffffffff)
+               {
+                   d = *dst;
+                   if (d)
+                       UN8x4_ADD_UN8x4 (s, d);
+               }
+               *dst = s;
+           }
+           dst++;
+       }
+    }
+}
+
+static void
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    sa = (src >> 24);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w--)
+       {
+           uint16_t tmp;
+           uint16_t a;
+           uint32_t m, d;
+           uint32_t r;
+
+           a = *mask++;
+           d = *dst;
+
+           m = MUL_UN8 (sa, a, tmp);
+           r = ADD_UN8 (m, d, tmp);
+
+           *dst++ = r;
+       }
+    }
+}
+
+#ifdef WORDS_BIGENDIAN
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else
+#define CREATE_BITMASK(n) (1 << (n))
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
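+/* Example: with n = 3, CREATE_BITMASK yields 0x00000008 on little-endian
+ * and 0x10000000 on big-endian, matching the in-memory bit order of a1
+ * bitmaps on each architecture.
+ */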
+
+#define TEST_BIT(p, n)                                 \
+    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n)                                                  \
+    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0)
+
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint32_t     *src_line, *src;
+    int           dst_stride, src_stride;
+    int32_t       w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           /*
+            * TODO: improve performance by processing uint32_t data instead
+            *       of individual bits
+            */
+           if (TEST_BIT (src, src_x + w))
+               SET_BIT (dst, dest_x + w);
+       }
+    }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint32_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+
+    if (width <= 0)
+       return;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    if (srca == 0xff)
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+                   *dst = src;
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+                   *dst = over (src, *dst);
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+}
+
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint16_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    uint32_t     d;
+    uint16_t     src565;
+
+    if (width <= 0)
+       return;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    if (srca == 0xff)
+    {
+       src565 = CONVERT_8888_TO_0565 (src);
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+                   *dst = src565;
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           dst = dst_line;
+           dst_line += dst_stride;
+           mask = mask_line;
+           mask_line += mask_stride;
+           w = width;
+
+           bitcache = *mask++;
+           bitmask = CREATE_BITMASK (mask_x & 31);
+
+           while (w--)
+           {
+               if (bitmask == 0)
+               {
+                   bitcache = *mask++;
+                   bitmask = CREATE_BITMASK (0);
+               }
+               if (bitcache & bitmask)
+               {
+                   d = over (src, CONVERT_0565_TO_0888 (*dst));
+                   *dst = CONVERT_8888_TO_0565 (d);
+               }
+               bitmask = UPDATE_BITMASK (bitmask);
+               dst++;
+           }
+       }
+    }
+}
+
+/*
+ * Simple bitblt
+ */
+
+static void
+fast_composite_solid_fill (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (dest_image->bits.format == PIXMAN_a1)
+    {
+       src = src >> 31;
+    }
+    else if (dest_image->bits.format == PIXMAN_a8)
+    {
+       src = src >> 24;
+    }
+    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
+             dest_image->bits.format == PIXMAN_b5g6r5)
+    {
+       src = CONVERT_8888_TO_0565 (src);
+    }
+
+    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                 PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                 dest_x, dest_y,
+                 width, height,
+                 src);
+}
+
+static void
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+                          pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
+    uint32_t n_bytes = width * bpp;
+    int dst_stride, src_stride;
+    uint8_t    *dst;
+    uint8_t    *src;
+
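+    /* bits.rowstride is measured in uint32_t units; convert to bytes. */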
+    src_stride = src_image->bits.rowstride * 4;
+    dst_stride = dest_image->bits.rowstride * 4;
+
+    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
+    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
+
+    while (height--)
+    {
+       memcpy (dst, src, n_bytes);
+
+       dst += dst_stride;
+       src += src_stride;
+    }
+}
+
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+
+#define REPEAT_MIN_WIDTH    32
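+/* Sources narrower than this are replicated into a temporary scanline
+ * (extended_src below) so that the delegated fast path operates on runs
+ * of up to REPEAT_MIN_WIDTH pixels instead of many tiny ones.
+ */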
+
+static void
+fast_composite_tiled_repeat (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    pixman_composite_func_t func;
+    pixman_format_code_t mask_format;
+    uint32_t src_flags, mask_flags;
+
+    src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
+                   FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+
+    if (mask_image)
+    {
+       mask_format = mask_image->common.extended_format_code;
+       mask_flags = info->mask_flags;
+    }
+    else
+    {
+       mask_format = PIXMAN_null;
+       mask_flags = FAST_PATH_IS_OPAQUE;
+    }
+
+    if (_pixman_lookup_composite_function (
+           imp->toplevel, info->op,
+           src_image->common.extended_format_code, src_flags,
+           mask_format, mask_flags,
+           dest_image->common.extended_format_code, info->dest_flags,
+           &imp, &func))
+    {
+       int32_t sx, sy;
+       int32_t width_remain;
+       int32_t num_pixels;
+       int32_t src_width;
+       int32_t i, j;
+       pixman_image_t extended_src_image;
+       uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
+       pixman_bool_t need_src_extension;
+       uint32_t *src_line;
+       int32_t src_stride;
+       int32_t src_bpp;
+       pixman_composite_info_t info2 = *info;
+
+       src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+
+       if (src_image->bits.width < REPEAT_MIN_WIDTH &&
+           (src_bpp == 32 || src_bpp == 16 || src_bpp == 8))
+       {
+           sx = src_x;
+           sx = MOD (sx, src_image->bits.width);
+           sx += width;
+           src_width = 0;
+
+           while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
+               src_width += src_image->bits.width;
+
+           src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
+
+           /* Initialize/validate stack-allocated temporary image */
+           _pixman_bits_image_init (&extended_src_image, src_image->bits.format,
+                                    src_width, 1, &extended_src[0], src_stride);
+           _pixman_image_validate (&extended_src_image);
+
+           info2.src_image = &extended_src_image;
+           need_src_extension = TRUE;
+       }
+       else
+       {
+           src_width = src_image->bits.width;
+           need_src_extension = FALSE;
+       }
+
+       sx = src_x;
+       sy = src_y;
+
+       while (--height >= 0)
+       {
+           sx = MOD (sx, src_width);
+           sy = MOD (sy, src_image->bits.height);
+
+           if (need_src_extension)
+           {
+               if (src_bpp == 32)
+               {
+                   PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
+
+                   for (i = 0; i < src_width; )
+                   {
+                       for (j = 0; j < src_image->bits.width; j++, i++)
+                           extended_src[i] = src_line[j];
+                   }
+               }
+               else if (src_bpp == 16)
+               {
+                   uint16_t *src_line_16;
+
+                   PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
+                                          src_line_16, 1);
+                   src_line = (uint32_t*)src_line_16;
+
+                   for (i = 0; i < src_width; )
+                   {
+                       for (j = 0; j < src_image->bits.width; j++, i++)
+                           ((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
+                   }
+               }
+               else if (src_bpp == 8)
+               {
+                   uint8_t *src_line_8;
+
+                   PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
+                                          src_line_8, 1);
+                   src_line = (uint32_t*)src_line_8;
+
+                   for (i = 0; i < src_width; )
+                   {
+                       for (j = 0; j < src_image->bits.width; j++, i++)
+                           ((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
+                   }
+               }
+
+               info2.src_y = 0;
+           }
+           else
+           {
+               info2.src_y = sy;
+           }
+
+           width_remain = width;
+
+           while (width_remain > 0)
+           {
+               num_pixels = src_width - sx;
+
+               if (num_pixels > width_remain)
+                   num_pixels = width_remain;
+
+               info2.src_x = sx;
+               info2.width = num_pixels;
+               info2.height = 1;
+
+               func (imp, &info2);
+
+               width_remain -= num_pixels;
+               info2.mask_x += num_pixels;
+               info2.dest_x += num_pixels;
+               sx = 0;
+           }
+
+           sx = src_x;
+           sy++;
+           info2.mask_x = info->mask_x;
+           info2.mask_y++;
+           info2.dest_x = info->dest_x;
+           info2.dest_y++;
+       }
+
+       if (need_src_extension)
+           _pixman_image_fini (&extended_src_image);
+    }
+    else
+    {
+       _pixman_log_error (FUNC, "Didn't find a suitable function");
+    }
+}
+
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
+                                    const uint16_t * src,
+                                    int32_t          w,
+                                    pixman_fixed_t   vx,
+                                    pixman_fixed_t   unit_x,
+                                    pixman_fixed_t   max_vx,
+                                    pixman_bool_t    fully_transparent_src)
+{
+    uint16_t tmp1, tmp2, tmp3, tmp4;
+    while ((w -= 4) >= 0)
+    {
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp3 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp4 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       *dst++ = tmp1;
+       *dst++ = tmp2;
+       *dst++ = tmp3;
+       *dst++ = tmp4;
+    }
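+    /* After the unrolled loop w is negative (remainder - 4); in two's
+     * complement its low two bits still equal the remainder, so the
+     * (w & 2) and (w & 1) tails below pick up the leftover pixels.
+     */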
+    if (w & 2)
+    {
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       *dst++ = tmp1;
+       *dst++ = tmp2;
+    }
+    if (w & 1)
+       *dst++ = src[pixman_fixed_to_int (vx)];
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+                      scaled_nearest_scanline_565_565_SRC,
+                      uint16_t, uint16_t, PAD)
+
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+              pixman_format_code_t format,
+              uint32_t *src, int x, int src_width)
+{
+    if (repeat (src_repeat, &x, src_width))
+    {
+       if (format == PIXMAN_x8r8g8b8)
+           return *(src + x) | 0xff000000;
+       else
+           return *(src + x);
+    }
+    else
+    {
+       return 0;
+    }
+}
+
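+/* OVER for premultiplied pixels: dst = src + (1 - alpha (src)) * dst.
+ * A fully opaque source short-circuits to a plain store.
+ */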
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+    if (s)
+    {
+       uint8_t ia = 0xff - (s >> 24);
+
+       if (ia)
+           UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
+       else
+           *dst = s;
+    }
+}
+
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+    *dst = s;
+}
+
+static void
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t       *dst_line;
+    uint32_t       *src_line;
+    int             dst_stride, src_stride;
+    int                    src_width, src_height;
+    pixman_repeat_t src_repeat;
+    pixman_fixed_t unit_x, unit_y;
+    pixman_format_code_t src_format;
+    pixman_vector_t v;
+    pixman_fixed_t vy;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    /* Pass 0 instead of src_x and src_y, because those coordinates still
+     * need to be transformed from destination space to source space.
+     */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))
+       return;
+
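+    /* The fast-path flags for this routine only allow scale + translate
+     * transforms, so the diagonal matrix entries are the per-pixel steps
+     * in source space.
+     */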
+    unit_x = src_image->common.transform->matrix[0][0];
+    unit_y = src_image->common.transform->matrix[1][1];
+
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
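+    /* pixman_fixed_e is the smallest 16.16 increment (1/65536), so a
+     * coordinate of exactly N + 0.5 becomes N + 0.5 - e and truncates to N.
+     */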
+    v.vector[0] -= pixman_fixed_e;
+    v.vector[1] -= pixman_fixed_e;
+
+    src_height = src_image->bits.height;
+    src_width = src_image->bits.width;
+    src_repeat = src_image->common.repeat;
+    src_format = src_image->bits.format;
+
+    vy = v.vector[1];
+    while (height--)
+    {
+        pixman_fixed_t vx = v.vector[0];
+       int y = pixman_fixed_to_int (vy);
+       uint32_t *dst = dst_line;
+
+       dst_line += dst_stride;
+
+        /* Adjust the y location by a unit vector in the y direction;
+         * this is equivalent to transforming y + 1 of the destination
+         * point to source space. */
+        vy += unit_y;
+
+       if (!repeat (src_repeat, &y, src_height))
+       {
+           if (op == PIXMAN_OP_SRC)
+               memset (dst, 0, sizeof (*dst) * width);
+       }
+       else
+       {
+           int w = width;
+
+           uint32_t *src = src_line + y * src_stride;
+
+           while (w >= 2)
+           {
+               uint32_t s1, s2;
+               int x1, x2;
+
+               x1 = pixman_fixed_to_int (vx);
+               vx += unit_x;
+
+               x2 = pixman_fixed_to_int (vx);
+               vx += unit_x;
+
+               w -= 2;
+
+               s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+               s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+               if (op == PIXMAN_OP_OVER)
+               {
+                   combine_over (s1, dst++);
+                   combine_over (s2, dst++);
+               }
+               else
+               {
+                   combine_src (s1, dst++);
+                   combine_src (s2, dst++);
+               }
+           }
+
+           while (w--)
+           {
+               uint32_t s;
+               int x;
+
+               x = pixman_fixed_to_int (vx);
+               vx += unit_x;
+
+               s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+               if (op == PIXMAN_OP_OVER)
+                   combine_over (s, dst++);
+               else
+                   combine_src (s, dst++);
+           }
+       }
+    }
+}
+
+#define CACHE_LINE_SIZE 64
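+/* 64 bytes matches the line size of most contemporary x86 and ARM cores;
+ * if the actual line size differs, the tiling below merely loses a little
+ * efficiency and stays correct.
+ */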
+
+#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
+                                int             dst_stride,                  \
+                                const pix_type *src,                         \
+                                int             src_stride,                  \
+                                int             w,                           \
+                                int             h)                           \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+       const pix_type *s = src + (h - y - 1);                                \
+       pix_type *d = dst + dst_stride * y;                                   \
+       for (x = 0; x < w; x++)                                               \
+       {                                                                     \
+           *d++ = *s;                                                        \
+           s += src_stride;                                                  \
+       }                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
+                                 int             dst_stride,                 \
+                                 const pix_type *src,                        \
+                                 int             src_stride,                 \
+                                 int             w,                          \
+                                 int             h)                          \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+       const pix_type *s = src + src_stride * (w - 1) + y;                   \
+       pix_type *d = dst + dst_stride * y;                                   \
+       for (x = 0; x < w; x++)                                               \
+       {                                                                     \
+           *d++ = *s;                                                        \
+           s -= src_stride;                                                  \
+       }                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_##suffix (pix_type       *dst,                                 \
+                        int             dst_stride,                          \
+                        const pix_type *src,                                 \
+                        int             src_stride,                          \
+                        int             W,                                   \
+                        int             H)                                   \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * Split processing into TILE_SIZExH cache-line aligned vertical          \
+     * stripes of the destination (optimistically assuming that the          \
+     * destination stride is a multiple of the cache line size; if it is     \
+     * not, processing is just slightly slower)                              \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+       leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (leading_pixels > W)                                               \
+           leading_pixels = W;                                               \
+                                                                              \
+       /* unaligned leading part NxH (where N < TILE_SIZE) */                \
+       blt_rotated_90_trivial_##suffix (                                     \
+           dst,                                                              \
+           dst_stride,                                                       \
+           src,                                                              \
+           src_stride,                                                       \
+           leading_pixels,                                                   \
+           H);                                                               \
+                                                                             \
+       dst += leading_pixels;                                                \
+       src += leading_pixels * src_stride;                                   \
+       W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+       trailing_pixels = (((uintptr_t)(dst + W) &                            \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (trailing_pixels > W)                                              \
+           trailing_pixels = W;                                              \
+       W -= trailing_pixels;                                                 \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+       /* aligned middle part TILE_SIZExH */                                 \
+       blt_rotated_90_trivial_##suffix (                                     \
+           dst + x,                                                          \
+           dst_stride,                                                       \
+           src + src_stride * x,                                             \
+           src_stride,                                                       \
+           TILE_SIZE,                                                        \
+           H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+       /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+       blt_rotated_90_trivial_##suffix (                                     \
+           dst + W,                                                          \
+           dst_stride,                                                       \
+           src + W * src_stride,                                             \
+           src_stride,                                                       \
+           trailing_pixels,                                                  \
+           H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_##suffix (pix_type       *dst,                                \
+                         int             dst_stride,                         \
+                         const pix_type *src,                                \
+                         int             src_stride,                         \
+                         int             W,                                  \
+                         int             H)                                  \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * Split processing into TILE_SIZExH cache-line aligned vertical          \
+     * stripes of the destination (optimistically assuming that the          \
+     * destination stride is a multiple of the cache line size; if it is     \
+     * not, processing is just slightly slower)                              \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+       leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (leading_pixels > W)                                               \
+           leading_pixels = W;                                               \
+                                                                              \
+       /* unaligned leading part NxH (where N < TILE_SIZE) */                \
+       blt_rotated_270_trivial_##suffix (                                    \
+           dst,                                                              \
+           dst_stride,                                                       \
+           src + src_stride * (W - leading_pixels),                          \
+           src_stride,                                                       \
+           leading_pixels,                                                   \
+           H);                                                               \
+                                                                             \
+       dst += leading_pixels;                                                \
+       W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+       trailing_pixels = (((uintptr_t)(dst + W) &                            \
+                           (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+       if (trailing_pixels > W)                                              \
+           trailing_pixels = W;                                              \
+       W -= trailing_pixels;                                                 \
+       src += trailing_pixels * src_stride;                                  \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+       /* aligned middle part TILE_SIZExH */                                 \
+       blt_rotated_270_trivial_##suffix (                                    \
+           dst + x,                                                          \
+           dst_stride,                                                       \
+           src + src_stride * (W - x - TILE_SIZE),                           \
+           src_stride,                                                       \
+           TILE_SIZE,                                                        \
+           H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+       /* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+       blt_rotated_270_trivial_##suffix (                                    \
+           dst + W,                                                          \
+           dst_stride,                                                       \
+           src - trailing_pixels * src_stride,                               \
+           src_stride,                                                       \
+           trailing_pixels,                                                  \
+           H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
+                                  pixman_composite_info_t *info)             \
+{                                                                            \
+    PIXMAN_COMPOSITE_ARGS (info);                                            \
+    pix_type       *dst_line;                                                \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+                          dst_stride, dst_line, 1);                          \
+    src_x_t = -src_y + pixman_fixed_to_int (                                  \
+                               src_image->common.transform->matrix[0][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
+    src_y_t = src_x + pixman_fixed_to_int (                                   \
+                               src_image->common.transform->matrix[1][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+                          src_stride, src_line, 1);                          \
+    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
+                            width, height);                                  \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
+                                   pixman_composite_info_t *info)            \
+{                                                                             \
+    PIXMAN_COMPOSITE_ARGS (info);                                            \
+    pix_type       *dst_line;                                                \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+                          dst_stride, dst_line, 1);                          \
+    src_x_t = src_y + pixman_fixed_to_int (                                   \
+                               src_image->common.transform->matrix[0][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    src_y_t = -src_x + pixman_fixed_to_int (                                  \
+                               src_image->common.transform->matrix[1][2] +   \
+                               pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+                          src_stride, src_line, 1);                          \
+    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
+                             width, height);                                 \
+}
+
+FAST_SIMPLE_ROTATE (8, uint8_t)
+FAST_SIMPLE_ROTATE (565, uint16_t)
+FAST_SIMPLE_ROTATE (8888, uint32_t)
+
+static const pixman_fast_path_t c_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+#define NEAREST_FAST_PATH(op,s,d)              \
+    {   PIXMAN_OP_ ## op,                      \
+       PIXMAN_ ## s, SCALED_NEAREST_FLAGS,     \
+       PIXMAN_null, 0,                         \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+       fast_composite_scaled_nearest,          \
+    }
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+#define SIMPLE_ROTATE_FLAGS(angle)                                       \
+    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM  |                         \
+     FAST_PATH_NEAREST_FILTER                  |                         \
+     FAST_PATH_SAMPLES_COVER_CLIP_NEAREST      |                         \
+     FAST_PATH_STANDARD_FLAGS)
+
+#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)                           \
+    {   PIXMAN_OP_ ## op,                                                \
+       PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),                           \
+       PIXMAN_null, 0,                                                   \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
+       fast_composite_rotate_90_##suffix,                                \
+    },                                                                   \
+    {   PIXMAN_OP_ ## op,                                                \
+       PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),                          \
+       PIXMAN_null, 0,                                                   \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                           \
+       fast_composite_rotate_270_##suffix,                               \
+    }
+
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
+
+    /* Simple repeat fast path entry. */
+    {  PIXMAN_OP_any,
+       PIXMAN_any,
+       (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
+        FAST_PATH_NORMAL_REPEAT),
+       PIXMAN_any, 0,
+       PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
+       fast_composite_tiled_repeat
+    },
+
+    {   PIXMAN_OP_NONE },
+};
+
+#ifdef WORDS_BIGENDIAN
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (32 - (offs) - (n)))
+#else
+#define A1_FILL_MASK(n, offs) (((1 << (n)) - 1) << (offs))
+#endif
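+
+/* Example: on a little-endian machine, A1_FILL_MASK (4, 3) evaluates to
+ * ((1 << 4) - 1) << 3 == 0x78, i.e. it covers bit positions 3..6 of the
+ * word; the big-endian variant puts the same four bits at the opposite
+ * end of the word: 0xf << 25 == 0x1e000000.
+ */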
+
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+    if (offs)
+    {
+       int leading_pixels = 32 - offs;
+       if (leading_pixels >= width)
+       {
+           if (v)
+               *dst |= A1_FILL_MASK (width, offs);
+           else
+               *dst &= ~A1_FILL_MASK (width, offs);
+           return;
+       }
+       else
+       {
+           if (v)
+               *dst++ |= A1_FILL_MASK (leading_pixels, offs);
+           else
+               *dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+           width -= leading_pixels;
+       }
+    }
+    while (width >= 32)
+    {
+       if (v)
+           *dst++ = 0xFFFFFFFF;
+       else
+           *dst++ = 0;
+       width -= 32;
+    }
+    if (width > 0)
+    {
+       if (v)
+           *dst |= A1_FILL_MASK (width, 0);
+       else
+           *dst &= ~A1_FILL_MASK (width, 0);
+    }
+}
+
+static void
+pixman_fill1 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t  xor)
+{
+    uint32_t *dst = bits + y * stride + (x >> 5);
+    int offs = x & 31;
+
+    if (xor & 1)
+    {
+       while (height--)
+       {
+           pixman_fill1_line (dst, offs, width, 1);
+           dst += stride;
+       }
+    }
+    else
+    {
+       while (height--)
+       {
+           pixman_fill1_line (dst, offs, width, 0);
+           dst += stride;
+       }
+    }
+}
+
+static void
+pixman_fill8 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t xor)
+{
+    int byte_stride = stride * (int) sizeof (uint32_t);
+    uint8_t *dst = (uint8_t *) bits;
+    uint8_t v = xor & 0xff;
+    int i;
+
+    dst = dst + y * byte_stride + x;
+
+    while (height--)
+    {
+       for (i = 0; i < width; ++i)
+           dst[i] = v;
+
+       dst += byte_stride;
+    }
+}
+
+static void
+pixman_fill16 (uint32_t *bits,
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t xor)
+{
+    int short_stride =
+       (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
+    uint16_t *dst = (uint16_t *)bits;
+    uint16_t v = xor & 0xffff;
+    int i;
+
+    dst = dst + y * short_stride + x;
+
+    while (height--)
+    {
+       for (i = 0; i < width; ++i)
+           dst[i] = v;
+
+       dst += short_stride;
+    }
+}
+
+static void
+pixman_fill32 (uint32_t *bits,
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t  xor)
+{
+    int i;
+
+    bits = bits + y * stride + x;
+
+    while (height--)
+    {
+       for (i = 0; i < width; ++i)
+           bits[i] = xor;
+
+       bits += stride;
+    }
+}
+
+static pixman_bool_t
+fast_path_fill (pixman_implementation_t *imp,
+                uint32_t *               bits,
+                int                      stride,
+                int                      bpp,
+                int                      x,
+                int                      y,
+                int                      width,
+                int                      height,
+                uint32_t                xor)
+{
+    switch (bpp)
+    {
+    case 1:
+       pixman_fill1 (bits, stride, x, y, width, height, xor);
+       break;
+
+    case 8:
+       pixman_fill8 (bits, stride, x, y, width, height, xor);
+       break;
+
+    case 16:
+       pixman_fill16 (bits, stride, x, y, width, height, xor);
+       break;
+
+    case 32:
+       pixman_fill32 (bits, stride, x, y, width, height, xor);
+       break;
+
+    default:
+       return _pixman_implementation_fill (
+           imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    }
+
+    return TRUE;
+}
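+
+/* A minimal caller-side sketch (the buffer name and sizes are made up):
+ * the public pixman_fill () entry point walks the implementation chain
+ * and, for bpp values of 1, 8, 16 or 32, ends up in fast_path_fill ()
+ * above.
+ *
+ *   uint32_t pixels[64 * 64];
+ *
+ *   pixman_fill (pixels, 64, 32, 10, 10, 16, 16, 0xff0000ff);
+ */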
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
+
+    imp->fill = fast_path_fill;
+
+    return imp;
+}
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
new file mode 100644 (file)
index 0000000..2ccdfcd
--- /dev/null
@@ -0,0 +1,264 @@
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+static void
+general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+    if (image->type == SOLID)
+       _pixman_solid_fill_iter_init (image, iter);
+    else if (image->type == LINEAR)
+       _pixman_linear_gradient_iter_init (image, iter);
+    else if (image->type == RADIAL)
+       _pixman_radial_gradient_iter_init (image, iter);
+    else if (image->type == CONICAL)
+       _pixman_conical_gradient_iter_init (image, iter);
+    else if (image->type == BITS)
+       _pixman_bits_image_src_iter_init (image, iter);
+    else
+       _pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+}
+
+static void
+general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    if (iter->image->type == BITS)
+    {
+       _pixman_bits_image_dest_iter_init (iter->image, iter);
+    }
+    else
+    {
+       _pixman_log_error (FUNC, "Trying to write to a non-writable image");
+    }
+}
+
+typedef struct op_info_t op_info_t;
+struct op_info_t
+{
+    uint8_t src, dst;
+};
+
+#define ITER_IGNORE_BOTH                                               \
+    (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
+
+static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
+{
+    /* Src                   Dst                   */
+    { ITER_IGNORE_BOTH,      ITER_IGNORE_BOTH      }, /* CLEAR */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_BOTH      }, /* SRC */
+    { ITER_IGNORE_BOTH,      ITER_LOCALIZED_ALPHA  }, /* DST */
+    { 0,                     ITER_LOCALIZED_ALPHA  }, /* OVER */
+    { ITER_LOCALIZED_ALPHA,  0                     }, /* OVER_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* IN */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* IN_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* OUT */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* OUT_REVERSE */
+    { 0,                     0                     }, /* ATOP */
+    { 0,                     0                     }, /* ATOP_REVERSE */
+    { 0,                     0                     }, /* XOR */
+    { ITER_LOCALIZED_ALPHA,  ITER_LOCALIZED_ALPHA  }, /* ADD */
+    { 0,                     0                     }, /* SATURATE */
+};
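+
+/* For example, op_flags[PIXMAN_OP_OVER] is { 0, ITER_LOCALIZED_ALPHA }:
+ * no channel of the source can be ignored, while the alpha channel of
+ * the destination only influences the pixel it belongs to.
+ */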
+
+#define SCANLINE_BUFFER_LENGTH 8192
+
+static void
+general_composite_rect  (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
+    uint8_t *src_buffer, *mask_buffer, *dest_buffer;
+    pixman_iter_t src_iter, mask_iter, dest_iter;
+    pixman_combine_32_func_t compose;
+    pixman_bool_t component_alpha;
+    iter_flags_t narrow, src_flags;
+    int Bpp;
+    int i;
+
+    if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT)                &&
+       (!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
+       (dest_image->common.flags & FAST_PATH_NARROW_FORMAT))
+    {
+       narrow = ITER_NARROW;
+       Bpp = 4;
+    }
+    else
+    {
+       narrow = 0;
+       Bpp = 8;
+    }
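+
+    /* "Narrow" means everything fits in 8 bits per channel, so the
+     * intermediate scanlines use 32 bits per pixel; the wide path
+     * expands to 16 bits per channel and uses the 64-bit combiners
+     * selected further below.
+     */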
+
+    if (width * Bpp > SCANLINE_BUFFER_LENGTH)
+    {
+       scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+
+       if (!scanline_buffer)
+           return;
+    }
+
+    src_buffer = scanline_buffer;
+    mask_buffer = src_buffer + width * Bpp;
+    dest_buffer = mask_buffer + width * Bpp;
+
+    /* src iter */
+    src_flags = narrow | op_flags[op].src;
+
+    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src_image,
+                                         src_x, src_y, width, height,
+                                         src_buffer, src_flags);
+
+    /* mask iter */
+    if ((src_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+       (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+       /* If it doesn't matter what the source is, then it doesn't matter
+        * what the mask is
+        */
+       mask_image = NULL;
+    }
+
+    component_alpha =
+        mask_image                           &&
+        mask_image->common.type == BITS       &&
+        mask_image->common.component_alpha    &&
+        PIXMAN_FORMAT_RGB (mask_image->bits.format);
+
+    _pixman_implementation_src_iter_init (
+       imp->toplevel, &mask_iter, mask_image, mask_x, mask_y, width, height,
+       mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB));
+
+    /* dest iter */
+    _pixman_implementation_dest_iter_init (
+       imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height,
+       dest_buffer, narrow | op_flags[op].dst);
+
+    if (narrow)
+    {
+       if (component_alpha)
+           compose = _pixman_implementation_combine_32_ca;
+       else
+           compose = _pixman_implementation_combine_32;
+    }
+    else
+    {
+       if (component_alpha)
+           compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64_ca;
+       else
+           compose = (pixman_combine_32_func_t)_pixman_implementation_combine_64;
+    }
+
+    if (!compose)
+    {
+       if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
+           free (scanline_buffer);
+       return;
+    }
+
+    for (i = 0; i < height; ++i)
+    {
+       uint32_t *s, *m, *d;
+
+       m = mask_iter.get_scanline (&mask_iter, NULL);
+       s = src_iter.get_scanline (&src_iter, m);
+       d = dest_iter.get_scanline (&dest_iter, NULL);
+
+       compose (imp->toplevel, op, d, s, m, width);
+
+       dest_iter.write_back (&dest_iter);
+    }
+
+    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
+       free (scanline_buffer);
+}
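+
+/* The loop above is the classic general pipeline: fetch one scanline
+ * each of mask, source and destination, combine them with the
+ * operator's combiner, then write the result back.  In pseudo-C, one
+ * iteration is essentially:
+ *
+ *   m = fetch (mask);  s = fetch (src);  d = fetch (dest);
+ *   d = combine (op, d, s, m);
+ *   store (dest, d);
+ */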
+
+static const pixman_fast_path_t general_fast_path[] =
+{
+    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,        0, PIXMAN_any, 0, general_composite_rect },
+    { PIXMAN_OP_NONE }
+};
+
+static pixman_bool_t
+general_blt (pixman_implementation_t *imp,
+             uint32_t *               src_bits,
+             uint32_t *               dst_bits,
+             int                      src_stride,
+             int                      dst_stride,
+             int                      src_bpp,
+             int                      dst_bpp,
+             int                      src_x,
+             int                      src_y,
+             int                      dest_x,
+             int                      dest_y,
+             int                      width,
+             int                      height)
+{
+    /* We can't blit unless we have SSE2 or MMX. */
+
+    return FALSE;
+}
+
+static pixman_bool_t
+general_fill (pixman_implementation_t *imp,
+              uint32_t *               bits,
+              int                      stride,
+              int                      bpp,
+              int                      x,
+              int                      y,
+              int                      width,
+              int                      height,
+              uint32_t xor)
+{
+    return FALSE;
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_general (void)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
+
+    _pixman_setup_combiner_functions_32 (imp);
+    _pixman_setup_combiner_functions_64 (imp);
+
+    imp->blt = general_blt;
+    imp->fill = general_fill;
+    imp->src_iter_init = general_src_iter_init;
+    imp->dest_iter_init = general_dest_iter_init;
+
+    return imp;
+}
+
diff --git a/pixman/pixman-gradient-walker.c b/pixman/pixman-gradient-walker.c
new file mode 100644 (file)
index 0000000..dd666b4
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+                              gradient_t *              gradient,
+                              unsigned int              spread)
+{
+    walker->num_stops = gradient->n_stops;
+    walker->stops     = gradient->stops;
+    walker->left_x    = 0;
+    walker->right_x   = 0x10000;
+    walker->stepper   = 0;
+    walker->left_ag   = 0;
+    walker->left_rb   = 0;
+    walker->right_ag  = 0;
+    walker->right_rb  = 0;
+    walker->spread    = spread;
+
+    walker->need_reset = TRUE;
+}
+
+void
+_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      pos)
+{
+    int32_t x, left_x, right_x;
+    pixman_color_t          *left_c, *right_c;
+    int n, count = walker->num_stops;
+    pixman_gradient_stop_t *      stops = walker->stops;
+
+    static const pixman_color_t transparent_black = { 0, 0, 0, 0 };
+
+    switch (walker->spread)
+    {
+    case PIXMAN_REPEAT_NORMAL:
+       x = (int32_t)pos & 0xFFFF;
+       for (n = 0; n < count; n++)
+           if (x < stops[n].x)
+               break;
+       if (n == 0)
+       {
+           left_x =  stops[count - 1].x - 0x10000;
+           left_c = &stops[count - 1].color;
+       }
+       else
+       {
+           left_x =  stops[n - 1].x;
+           left_c = &stops[n - 1].color;
+       }
+
+       if (n == count)
+       {
+           right_x =  stops[0].x + 0x10000;
+           right_c = &stops[0].color;
+       }
+       else
+       {
+           right_x =  stops[n].x;
+           right_c = &stops[n].color;
+       }
+       left_x  += (pos - x);
+       right_x += (pos - x);
+       break;
+
+    case PIXMAN_REPEAT_PAD:
+       for (n = 0; n < count; n++)
+           if (pos < stops[n].x)
+               break;
+
+       if (n == 0)
+       {
+           left_x =  INT32_MIN;
+           left_c = &stops[0].color;
+       }
+       else
+       {
+           left_x =  stops[n - 1].x;
+           left_c = &stops[n - 1].color;
+       }
+
+       if (n == count)
+       {
+           right_x =  INT32_MAX;
+           right_c = &stops[n - 1].color;
+       }
+       else
+       {
+           right_x =  stops[n].x;
+           right_c = &stops[n].color;
+       }
+       break;
+
+    case PIXMAN_REPEAT_REFLECT:
+       x = (int32_t)pos & 0xFFFF;
+       if ((int32_t)pos & 0x10000)
+           x = 0x10000 - x;
+       for (n = 0; n < count; n++)
+           if (x < stops[n].x)
+               break;
+
+       if (n == 0)
+       {
+           left_x =  -stops[0].x;
+           left_c = &stops[0].color;
+       }
+       else
+       {
+           left_x =  stops[n - 1].x;
+           left_c = &stops[n - 1].color;
+       }
+
+       if (n == count)
+       {
+           right_x = 0x20000 - stops[n - 1].x;
+           right_c = &stops[n - 1].color;
+       }
+       else
+       {
+           right_x =  stops[n].x;
+           right_c = &stops[n].color;
+       }
+
+       if ((int32_t)pos & 0x10000)
+       {
+           pixman_color_t  *tmp_c;
+           int32_t tmp_x;
+
+           tmp_x   = 0x10000 - right_x;
+           right_x = 0x10000 - left_x;
+           left_x  = tmp_x;
+
+           tmp_c   = right_c;
+           right_c = left_c;
+           left_c  = tmp_c;
+
+           x = 0x10000 - x;
+       }
+       left_x  += (pos - x);
+       right_x += (pos - x);
+       break;
+
+    default:  /* REPEAT_NONE */
+       for (n = 0; n < count; n++)
+           if (pos < stops[n].x)
+               break;
+
+       if (n == 0)
+       {
+           left_x  =  INT32_MIN;
+           right_x =  stops[0].x;
+           left_c  = right_c = (pixman_color_t*) &transparent_black;
+       }
+       else if (n == count)
+       {
+           left_x  = stops[n - 1].x;
+           right_x = INT32_MAX;
+           left_c  = right_c = (pixman_color_t*) &transparent_black;
+       }
+       else
+       {
+           left_x  =  stops[n - 1].x;
+           right_x =  stops[n].x;
+           left_c  = &stops[n - 1].color;
+           right_c = &stops[n].color;
+       }
+    }
+
+    walker->left_x   = left_x;
+    walker->right_x  = right_x;
+    walker->left_ag  = ((left_c->alpha >> 8) << 16)   | (left_c->green >> 8);
+    walker->left_rb  = ((left_c->red & 0xff00) << 8)  | (left_c->blue >> 8);
+    walker->right_ag = ((right_c->alpha >> 8) << 16)  | (right_c->green >> 8);
+    walker->right_rb = ((right_c->red & 0xff00) << 8) | (right_c->blue >> 8);
+
+    if (walker->left_x == walker->right_x ||
+       (walker->left_ag == walker->right_ag &&
+        walker->left_rb == walker->right_rb))
+    {
+       walker->stepper = 0;
+    }
+    else
+    {
+       int32_t width = right_x - left_x;
+       walker->stepper = ((1 << 24) + width / 2) / width;
+    }
+
+    walker->need_reset = FALSE;
+}
+
+#define  PIXMAN_GRADIENT_WALKER_NEED_RESET(w, x)                         \
+    ( (w)->need_reset || (x) < (w)->left_x || (x) >= (w)->right_x)
+
+
+/* The walker is reset below whenever the requested position x falls
+ * outside the current [left_x, right_x) interval.
+ */
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      x)
+{
+    int dist, idist;
+    uint32_t t1, t2, a, color;
+
+    if (PIXMAN_GRADIENT_WALKER_NEED_RESET (walker, x))
+       _pixman_gradient_walker_reset (walker, x);
+
+    dist  = ((int)(x - walker->left_x) * walker->stepper) >> 16;
+    idist = 256 - dist;
+
+    /* combined INTERPOLATE and premultiply */
+    t1 = walker->left_rb * idist + walker->right_rb * dist;
+    t1 = (t1 >> 8) & 0xff00ff;
+
+    t2  = walker->left_ag * idist + walker->right_ag * dist;
+    t2 &= 0xff00ff00;
+
+    color = t2 & 0xff000000;
+    a     = t2 >> 24;
+
+    t1  = t1 * a + 0x800080;
+    t1  = (t1 + ((t1 >> 8) & 0xff00ff)) >> 8;
+
+    t2  = (t2 >> 8) * a + 0x800080;
+    t2  = (t2 + ((t2 >> 8) & 0xff00ff));
+
+    return (color | (t1 & 0xff00ff) | (t2 & 0xff00));
+}
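+
+/* A rough sketch of how a gradient scanline might be generated with the
+ * walker; gradient, width, buffer and the per-pixel fixed-point
+ * positions pos[i] are assumed to come from the caller:
+ *
+ *   pixman_gradient_walker_t walker;
+ *
+ *   _pixman_gradient_walker_init (&walker, gradient, PIXMAN_REPEAT_PAD);
+ *   for (i = 0; i < width; i++)
+ *       buffer[i] = _pixman_gradient_walker_pixel (&walker, pos[i]);
+ *
+ * The stepper computed in the reset function maps the current stop
+ * interval to an 8-bit interpolation weight, so dist above stays within
+ * [0, 256].
+ */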
+
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
new file mode 100644 (file)
index 0000000..afe587f
--- /dev/null
@@ -0,0 +1,837 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops)
+{
+    return_val_if_fail (n_stops > 0, FALSE);
+
+    gradient->stops = pixman_malloc_ab (n_stops, sizeof (pixman_gradient_stop_t));
+    if (!gradient->stops)
+       return FALSE;
+
+    memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t));
+
+    gradient->n_stops = n_stops;
+
+    return TRUE;
+}
+
+void
+_pixman_image_init (pixman_image_t *image)
+{
+    image_common_t *common = &image->common;
+
+    pixman_region32_init (&common->clip_region);
+
+    common->alpha_count = 0;
+    common->have_clip_region = FALSE;
+    common->clip_sources = FALSE;
+    common->transform = NULL;
+    common->repeat = PIXMAN_REPEAT_NONE;
+    common->filter = PIXMAN_FILTER_NEAREST;
+    common->filter_params = NULL;
+    common->n_filter_params = 0;
+    common->alpha_map = NULL;
+    common->component_alpha = FALSE;
+    common->ref_count = 1;
+    common->property_changed = NULL;
+    common->client_clip = FALSE;
+    common->destroy_func = NULL;
+    common->destroy_data = NULL;
+    common->dirty = TRUE;
+}
+
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    common->ref_count--;
+
+    if (common->ref_count == 0)
+    {
+       if (image->common.destroy_func)
+           image->common.destroy_func (image, image->common.destroy_data);
+
+       pixman_region32_fini (&common->clip_region);
+
+       if (common->transform)
+           free (common->transform);
+
+       if (common->filter_params)
+           free (common->filter_params);
+
+       if (common->alpha_map)
+           pixman_image_unref ((pixman_image_t *)common->alpha_map);
+
+       if (image->type == LINEAR ||
+           image->type == RADIAL ||
+           image->type == CONICAL)
+       {
+           if (image->gradient.stops)
+               free (image->gradient.stops);
+       }
+
+       if (image->type == BITS && image->bits.free_me)
+           free (image->bits.free_me);
+
+       return TRUE;
+    }
+
+    return FALSE;
+}
+
+pixman_image_t *
+_pixman_image_allocate (void)
+{
+    pixman_image_t *image = malloc (sizeof (pixman_image_t));
+
+    if (image)
+       _pixman_image_init (image);
+
+    return image;
+}
+
+static void
+image_property_changed (pixman_image_t *image)
+{
+    image->common.dirty = TRUE;
+}
+
+/* Ref Counting */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_ref (pixman_image_t *image)
+{
+    image->common.ref_count++;
+
+    return image;
+}
+
+/* returns TRUE when the image is freed */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_unref (pixman_image_t *image)
+{
+    if (_pixman_image_fini (image))
+    {
+       free (image);
+       return TRUE;
+    }
+
+    return FALSE;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_destroy_function (pixman_image_t *            image,
+                                   pixman_image_destroy_func_t func,
+                                   void *                      data)
+{
+    image->common.destroy_func = func;
+    image->common.destroy_data = data;
+}
+
+PIXMAN_EXPORT void *
+pixman_image_get_destroy_data (pixman_image_t *image)
+{
+    return image->common.destroy_data;
+}
+
+void
+_pixman_image_reset_clip_region (pixman_image_t *image)
+{
+    image->common.have_clip_region = FALSE;
+}
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, workarounds were
+ * added to allow certain out-of-bounds accesses, and this function
+ * disabled those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now
+ * this function is a no-op.
+ */
+PIXMAN_EXPORT void
+pixman_disable_out_of_bounds_workaround (void)
+{
+}
+
+static void
+compute_image_info (pixman_image_t *image)
+{
+    pixman_format_code_t code;
+    uint32_t flags = 0;
+
+    /* Transform */
+    if (!image->common.transform)
+    {
+       flags |= (FAST_PATH_ID_TRANSFORM        |
+                 FAST_PATH_X_UNIT_POSITIVE     |
+                 FAST_PATH_Y_UNIT_ZERO         |
+                 FAST_PATH_AFFINE_TRANSFORM);
+    }
+    else
+    {
+       flags |= FAST_PATH_HAS_TRANSFORM;
+
+       if (image->common.transform->matrix[2][0] == 0                  &&
+           image->common.transform->matrix[2][1] == 0                  &&
+           image->common.transform->matrix[2][2] == pixman_fixed_1)
+       {
+           flags |= FAST_PATH_AFFINE_TRANSFORM;
+
+           if (image->common.transform->matrix[0][1] == 0 &&
+               image->common.transform->matrix[1][0] == 0)
+           {
+               if (image->common.transform->matrix[0][0] == -pixman_fixed_1 &&
+                   image->common.transform->matrix[1][1] == -pixman_fixed_1)
+               {
+                   flags |= FAST_PATH_ROTATE_180_TRANSFORM;
+               }
+               flags |= FAST_PATH_SCALE_TRANSFORM;
+           }
+           else if (image->common.transform->matrix[0][0] == 0 &&
+                    image->common.transform->matrix[1][1] == 0)
+           {
+               pixman_fixed_t m01 = image->common.transform->matrix[0][1];
+               if (m01 == -image->common.transform->matrix[1][0])
+               {
+                   if (m01 == -pixman_fixed_1)
+                       flags |= FAST_PATH_ROTATE_90_TRANSFORM;
+                   else if (m01 == pixman_fixed_1)
+                       flags |= FAST_PATH_ROTATE_270_TRANSFORM;
+               }
+           }
+       }
+
+       if (image->common.transform->matrix[0][0] > 0)
+           flags |= FAST_PATH_X_UNIT_POSITIVE;
+
+       if (image->common.transform->matrix[1][0] == 0)
+           flags |= FAST_PATH_Y_UNIT_ZERO;
+    }
+
+    /* Filter */
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+       flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+       break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+       flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+
+       /* Here we have a chance to optimize BILINEAR filter to NEAREST if
+        * they are equivalent for the currently used transformation matrix.
+        */
+       if (flags & FAST_PATH_ID_TRANSFORM)
+       {
+           flags |= FAST_PATH_NEAREST_FILTER;
+       }
+       else if (
+           /* affine and integer translation components in matrix ... */
+           ((flags & FAST_PATH_AFFINE_TRANSFORM) &&
+            !pixman_fixed_frac (image->common.transform->matrix[0][2] |
+                                image->common.transform->matrix[1][2])) &&
+           (
+               /* ... combined with a simple rotation */
+               (flags & (FAST_PATH_ROTATE_90_TRANSFORM |
+                         FAST_PATH_ROTATE_180_TRANSFORM |
+                         FAST_PATH_ROTATE_270_TRANSFORM)) ||
+               /* ... or combined with a simple non-rotated translation */
+               (image->common.transform->matrix[0][0] == pixman_fixed_1 &&
+                image->common.transform->matrix[1][1] == pixman_fixed_1 &&
+                image->common.transform->matrix[0][1] == 0 &&
+                image->common.transform->matrix[1][0] == 0)
+               )
+           )
+       {
+           /* FIXME: there are some affine-test failures, showing that
+            * handling of BILINEAR and NEAREST filter is not quite
+            * equivalent when getting close to 32K for the translation
+            * components of the matrix. That's likely some bug, but for
+            * now just skip BILINEAR->NEAREST optimization in this case.
+            */
+           pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
+           if (image->common.transform->matrix[0][2] <= magic_limit  &&
+               image->common.transform->matrix[1][2] <= magic_limit  &&
+               image->common.transform->matrix[0][2] >= -magic_limit &&
+               image->common.transform->matrix[1][2] >= -magic_limit)
+           {
+               flags |= FAST_PATH_NEAREST_FILTER;
+           }
+       }
+       break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+       break;
+
+    default:
+       flags |= FAST_PATH_NO_CONVOLUTION_FILTER;
+       break;
+    }
+
+    /* Repeat mode */
+    switch (image->common.repeat)
+    {
+    case PIXMAN_REPEAT_NONE:
+       flags |=
+           FAST_PATH_NO_REFLECT_REPEAT         |
+           FAST_PATH_NO_PAD_REPEAT             |
+           FAST_PATH_NO_NORMAL_REPEAT;
+       break;
+
+    case PIXMAN_REPEAT_REFLECT:
+       flags |=
+           FAST_PATH_NO_PAD_REPEAT             |
+           FAST_PATH_NO_NONE_REPEAT            |
+           FAST_PATH_NO_NORMAL_REPEAT;
+       break;
+
+    case PIXMAN_REPEAT_PAD:
+       flags |=
+           FAST_PATH_NO_REFLECT_REPEAT         |
+           FAST_PATH_NO_NONE_REPEAT            |
+           FAST_PATH_NO_NORMAL_REPEAT;
+       break;
+
+    default:
+       flags |=
+           FAST_PATH_NO_REFLECT_REPEAT         |
+           FAST_PATH_NO_PAD_REPEAT             |
+           FAST_PATH_NO_NONE_REPEAT;
+       break;
+    }
+
+    /* Component alpha */
+    if (image->common.component_alpha)
+       flags |= FAST_PATH_COMPONENT_ALPHA;
+    else
+       flags |= FAST_PATH_UNIFIED_ALPHA;
+
+    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NARROW_FORMAT);
+
+    /* Type specific checks */
+    switch (image->type)
+    {
+    case SOLID:
+       code = PIXMAN_solid;
+
+       if (image->solid.color.alpha == 0xffff)
+           flags |= FAST_PATH_IS_OPAQUE;
+       break;
+
+    case BITS:
+       if (image->bits.width == 1      &&
+           image->bits.height == 1     &&
+           image->common.repeat != PIXMAN_REPEAT_NONE)
+       {
+           code = PIXMAN_solid;
+       }
+       else
+       {
+           code = image->bits.format;
+           flags |= FAST_PATH_BITS_IMAGE;
+       }
+
+       if (!PIXMAN_FORMAT_A (image->bits.format)                               &&
+           PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY         &&
+           PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR)
+       {
+           flags |= FAST_PATH_SAMPLES_OPAQUE;
+
+           if (image->common.repeat != PIXMAN_REPEAT_NONE)
+               flags |= FAST_PATH_IS_OPAQUE;
+       }
+
+       if (image->bits.read_func || image->bits.write_func)
+           flags &= ~FAST_PATH_NO_ACCESSORS;
+
+       if (PIXMAN_FORMAT_IS_WIDE (image->bits.format))
+           flags &= ~FAST_PATH_NARROW_FORMAT;
+       break;
+
+    case RADIAL:
+       code = PIXMAN_unknown;
+
+       /*
+        * As explained in pixman-radial-gradient.c, every point of
+        * the plane has a valid associated radius (and thus will be
+        * colored) if and only if a is negative (i.e. one of the two
+        * circles contains the other one).
+        */
+
+        if (image->radial.a >= 0)
+           break;
+
+       /* Fall through */
+
+    case CONICAL:
+    case LINEAR:
+       code = PIXMAN_unknown;
+
+       if (image->common.repeat != PIXMAN_REPEAT_NONE)
+       {
+           int i;
+
+           flags |= FAST_PATH_IS_OPAQUE;
+           for (i = 0; i < image->gradient.n_stops; ++i)
+           {
+               if (image->gradient.stops[i].color.alpha != 0xffff)
+               {
+                   flags &= ~FAST_PATH_IS_OPAQUE;
+                   break;
+               }
+           }
+       }
+       break;
+
+    default:
+       code = PIXMAN_unknown;
+       break;
+    }
+
+    /* Alpha map */
+    if (!image->common.alpha_map)
+    {
+       flags |= FAST_PATH_NO_ALPHA_MAP;
+    }
+    else
+    {
+       if (PIXMAN_FORMAT_IS_WIDE (image->common.alpha_map->format))
+           flags &= ~FAST_PATH_NARROW_FORMAT;
+    }
+
+    /* Both alpha maps and convolution filters can introduce
+     * non-opaqueness into otherwise opaque images. Also, an image with
+     * component alpha turned on is only opaque if all of its channels
+     * are opaque, so we simply clear the opaqueness flags
+     * unconditionally for those images.
+     */
+    if (image->common.alpha_map                                        ||
+       image->common.filter == PIXMAN_FILTER_CONVOLUTION       ||
+       image->common.component_alpha)
+    {
+       flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE);
+    }
+
+    image->common.flags = flags;
+    image->common.extended_format_code = code;
+}
+
+void
+_pixman_image_validate (pixman_image_t *image)
+{
+    if (image->common.dirty)
+    {
+       compute_image_info (image);
+
+       /* It is important that property_changed is
+        * called *after* compute_image_info() because
+        * property_changed() can make use of the flags
+        * to set up accessors etc.
+        */
+       if (image->common.property_changed)
+           image->common.property_changed (image);
+
+       image->common.dirty = FALSE;
+    }
+
+    if (image->common.alpha_map)
+       _pixman_image_validate ((pixman_image_t *)image->common.alpha_map);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region32 (pixman_image_t *   image,
+                                pixman_region32_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+       if ((result = pixman_region32_copy (&common->clip_region, region)))
+           image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+       _pixman_image_reset_clip_region (image);
+
+       result = TRUE;
+    }
+
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region (pixman_image_t *   image,
+                              pixman_region16_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+       if ((result = pixman_region32_copy_from_region16 (&common->clip_region, region)))
+           image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+       _pixman_image_reset_clip_region (image);
+
+       result = TRUE;
+    }
+
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_has_client_clip (pixman_image_t *image,
+                                  pixman_bool_t   client_clip)
+{
+    image->common.client_clip = client_clip;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_transform (pixman_image_t *          image,
+                            const pixman_transform_t *transform)
+{
+    static const pixman_transform_t id =
+    {
+       { { pixman_fixed_1, 0, 0 },
+         { 0, pixman_fixed_1, 0 },
+         { 0, 0, pixman_fixed_1 } }
+    };
+
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (common->transform == transform)
+       return TRUE;
+
+    if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0)
+    {
+       free (common->transform);
+       common->transform = NULL;
+       result = TRUE;
+
+       goto out;
+    }
+
+    if (common->transform &&
+       memcmp (common->transform, transform, sizeof (pixman_transform_t)) == 0)
+    {
+       return TRUE;
+    }
+
+    if (common->transform == NULL)
+       common->transform = malloc (sizeof (pixman_transform_t));
+
+    if (common->transform == NULL)
+    {
+       result = FALSE;
+
+       goto out;
+    }
+
+    memcpy (common->transform, transform, sizeof(pixman_transform_t));
+
+    result = TRUE;
+
+out:
+    image_property_changed (image);
+
+    return result;
+}
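+
+/* Typical usage with illustrative values; pixman_transform_init_scale ()
+ * and pixman_double_to_fixed () are part of the public API.  The
+ * transform maps destination coordinates to source coordinates, so a
+ * factor of 2 shows the source at half size:
+ *
+ *   pixman_transform_t scale;
+ *
+ *   pixman_transform_init_scale (&scale,
+ *                                pixman_double_to_fixed (2.0),
+ *                                pixman_double_to_fixed (2.0));
+ *   pixman_image_set_transform (image, &scale);
+ */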
+
+PIXMAN_EXPORT void
+pixman_image_set_repeat (pixman_image_t *image,
+                         pixman_repeat_t repeat)
+{
+    if (image->common.repeat == repeat)
+       return;
+
+    image->common.repeat = repeat;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_filter (pixman_image_t *      image,
+                         pixman_filter_t       filter,
+                         const pixman_fixed_t *params,
+                         int                   n_params)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_fixed_t *new_params;
+
+    if (params == common->filter_params && filter == common->filter)
+       return TRUE;
+
+    new_params = NULL;
+    if (params)
+    {
+       new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t));
+       if (!new_params)
+           return FALSE;
+
+       memcpy (new_params,
+               params, n_params * sizeof (pixman_fixed_t));
+    }
+
+    common->filter = filter;
+
+    if (common->filter_params)
+       free (common->filter_params);
+
+    common->filter_params = new_params;
+    common->n_filter_params = n_params;
+
+    image_property_changed (image);
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_source_clipping (pixman_image_t *image,
+                                  pixman_bool_t   clip_sources)
+{
+    if (image->common.clip_sources == clip_sources)
+       return;
+
+    image->common.clip_sources = clip_sources;
+
+    image_property_changed (image);
+}
+
+/* Unlike all the other property setters, this function does not
+ * copy the content of indexed. Doing this copying is simply
+ * way, way too expensive.
+ */
+PIXMAN_EXPORT void
+pixman_image_set_indexed (pixman_image_t *        image,
+                          const pixman_indexed_t *indexed)
+{
+    bits_image_t *bits = (bits_image_t *)image;
+
+    if (bits->indexed == indexed)
+       return;
+
+    bits->indexed = indexed;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_alpha_map (pixman_image_t *image,
+                            pixman_image_t *alpha_map,
+                            int16_t         x,
+                            int16_t         y)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    return_if_fail (!alpha_map || alpha_map->type == BITS);
+
+    if (alpha_map && common->alpha_count > 0)
+    {
+       /* If this image is being used as an alpha map itself,
+        * then you can't give it an alpha map of its own.
+        */
+       return;
+    }
+
+    if (alpha_map && alpha_map->common.alpha_map)
+    {
+       /* If the image has an alpha map of its own,
+        * then it can't be used as an alpha map itself
+        */
+       return;
+    }
+
+    if (common->alpha_map != (bits_image_t *)alpha_map)
+    {
+       if (common->alpha_map)
+       {
+           common->alpha_map->common.alpha_count--;
+
+           pixman_image_unref ((pixman_image_t *)common->alpha_map);
+       }
+
+       if (alpha_map)
+       {
+           common->alpha_map = (bits_image_t *)pixman_image_ref (alpha_map);
+
+           common->alpha_map->common.alpha_count++;
+       }
+       else
+       {
+           common->alpha_map = NULL;
+       }
+    }
+
+    common->alpha_origin_x = x;
+    common->alpha_origin_y = y;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_component_alpha   (pixman_image_t *image,
+                                    pixman_bool_t   component_alpha)
+{
+    if (image->common.component_alpha == component_alpha)
+       return;
+
+    image->common.component_alpha = component_alpha;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_get_component_alpha   (pixman_image_t       *image)
+{
+    return image->common.component_alpha;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_accessors (pixman_image_t *           image,
+                            pixman_read_memory_func_t  read_func,
+                            pixman_write_memory_func_t write_func)
+{
+    return_if_fail (image != NULL);
+
+    if (image->type == BITS)
+    {
+       image->bits.read_func = read_func;
+       image->bits.write_func = write_func;
+
+       image_property_changed (image);
+    }
+}
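+
+/* Sketch of an accessor pair; the function bodies are made up, and size
+ * is the pixel size in bytes:
+ *
+ *   static uint32_t
+ *   my_read (const void *src, int size)
+ *   {
+ *       switch (size)
+ *       {
+ *       case 1:  return *(const uint8_t *) src;
+ *       case 2:  return *(const uint16_t *) src;
+ *       default: return *(const uint32_t *) src;
+ *       }
+ *   }
+ *
+ *   static void
+ *   my_write (void *dst, uint32_t value, int size)
+ *   {
+ *       switch (size)
+ *       {
+ *       case 1:  *(uint8_t *) dst = value;   break;
+ *       case 2:  *(uint16_t *) dst = value;  break;
+ *       default: *(uint32_t *) dst = value;  break;
+ *       }
+ *   }
+ *
+ *   pixman_image_set_accessors (image, my_read, my_write);
+ */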
+
+PIXMAN_EXPORT uint32_t *
+pixman_image_get_data (pixman_image_t *image)
+{
+    if (image->type == BITS)
+       return image->bits.bits;
+
+    return NULL;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_width (pixman_image_t *image)
+{
+    if (image->type == BITS)
+       return image->bits.width;
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_height (pixman_image_t *image)
+{
+    if (image->type == BITS)
+       return image->bits.height;
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_stride (pixman_image_t *image)
+{
+    if (image->type == BITS)
+       return image->bits.rowstride * (int) sizeof (uint32_t);
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_depth (pixman_image_t *image)
+{
+    if (image->type == BITS)
+       return PIXMAN_FORMAT_DEPTH (image->bits.format);
+
+    return 0;
+}
+
+PIXMAN_EXPORT pixman_format_code_t
+pixman_image_get_format (pixman_image_t *image)
+{
+    if (image->type == BITS)
+       return image->bits.format;
+
+    return 0;
+}
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+                        pixman_image_t *         image,
+                         pixman_format_code_t     format)
+{
+    uint32_t result;
+    pixman_iter_t iter;
+
+    _pixman_implementation_src_iter_init (
+       imp, &iter, image, 0, 0, 1, 1,
+       (uint8_t *)&result, ITER_NARROW);
+
+    result = *iter.get_scanline (&iter, NULL);
+
+    /* If necessary, convert RGB <--> BGR. */
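+    /* For example, 0x80ff0000 (a = 0x80, r = 0xff in ARGB order)
+     * becomes 0x800000ff.
+     */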
+    if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB)
+    {
+       result = (((result & 0xff000000) >>  0) |
+                 ((result & 0x00ff0000) >> 16) |
+                 ((result & 0x0000ff00) >>  0) |
+                 ((result & 0x000000ff) << 16));
+    }
+
+    return result;
+}
diff --git a/pixman/pixman-implementation.c b/pixman/pixman-implementation.c
new file mode 100644 (file)
index 0000000..2b7b19d
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static void
+delegate_combine_32 (pixman_implementation_t * imp,
+                     pixman_op_t               op,
+                     uint32_t *                dest,
+                     const uint32_t *          src,
+                     const uint32_t *          mask,
+                     int                       width)
+{
+    _pixman_implementation_combine_32 (imp->delegate,
+                                       op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_64 (pixman_implementation_t * imp,
+                     pixman_op_t               op,
+                     uint64_t *                dest,
+                     const uint64_t *          src,
+                     const uint64_t *          mask,
+                     int                       width)
+{
+    _pixman_implementation_combine_64 (imp->delegate,
+                                       op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_32_ca (pixman_implementation_t * imp,
+                        pixman_op_t               op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                       width)
+{
+    _pixman_implementation_combine_32_ca (imp->delegate,
+                                          op, dest, src, mask, width);
+}
+
+static void
+delegate_combine_64_ca (pixman_implementation_t * imp,
+                        pixman_op_t               op,
+                        uint64_t *                dest,
+                        const uint64_t *          src,
+                        const uint64_t *          mask,
+                        int                       width)
+{
+    _pixman_implementation_combine_64_ca (imp->delegate,
+                                          op, dest, src, mask, width);
+}
+
+static pixman_bool_t
+delegate_blt (pixman_implementation_t * imp,
+              uint32_t *                src_bits,
+              uint32_t *                dst_bits,
+              int                       src_stride,
+              int                       dst_stride,
+              int                       src_bpp,
+              int                       dst_bpp,
+              int                       src_x,
+              int                       src_y,
+              int                       dest_x,
+              int                       dest_y,
+              int                       width,
+              int                       height)
+{
+    return _pixman_implementation_blt (
+       imp->delegate, src_bits, dst_bits, src_stride, dst_stride,
+       src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y,
+       width, height);
+}
+
+static pixman_bool_t
+delegate_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride,
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t                 xor)
+{
+    return _pixman_implementation_fill (
+       imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+}
+
+static void
+delegate_src_iter_init (pixman_implementation_t *imp,
+                       pixman_iter_t *          iter)
+{
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+static void
+delegate_dest_iter_init (pixman_implementation_t *imp,
+                        pixman_iter_t *          iter)
+{
+    imp->delegate->dest_iter_init (imp->delegate, iter);
+}
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *delegate,
+                              const pixman_fast_path_t *fast_paths)
+{
+    pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
+    pixman_implementation_t *d;
+    int i;
+
+    if (!imp)
+       return NULL;
+
+    assert (fast_paths);
+
+    /* Make sure the whole delegate chain has the right toplevel */
+    imp->delegate = delegate;
+    for (d = imp; d != NULL; d = d->delegate)
+       d->toplevel = imp;
+
+    /* Fill out function pointers with ones that just delegate
+     */
+    imp->blt = delegate_blt;
+    imp->fill = delegate_fill;
+    imp->src_iter_init = delegate_src_iter_init;
+    imp->dest_iter_init = delegate_dest_iter_init;
+
+    for (i = 0; i < PIXMAN_N_OPERATORS; ++i)
+    {
+       imp->combine_32[i] = delegate_combine_32;
+       imp->combine_64[i] = delegate_combine_64;
+       imp->combine_32_ca[i] = delegate_combine_32_ca;
+       imp->combine_64_ca[i] = delegate_combine_64_ca;
+    }
+
+    imp->fast_paths = fast_paths;
+
+    return imp;
+}
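+
+/* The implementation structs form a delegation chain.  A sketch of how
+ * two of the layers are stacked (see
+ * _pixman_implementation_create_general () in pixman-general.c and
+ * _pixman_implementation_create_fast_path () in pixman-fast-path.c):
+ *
+ *   pixman_implementation_t *general, *fast;
+ *
+ *   general = _pixman_implementation_create_general ();
+ *   fast    = _pixman_implementation_create_fast_path (general);
+ *
+ * Any entry point an implementation does not override falls through to
+ * its delegate via the delegate_* wrappers above.
+ */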
+
+void
+_pixman_implementation_combine_32 (pixman_implementation_t * imp,
+                                   pixman_op_t               op,
+                                   uint32_t *                dest,
+                                   const uint32_t *          src,
+                                   const uint32_t *          mask,
+                                   int                       width)
+{
+    (*imp->combine_32[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_64 (pixman_implementation_t * imp,
+                                   pixman_op_t               op,
+                                   uint64_t *                dest,
+                                   const uint64_t *          src,
+                                   const uint64_t *          mask,
+                                   int                       width)
+{
+    (*imp->combine_64[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_32_ca (pixman_implementation_t * imp,
+                                      pixman_op_t               op,
+                                      uint32_t *                dest,
+                                      const uint32_t *          src,
+                                      const uint32_t *          mask,
+                                      int                       width)
+{
+    (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width);
+}
+
+void
+_pixman_implementation_combine_64_ca (pixman_implementation_t * imp,
+                                      pixman_op_t               op,
+                                      uint64_t *                dest,
+                                      const uint64_t *          src,
+                                      const uint64_t *          mask,
+                                      int                       width)
+{
+    (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width);
+}
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t * imp,
+                            uint32_t *                src_bits,
+                            uint32_t *                dst_bits,
+                            int                       src_stride,
+                            int                       dst_stride,
+                            int                       src_bpp,
+                            int                       dst_bpp,
+                            int                       src_x,
+                            int                       src_y,
+                            int                       dest_x,
+                            int                       dest_y,
+                            int                       width,
+                            int                       height)
+{
+    return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride,
+                       src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y,
+                       width, height);
+}
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int                      stride,
+                             int                      bpp,
+                             int                      x,
+                             int                      y,
+                             int                      width,
+                             int                      height,
+                             uint32_t                 xor)
+{
+    return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor);
+}
+
+void
+_pixman_implementation_src_iter_init (pixman_implementation_t  *imp,
+                                     pixman_iter_t             *iter,
+                                     pixman_image_t            *image,
+                                     int                        x,
+                                     int                        y,
+                                     int                        width,
+                                     int                        height,
+                                     uint8_t                   *buffer,
+                                     iter_flags_t               flags)
+{
+    iter->image = image;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x = x;
+    iter->y = y;
+    iter->width = width;
+    iter->height = height;
+    iter->flags = flags;
+
+    (*imp->src_iter_init) (imp, iter);
+}
+
+void
+_pixman_implementation_dest_iter_init (pixman_implementation_t *imp,
+                                      pixman_iter_t            *iter,
+                                      pixman_image_t           *image,
+                                      int                       x,
+                                      int                       y,
+                                      int                       width,
+                                      int                       height,
+                                      uint8_t                  *buffer,
+                                      iter_flags_t              flags)
+{
+    iter->image = image;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x = x;
+    iter->y = y;
+    iter->width = width;
+    iter->height = height;
+    iter->flags = flags;
+
+    (*imp->dest_iter_init) (imp, iter);
+}
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
new file mode 100644 (file)
index 0000000..3532867
--- /dev/null
+++ b/pixman/pixman-inlines.h
@@ -0,0 +1,1280 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+/* Flags describing input parameters to the fast path macro template.
+ * Turning on a flag may indicate either that
+ * "some property X is available, so the template can use it" or that
+ * "some property X should be handled by the template".
+ *
+ * FLAG_HAVE_SOLID_MASK
+ *  The input mask is solid, so the template should handle it.
+ *
+ * FLAG_HAVE_NON_SOLID_MASK
+ *  The input mask is a bits mask, so the template should handle it.
+ *
+ * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
+ * exclusive. (It is not allowed to turn both flags on.)
+ */
+#define FLAG_NONE                              (0)
+#define FLAG_HAVE_SOLID_MASK                   (1 <<   1)
+#define FLAG_HAVE_NON_SOLID_MASK               (1 <<   2)
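+
+/* As an illustration (editor's note): instantiating the bilinear main loop
+ * template near the end of this header with 'flags' set to
+ * FLAG_HAVE_SOLID_MASK selects the solid-mask code path,
+ * FLAG_HAVE_NON_SOLID_MASK selects the bits-mask path, and FLAG_NONE
+ * produces a maskless fast path. */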
+
+/* To avoid excessively short repeated scanline function calls, extend
+ * source scanlines whose width is less than the constant value below.
+ */
+#define REPEAT_NORMAL_MIN_WIDTH                        64
+
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+       if (*c < 0 || *c >= size)
+           return FALSE;
+    }
+    else if (repeat == PIXMAN_REPEAT_NORMAL)
+    {
+       while (*c >= size)
+           *c -= size;
+       while (*c < 0)
+           *c += size;
+    }
+    else if (repeat == PIXMAN_REPEAT_PAD)
+    {
+       *c = CLIP (*c, 0, size - 1);
+    }
+    else /* REFLECT */
+    {
+       *c = MOD (*c, size * 2);
+       if (*c >= size)
+           *c = size * 2 - *c - 1;
+    }
+    return TRUE;
+}
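+
+/* A worked example (editor's illustration): with size = 10 and
+ * PIXMAN_REPEAT_REFLECT, a coordinate of 13 first becomes MOD (13, 20) = 13;
+ * since 13 >= 10 it reflects to 2 * 10 - 13 - 1 = 6. In other words, the
+ * scanline range 0..9 is mirrored into 9..0 for coordinates 10..19. */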
+
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+                       uint32_t bl, uint32_t br,
+                       int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+
+    distxy = distx * disty;
+    distxiy = distx * (256 - disty);
+    distixy = (256 - distx) * disty;
+    distixiy = (256 - distx) * (256 - disty);
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull;
+
+    /* Red and Green */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+    return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+                       uint32_t bl, uint32_t br,
+                       int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+
+    distxy = distx * disty;
+    distxiy = (distx << 8) - distxy;   /* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;   /* disty * (256 - distx) */
+    distixiy =
+       256 * 256 - (disty << 8) -
+       (distx << 8) + distxy;          /* (256 - distx) * (256 - disty) */
+
+    /* Blue */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    tl >>= 16;
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;
+
+    /* Red */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
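+
+/* Both variants above compute, for each 8-bit channel c of the four
+ * corner pixels (with distx and disty in the 0-255 range):
+ *
+ *     result_c = (tl_c * (256 - distx) * (256 - disty) +
+ *                 tr_c * distx         * (256 - disty) +
+ *                 bl_c * (256 - distx) * disty         +
+ *                 br_c * distx         * disty) >> 16
+ *
+ * The packed arithmetic simply evaluates this for two channels at a time.
+ */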
+
+/*
+ * For each scanline fetched from a source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to count only the pixels which are fetched from the image
+ * All this information is returned via the 'width', 'left_pad' and
+ * 'right_pad' arguments. The code assumes that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+                               pixman_fixed_t  vx,
+                               pixman_fixed_t  unit_x,
+                               int32_t *       width,
+                               int32_t *       left_pad,
+                               int32_t *       right_pad)
+{
+    int64_t max_vx = (int64_t) source_image_width << 16;
+    int64_t tmp;
+    if (vx < 0)
+    {
+       tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+       if (tmp > *width)
+       {
+           *left_pad = *width;
+           *width = 0;
+       }
+       else
+       {
+           *left_pad = (int32_t) tmp;
+           *width -= (int32_t) tmp;
+       }
+    }
+    else
+    {
+       *left_pad = 0;
+    }
+    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+    if (tmp < 0)
+    {
+       *right_pad = *width;
+       *width = 0;
+    }
+    else if (tmp >= *width)
+    {
+       *right_pad = 0;
+    }
+    else
+    {
+       *right_pad = *width - (int32_t) tmp;
+       *width = (int32_t) tmp;
+    }
+}
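+
+/* A worked example (editor's illustration): for a 100 pixel wide source,
+ * vx = -2 << 16, unit_x = 1 << 16 and an initial *width of 110, the code
+ * above yields left_pad = 2, *width = 100 and right_pad = 8: destination
+ * pixels 0-1 sample to the left of the image, pixels 2-101 sample source
+ * columns 0-99, and pixels 102-109 sample past the right edge. */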
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions: one that handles NORMAL repeat,
+ * and one without repeat handling that only works if the source region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instruction latencies by pipelining operations). Unrolling further
+ * does not make much sense because the compiler would soon start running
+ * out of spare registers.
+ */
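+
+/* As a hypothetical illustration (editor's note; the actual instantiations
+ * live in the implementation sources), the FAST_NEAREST template defined
+ * below could be invoked as
+ *
+ *     FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t,
+ *                   SRC, COVER)
+ *
+ * which generates scaled_nearest_scanline_8888_565_cover_SRC () together
+ * with a matching fast_composite_scaled_nearest_8888_565_cover_SRC ()
+ * main loop. */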
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+/* This is not actually used since we don't have an OVER with
+ * a 565 source, but it is needed for the code to build. */
+#define GET_0565_ALPHA(s) 0xff
+#define GET_x888_ALPHA(s) 0xff
+
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,                      \
+                             src_type_t, dst_type_t, OP, repeat_mode)                          \
+static force_inline void                                                                       \
+scanline_func_name (dst_type_t       *dst,                                                     \
+                   const src_type_t *src,                                                      \
+                   int32_t           w,                                                        \
+                   pixman_fixed_t    vx,                                                       \
+                   pixman_fixed_t    unit_x,                                                   \
+                   pixman_fixed_t    max_vx,                                                   \
+                   pixman_bool_t     fully_transparent_src)                                    \
+{                                                                                              \
+       uint32_t   d;                                                                           \
+       src_type_t s1, s2;                                                                      \
+       uint8_t    a1, a2;                                                                      \
+       int        x1, x2;                                                                      \
+                                                                                               \
+       if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)                        \
+           return;                                                                             \
+                                                                                               \
+       if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)            \
+           abort();                                                                            \
+                                                                                               \
+       while ((w -= 2) >= 0)                                                                   \
+       {                                                                                       \
+           x1 = vx >> 16;                                                                      \
+           vx += unit_x;                                                                       \
+           if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                          \
+           {                                                                                   \
+               /* This works because we know that unit_x is positive */                        \
+               while (vx >= max_vx)                                                            \
+                   vx -= max_vx;                                                               \
+           }                                                                                   \
+           s1 = src[x1];                                                                       \
+                                                                                               \
+           x2 = vx >> 16;                                                                      \
+           vx += unit_x;                                                                       \
+           if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                          \
+           {                                                                                   \
+               /* This works because we know that unit_x is positive */                        \
+               while (vx >= max_vx)                                                            \
+                   vx -= max_vx;                                                               \
+           }                                                                                   \
+           s2 = src[x2];                                                                       \
+                                                                                               \
+           if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)                                             \
+           {                                                                                   \
+               a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);                                          \
+               a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);                                          \
+                                                                                               \
+               if (a1 == 0xff)                                                                 \
+               {                                                                               \
+                   *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);                   \
+               }                                                                               \
+               else if (s1)                                                                    \
+               {                                                                               \
+                   d = CONVERT_ ## DST_FORMAT ## _TO_8888 (*dst);                              \
+                   s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);                               \
+                   a1 ^= 0xff;                                                                 \
+                   UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);                                        \
+                   *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);                                  \
+               }                                                                               \
+               dst++;                                                                          \
+                                                                                               \
+               if (a2 == 0xff)                                                                 \
+               {                                                                               \
+                   *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);                   \
+               }                                                                               \
+               else if (s2)                                                                    \
+               {                                                                               \
+                   d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);                               \
+                   s2 = CONVERT_## SRC_FORMAT ## _TO_8888 (s2);                                \
+                   a2 ^= 0xff;                                                                 \
+                   UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);                                        \
+                   *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);                                  \
+               }                                                                               \
+               dst++;                                                                          \
+           }                                                                                   \
+           else /* PIXMAN_OP_SRC */                                                            \
+           {                                                                                   \
+               *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);                     \
+               *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);                     \
+           }                                                                                   \
+       }                                                                                       \
+                                                                                               \
+       if (w & 1)                                                                              \
+       {                                                                                       \
+           x1 = vx >> 16;                                                                      \
+           s1 = src[x1];                                                                       \
+                                                                                               \
+           if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)                                             \
+           {                                                                                   \
+               a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);                                          \
+                                                                                               \
+               if (a1 == 0xff)                                                                 \
+               {                                                                               \
+                   *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);                   \
+               }                                                                               \
+               else if (s1)                                                                    \
+               {                                                                               \
+                   d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);                               \
+                   s1 = CONVERT_ ## SRC_FORMAT ## _TO_8888 (s1);                               \
+                   a1 ^= 0xff;                                                                 \
+                   UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);                                        \
+                   *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);                                  \
+               }                                                                               \
+               dst++;                                                                          \
+           }                                                                                   \
+           else /* PIXMAN_OP_SRC */                                                            \
+           {                                                                                   \
+               *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);                     \
+           }                                                                                   \
+       }                                                                                       \
+}
+
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,     \
+                                 dst_type_t, repeat_mode, have_mask, mask_is_solid)            \
+static void                                                                                    \
+fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,               \
+                                                  pixman_composite_info_t *info)               \
+{                                                                                              \
+    PIXMAN_COMPOSITE_ARGS (info);                                                              \
+    dst_type_t *dst_line;                                                                      \
+    mask_type_t *mask_line;                                                                    \
+    src_type_t *src_first_line;                                                                        \
+    int       y;                                                                               \
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */           \
+    pixman_fixed_t max_vy;                                                                     \
+    pixman_vector_t v;                                                                         \
+    pixman_fixed_t vx, vy;                                                                     \
+    pixman_fixed_t unit_x, unit_y;                                                             \
+    int32_t left_pad, right_pad;                                                               \
+                                                                                               \
+    src_type_t *src;                                                                           \
+    dst_type_t *dst;                                                                           \
+    mask_type_t solid_mask;                                                                    \
+    const mask_type_t *mask = &solid_mask;                                                     \
+    int src_stride, mask_stride, dst_stride;                                                   \
+                                                                                               \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);   \
+    if (have_mask)                                                                             \
+    {                                                                                          \
+       if (mask_is_solid)                                                                      \
+           solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);    \
+       else                                                                                    \
+           PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,                     \
+                                  mask_stride, mask_line, 1);                                  \
+    }                                                                                          \
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be                 \
+     * transformed from destination space to source space */                                   \
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);                \
+                                                                                               \
+    /* reference point is the center of the pixel */                                           \
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;                            \
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;                            \
+    v.vector[2] = pixman_fixed_1;                                                              \
+                                                                                               \
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))                          \
+       return;                                                                                 \
+                                                                                               \
+    unit_x = src_image->common.transform->matrix[0][0];                                                \
+    unit_y = src_image->common.transform->matrix[1][1];                                                \
+                                                                                               \
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */                  \
+    v.vector[0] -= pixman_fixed_e;                                                             \
+    v.vector[1] -= pixman_fixed_e;                                                             \
+                                                                                               \
+    vx = v.vector[0];                                                                          \
+    vy = v.vector[1];                                                                          \
+                                                                                               \
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                                 \
+    {                                                                                          \
+       /* Clamp repeating positions inside the actual samples */                               \
+       max_vx = src_image->bits.width << 16;                                                   \
+       max_vy = src_image->bits.height << 16;                                                  \
+                                                                                               \
+       repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);                                             \
+       repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);                                             \
+    }                                                                                          \
+                                                                                               \
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||                                  \
+       PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)                                    \
+    {                                                                                          \
+       pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,                      \
+                                       &width, &left_pad, &right_pad);                         \
+       vx += left_pad * unit_x;                                                                \
+    }                                                                                          \
+                                                                                               \
+    while (--height >= 0)                                                                      \
+    {                                                                                          \
+       dst = dst_line;                                                                         \
+       dst_line += dst_stride;                                                                 \
+       if (have_mask && !mask_is_solid)                                                        \
+       {                                                                                       \
+           mask = mask_line;                                                                   \
+           mask_line += mask_stride;                                                           \
+       }                                                                                       \
+                                                                                               \
+       y = vy >> 16;                                                                           \
+       vy += unit_y;                                                                           \
+       if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                              \
+           repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);                                         \
+       if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)                                 \
+       {                                                                                       \
+           repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);                             \
+           src = src_first_line + src_stride * y;                                              \
+           if (left_pad > 0)                                                                   \
+           {                                                                                   \
+               scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);                       \
+           }                                                                                   \
+           if (width > 0)                                                                      \
+           {                                                                                   \
+               scanline_func (mask + (mask_is_solid ? 0 : left_pad),                           \
+                              dst + left_pad, src, width, vx, unit_x, 0, FALSE);               \
+           }                                                                                   \
+           if (right_pad > 0)                                                                  \
+           {                                                                                   \
+               scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),                   \
+                              dst + left_pad + width, src + src_image->bits.width - 1,         \
+                              right_pad, 0, 0, 0, FALSE);                                      \
+           }                                                                                   \
+       }                                                                                       \
+       else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)                           \
+       {                                                                                       \
+           static const src_type_t zero[1] = { 0 };                                            \
+           if (y < 0 || y >= src_image->bits.height)                                           \
+           {                                                                                   \
+               scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);   \
+               continue;                                                                       \
+           }                                                                                   \
+           src = src_first_line + src_stride * y;                                              \
+           if (left_pad > 0)                                                                   \
+           {                                                                                   \
+               scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);                       \
+           }                                                                                   \
+           if (width > 0)                                                                      \
+           {                                                                                   \
+               scanline_func (mask + (mask_is_solid ? 0 : left_pad),                           \
+                              dst + left_pad, src, width, vx, unit_x, 0, FALSE);               \
+           }                                                                                   \
+           if (right_pad > 0)                                                                  \
+           {                                                                                   \
+               scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),                   \
+                              dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);         \
+           }                                                                                   \
+       }                                                                                       \
+       else                                                                                    \
+       {                                                                                       \
+           src = src_first_line + src_stride * y;                                              \
+           scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);                   \
+       }                                                                                       \
+    }                                                                                          \
+}
+
+/* A workaround for old Sun Studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,  \
+                                 dst_type_t, repeat_mode, have_mask, mask_is_solid)            \
+       FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t, \
+                                 dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,   \
+                             repeat_mode)                                                      \
+    static force_inline void                                                                   \
+    scanline_func##scale_func_name##_wrapper (                                                 \
+                   const uint8_t    *mask,                                                     \
+                   dst_type_t       *dst,                                                      \
+                   const src_type_t *src,                                                      \
+                   int32_t          w,                                                         \
+                   pixman_fixed_t   vx,                                                        \
+                   pixman_fixed_t   unit_x,                                                    \
+                   pixman_fixed_t   max_vx,                                                    \
+                   pixman_bool_t    fully_transparent_src)                                     \
+    {                                                                                          \
+       scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);                 \
+    }                                                                                          \
+    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,      \
+                              src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,          \
+                             repeat_mode)                                                      \
+       FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,           \
+                             dst_type_t, repeat_mode)
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,                          \
+                    src_type_t, dst_type_t, OP, repeat_mode)                           \
+    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,      \
+                         SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,               \
+                         OP, repeat_mode)                                              \
+    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,                      \
+                         scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,       \
+                         src_type_t, dst_type_t, repeat_mode)
+
+
+#define SCALED_NEAREST_FLAGS                                           \
+    (FAST_PATH_SCALE_TRANSFORM |                                       \
+     FAST_PATH_NO_ALPHA_MAP    |                                       \
+     FAST_PATH_NEAREST_FILTER  |                                       \
+     FAST_PATH_NO_ACCESSORS    |                                       \
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)                   \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_NORMAL_REPEAT        |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,   \
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)                      \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_PAD_REPEAT           |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,      \
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)                     \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_NONE_REPEAT          |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,     \
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)                    \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,    \
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)           \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_NORMAL_REPEAT        |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,   \
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)              \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_PAD_REPEAT           |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,      \
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)             \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_NONE_REPEAT          |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,     \
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)            \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,    \
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)                \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_NORMAL_REPEAT        |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,   \
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)           \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_PAD_REPEAT           |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,      \
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)          \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_NEAREST_FLAGS           |                               \
+        FAST_PATH_NONE_REPEAT          |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,     \
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)         \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,    \
+    }
+
+/* Prefer the 'cover' variant, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                          \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                      \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                       \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                                \
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)                  \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),              \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),               \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)               \
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),           \
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),            \
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
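+
+/* A minimal sketch (editor's note; entries are hypothetical) of how the
+ * grouped macros above would populate an implementation's fast path table:
+ *
+ *     static const pixman_fast_path_t sketch_fast_paths[] =
+ *     {
+ *         SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+ *         SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+ *         { PIXMAN_OP_NONE },
+ *     };
+ *
+ * Each grouped macro expands to several pixman_fast_path_t initializers,
+ * one per repeat mode, with the preferred 'cover' variant listed first. */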
+
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling, depending on
+ * whether the 2 pixels to be interpolated are fetched from the image
+ * itself, from the padding area around it, or partly from each.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+                                        pixman_fixed_t  vx,
+                                        pixman_fixed_t  unit_x,
+                                        int32_t *       left_pad,
+                                        int32_t *       left_tz,
+                                        int32_t *       width,
+                                        int32_t *       right_tz,
+                                        int32_t *       right_pad)
+{
+       int width1 = *width, left_pad1, right_pad1;
+       int width2 = *width, left_pad2, right_pad2;
+
+       pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+                                       &width1, &left_pad1, &right_pad1);
+       pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+                                       unit_x, &width2, &left_pad2, &right_pad2);
+
+       *left_pad = left_pad2;
+       *left_tz = left_pad1 - left_pad2;
+       *right_tz = right_pad2 - right_pad1;
+       *right_pad = right_pad1;
+       *width -= *left_pad + *left_tz + *right_tz + *right_pad;
+}
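+
+/* The resulting zone layout within a scanline (editor's illustration):
+ *
+ *     | left_pad | left_tz |  width  | right_tz | right_pad |
+ *
+ * In the 'left_tz' and 'right_tz' transition zones one of the two
+ * interpolated pixels comes from the image and the other from the
+ * padding area. */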
+
+/*
+ * Main loop template for single-pass bilinear scaling. It needs to be
+ * provided with a 'scanline_func' which performs the compositing operation.
+ * The required function has the following prototype:
+ *
+ *     scanline_func (dst_type_t *       dst,
+ *                    const mask_type_t * mask,
+ *                    const src_type_t * src_top,
+ *                    const src_type_t * src_bottom,
+ *                    int32_t            width,
+ *                    int                weight_top,
+ *                    int                weight_bottom,
+ *                    pixman_fixed_t     vx,
+ *                    pixman_fixed_t     unit_x,
+ *                    pixman_fixed_t     max_vx,
+ *                    pixman_bool_t      zero_src)
+ *
+ * Where:
+ *  dst                 - destination scanline buffer for storing results
+ *  mask                - mask buffer (or single value for solid mask)
+ *  src_top, src_bottom - two source scanlines
+ *  width               - number of pixels to process
+ *  weight_top          - weight of the top row for interpolation
+ *  weight_bottom       - weight of the bottom row for interpolation
+ *  vx                  - initial position for fetching the first pair of
+ *                        pixels from the source buffer
+ *  unit_x              - position increment needed to move to the next pair
+ *                        of pixels
+ *  max_vx              - image size as a fixed point value, can be used for
+ *                        implementing NORMAL repeat (when it is supported)
+ *  zero_src            - boolean hint variable, which is set to TRUE when
+ *                        all source pixels are fetched from zero padding
+ *                        zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ *       but it may be less than that for NONE repeat when handling fuzzy
+ *       antialiased top or bottom image edges. Both weight variables are
+ *       also guaranteed to have values in the 0-255 range, so they fit into
+ *       an unsigned byte and can be used with 8-bit SIMD multiplication
+ *       instructions.
+ */
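+
+/* A minimal sketch of such a scanline function (editor's illustration,
+ * not part of pixman): it implements an unmasked SRC operation on
+ * a8r8g8b8 pixels and ignores 'mask', 'max_vx' and 'zero_src'. All names
+ * are hypothetical; the block is disabled so it never compiles.
+ */
+#if 0
+static void
+sketch_scanline_bilinear_src_8888 (uint32_t *       dst,
+                                   const uint32_t * mask,
+                                   const uint32_t * src_top,
+                                   const uint32_t * src_bottom,
+                                   int32_t          width,
+                                   int              weight_top,
+                                   int              weight_bottom,
+                                   pixman_fixed_t   vx,
+                                   pixman_fixed_t   unit_x,
+                                   pixman_fixed_t   max_vx,
+                                   pixman_bool_t    zero_src)
+{
+    while (width--)
+    {
+       int x = pixman_fixed_to_int (vx);
+       int distx = (vx >> 8) & 0xff;  /* horizontal weight of the right pixel */
+
+       /* 'weight_bottom' plays the role of 'disty', the bottom row weight */
+       *dst++ = bilinear_interpolation (src_top[x], src_top[x + 1],
+                                        src_bottom[x], src_bottom[x + 1],
+                                        distx, weight_bottom);
+       vx += unit_x;
+    }
+}
+#endif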
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,    \
+                                 dst_type_t, repeat_mode, flags)                               \
+static void                                                                                    \
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,               \
+                                                  pixman_composite_info_t *info)               \
+{                                                                                              \
+    PIXMAN_COMPOSITE_ARGS (info);                                                              \
+    dst_type_t *dst_line;                                                                      \
+    mask_type_t *mask_line;                                                                    \
+    src_type_t *src_first_line;                                                                        \
+    int       y1, y2;                                                                          \
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */           \
+    pixman_vector_t v;                                                                         \
+    pixman_fixed_t vx, vy;                                                                     \
+    pixman_fixed_t unit_x, unit_y;                                                             \
+    int32_t left_pad, left_tz, right_tz, right_pad;                                            \
+                                                                                               \
+    dst_type_t *dst;                                                                           \
+    mask_type_t solid_mask;                                                                    \
+    const mask_type_t *mask = &solid_mask;                                                     \
+    int src_stride, mask_stride, dst_stride;                                                   \
+                                                                                               \
+    int src_width;                                                                             \
+    pixman_fixed_t src_width_fixed;                                                            \
+    int max_x;                                                                                 \
+    pixman_bool_t need_src_extension;                                                          \
+                                                                                               \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);   \
+    if (flags & FLAG_HAVE_SOLID_MASK)                                                          \
+    {                                                                                          \
+       solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);        \
+       mask_stride = 0;                                                                        \
+    }                                                                                          \
+    else if (flags & FLAG_HAVE_NON_SOLID_MASK)                                                 \
+    {                                                                                          \
+       PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,                         \
+                              mask_stride, mask_line, 1);                                      \
+    }                                                                                          \
+                                                                                               \
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be                 \
+     * transformed from destination space to source space */                                   \
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);                \
+                                                                                               \
+    /* reference point is the center of the pixel */                                           \
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;                            \
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;                            \
+    v.vector[2] = pixman_fixed_1;                                                              \
+                                                                                               \
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))                          \
+       return;                                                                                 \
+                                                                                               \
+    unit_x = src_image->common.transform->matrix[0][0];                                                \
+    unit_y = src_image->common.transform->matrix[1][1];                                                \
+                                                                                               \
+    v.vector[0] -= pixman_fixed_1 / 2;                                                         \
+    v.vector[1] -= pixman_fixed_1 / 2;                                                         \
+                                                                                               \
+    vy = v.vector[1];                                                                          \
+                                                                                               \
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||                                  \
+       PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)                                    \
+    {                                                                                          \
+       bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,    \
+                                       &left_pad, &left_tz, &width, &right_tz, &right_pad);    \
+       if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)                                 \
+       {                                                                                       \
+           /* PAD repeat does not need special handling for 'transition zones' and */          \
+           /* they can be combined with 'padding zones' safely */                              \
+           left_pad += left_tz;                                                                \
+           right_pad += right_tz;                                                              \
+           left_tz = right_tz = 0;                                                             \
+       }                                                                                       \
+       v.vector[0] += left_pad * unit_x;                                                       \
+    }                                                                                          \
+                                                                                               \
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                                 \
+    {                                                                                          \
+       vx = v.vector[0];                                                                       \
+       repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));         \
+       max_x = pixman_fixed_to_int (vx + (width - 1) * unit_x) + 1;                            \
+                                                                                               \
+       if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)                                    \
+       {                                                                                       \
+           src_width = 0;                                                                      \
+                                                                                               \
+           while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)                   \
+               src_width += src_image->bits.width;                                             \
+                                                                                               \
+           need_src_extension = TRUE;                                                          \
+       }                                                                                       \
+       else                                                                                    \
+       {                                                                                       \
+           src_width = src_image->bits.width;                                                  \
+           need_src_extension = FALSE;                                                         \
+       }                                                                                       \
+                                                                                               \
+       src_width_fixed = pixman_int_to_fixed (src_width);                                      \
+    }                                                                                          \
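+    /* src_width is now at least REPEAT_NORMAL_MIN_WIDTH (or covers max_x) \
+     * so the wrap-around handling below runs a reasonable number of       \
+     * pixels between repeats */                                           \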
+                                                                                               \
+    while (--height >= 0)                                                                      \
+    {                                                                                          \
+       int weight1, weight2;                                                                   \
+       dst = dst_line;                                                                         \
+       dst_line += dst_stride;                                                                 \
+       vx = v.vector[0];                                                                       \
+       if (flags & FLAG_HAVE_NON_SOLID_MASK)                                                   \
+       {                                                                                       \
+           mask = mask_line;                                                                   \
+           mask_line += mask_stride;                                                           \
+       }                                                                                       \
+                                                                                               \
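+       /* vy is 16.16 fixed point; the top 8 bits of its fraction give the \
+        * bottom-row weight out of 256, e.g. a fraction of 0x8000 yields   \
+        * weight2 == 128 */                                                \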
+       y1 = pixman_fixed_to_int (vy);                                                          \
+       weight2 = (vy >> 8) & 0xff;                                                             \
+       if (weight2)                                                                            \
+       {                                                                                       \
+           /* normal case: both row weights are in the 0-255 range and fit into an unsigned byte */ \
+           y2 = y1 + 1;                                                                        \
+           weight1 = 256 - weight2;                                                            \
+       }                                                                                       \
+       else                                                                                    \
+       {                                                                                       \
+           /* set both top and bottom row to the same scanline, and weights to 128+128 */      \
+           y2 = y1;                                                                            \
+           weight1 = weight2 = 128;                                                            \
+       }                                                                                       \
+       vy += unit_y;                                                                           \
+       if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)                                 \
+       {                                                                                       \
+           src_type_t *src1, *src2;                                                            \
+           src_type_t buf1[2];                                                                 \
+           src_type_t buf2[2];                                                                 \
+           repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);                            \
+           repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);                            \
+           src1 = src_first_line + src_stride * y1;                                            \
+           src2 = src_first_line + src_stride * y2;                                            \
+                                                                                               \
+           if (left_pad > 0)                                                                   \
+           {                                                                                   \
+               buf1[0] = buf1[1] = src1[0];                                                    \
+               buf2[0] = buf2[1] = src2[0];                                                    \
+               scanline_func (dst, mask,                                                       \
+                              buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);         \
+               dst += left_pad;                                                                \
+               if (flags & FLAG_HAVE_NON_SOLID_MASK)                                           \
+                   mask += left_pad;                                                           \
+           }                                                                                   \
+           if (width > 0)                                                                      \
+           {                                                                                   \
+               scanline_func (dst, mask,                                                       \
+                              src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);      \
+               dst += width;                                                                   \
+               if (flags & FLAG_HAVE_NON_SOLID_MASK)                                           \
+                   mask += width;                                                              \
+           }                                                                                   \
+           if (right_pad > 0)                                                                  \
+           {                                                                                   \
+               buf1[0] = buf1[1] = src1[src_image->bits.width - 1];                            \
+               buf2[0] = buf2[1] = src2[src_image->bits.width - 1];                            \
+               scanline_func (dst, mask,                                                       \
+                              buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);        \
+           }                                                                                   \
+       }                                                                                       \
+       else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)                           \
+       {                                                                                       \
+           src_type_t *src1, *src2;                                                            \
+           src_type_t buf1[2];                                                                 \
+           src_type_t buf2[2];                                                                 \
+           /* handle top/bottom zero padding by just setting weights to 0 if needed */         \
+           if (y1 < 0)                                                                         \
+           {                                                                                   \
+               weight1 = 0;                                                                    \
+               y1 = 0;                                                                         \
+           }                                                                                   \
+           if (y1 >= src_image->bits.height)                                                   \
+           {                                                                                   \
+               weight1 = 0;                                                                    \
+               y1 = src_image->bits.height - 1;                                                \
+           }                                                                                   \
+           if (y2 < 0)                                                                         \
+           {                                                                                   \
+               weight2 = 0;                                                                    \
+               y2 = 0;                                                                         \
+           }                                                                                   \
+           if (y2 >= src_image->bits.height)                                                   \
+           {                                                                                   \
+               weight2 = 0;                                                                    \
+               y2 = src_image->bits.height - 1;                                                \
+           }                                                                                   \
+           src1 = src_first_line + src_stride * y1;                                            \
+           src2 = src_first_line + src_stride * y2;                                            \
+                                                                                               \
+           if (left_pad > 0)                                                                   \
+           {                                                                                   \
+               buf1[0] = buf1[1] = 0;                                                          \
+               buf2[0] = buf2[1] = 0;                                                          \
+               scanline_func (dst, mask,                                                       \
+                              buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);          \
+               dst += left_pad;                                                                \
+               if (flags & FLAG_HAVE_NON_SOLID_MASK)                                           \
+                   mask += left_pad;                                                           \
+           }                                                                                   \
+           if (left_tz > 0)                                                                    \
+           {                                                                                   \
+               buf1[0] = 0;                                                                    \
+               buf1[1] = src1[0];                                                              \
+               buf2[0] = 0;                                                                    \
+               buf2[1] = src2[0];                                                              \
+               scanline_func (dst, mask,                                                       \
+                              buf1, buf2, left_tz, weight1, weight2,                           \
+                              pixman_fixed_frac (vx), unit_x, 0, FALSE);                       \
+               dst += left_tz;                                                                 \
+               if (flags & FLAG_HAVE_NON_SOLID_MASK)                                           \
+                   mask += left_tz;                                                            \
+               vx += left_tz * unit_x;                                                         \
+           }                                                                                   \
+           if (width > 0)                                                                      \
+           {                                                                                   \
+               scanline_func (dst, mask,                                                       \
+                              src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);      \
+               dst += width;                                                                   \
+               if (flags & FLAG_HAVE_NON_SOLID_MASK)                                           \
+                   mask += width;                                                              \
+               vx += width * unit_x;                                                           \
+           }                                                                                   \
+           if (right_tz > 0)                                                                   \
+           {                                                                                   \
+               buf1[0] = src1[src_image->bits.width - 1];                                      \
+               buf1[1] = 0;                                                                    \
+               buf2[0] = src2[src_image->bits.width - 1];                                      \
+               buf2[1] = 0;                                                                    \
+               scanline_func (dst, mask,                                                       \
+                              buf1, buf2, right_tz, weight1, weight2,                          \
+                              pixman_fixed_frac (vx), unit_x, 0, FALSE);                       \
+               dst += right_tz;                                                                \
+               if (flags & FLAG_HAVE_NON_SOLID_MASK)                                           \
+                   mask += right_tz;                                                           \
+           }                                                                                   \
+           if (right_pad > 0)                                                                  \
+           {                                                                                   \
+               buf1[0] = buf1[1] = 0;                                                          \
+               buf2[0] = buf2[1] = 0;                                                          \
+               scanline_func (dst, mask,                                                       \
+                              buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);         \
+           }                                                                                   \
+       }                                                                                       \
+       else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                         \
+       {                                                                                       \
+           int32_t         num_pixels;                                                         \
+           int32_t         width_remain;                                                       \
+           src_type_t *    src_line_top;                                                       \
+           src_type_t *    src_line_bottom;                                                    \
+           src_type_t      buf1[2];                                                            \
+           src_type_t      buf2[2];                                                            \
+           src_type_t      extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];                      \
+           src_type_t      extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];                      \
+           int             i, j;                                                               \
+                                                                                               \
+           repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);                         \
+           repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);                         \
+           src_line_top = src_first_line + src_stride * y1;                                    \
+           src_line_bottom = src_first_line + src_stride * y2;                                 \
+                                                                                               \
+           if (need_src_extension)                                                             \
+           {                                                                                   \
+               for (i=0; i<src_width;)                                                         \
+               {                                                                               \
+                   for (j=0; j<src_image->bits.width; j++, i++)                                \
+                   {                                                                           \
+                       extended_src_line0[i] = src_line_top[j];                                \
+                       extended_src_line1[i] = src_line_bottom[j];                             \
+                   }                                                                           \
+               }                                                                               \
+                                                                                               \
+               src_line_top = &extended_src_line0[0];                                          \
+               src_line_bottom = &extended_src_line1[0];                                       \
+           }                                                                                   \
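+           /* since bits.width < REPEAT_NORMAL_MIN_WIDTH here, the         \
+            * extension loop stops below 2 * REPEAT_NORMAL_MIN_WIDTH and   \
+            * the extended lines fit in the stack buffers above */         \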
+                                                                                               \
+           /* wrap-around buffers for the top and bottom rows: the last    \
+            * source pixel followed by the first */                        \
+           buf1[0] = src_line_top[src_width - 1];                                              \
+           buf1[1] = src_line_top[0];                                                          \
+           buf2[0] = src_line_bottom[src_width - 1];                                           \
+           buf2[1] = src_line_bottom[0];                                                       \
+                                                                                               \
+           width_remain = width;                                                               \
+                                                                                               \
+           while (width_remain > 0)                                                            \
+           {                                                                                   \
+               /* use src_width_fixed so the repeat below brings vx back   \
+                * into the original source range */                        \
+               repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);                            \
+                                                                                               \
+               /* Wrap around part */                                                          \
+               if (pixman_fixed_to_int (vx) == src_width - 1)                                  \
+               {                                                                               \
+                   /* for positive unit_x                                                      \
+                    * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed           \
+                    *                                                                          \
+                    * vx is in range [0, src_width_fixed - pixman_fixed_e]                     \
+                    * So we are safe from overflow.                                            \
+                    */                                                                         \
+                   num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;        \
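+                   /* e.g. src_width == 4, vx == 3.25 and unit_x == 0.5    \
+                    * (all 16.16) give num_pixels == 2: samples at 3.25    \
+                    * and 3.75 both interpolate across the wrap */         \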
+                                                                                               \
+                   if (num_pixels > width_remain)                                              \
+                       num_pixels = width_remain;                                              \
+                                                                                               \
+                   scanline_func (dst, mask, buf1, buf2, num_pixels,                           \
+                                  weight1, weight2, pixman_fixed_frac(vx),                     \
+                                  unit_x, src_width_fixed, FALSE);                             \
+                                                                                               \
+                   width_remain -= num_pixels;                                                 \
+                   vx += num_pixels * unit_x;                                                  \
+                   dst += num_pixels;                                                          \
+                                                                                               \
+                   if (flags & FLAG_HAVE_NON_SOLID_MASK)                                       \
+                       mask += num_pixels;                                                     \
+                                                                                               \
+                   repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);                        \
+               }                                                                               \
+                                                                                               \
+               /* Normal scanline composite */                                                 \
+               if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)              \
+               {                                                                               \
+                   /* for positive unit_x                                                      \
+                    * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)     \
+                    *                                                                          \
+                    * vx is in range [0, src_width_fixed - pixman_fixed_e]                     \
+                    * So we are safe from overflow here.                                       \
+                    */                                                                         \
+                   num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)      \
+                                 / unit_x) + 1;                                                \
+                                                                                               \
+                   if (num_pixels > width_remain)                                              \
+                       num_pixels = width_remain;                                              \
+                                                                                               \
+                   scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,        \
+                                  weight1, weight2, vx, unit_x, src_width_fixed, FALSE);       \
+                                                                                               \
+                   width_remain -= num_pixels;                                                 \
+                   vx += num_pixels * unit_x;                                                  \
+                   dst += num_pixels;                                                          \
+                                                                                               \
+                   if (flags & FLAG_HAVE_NON_SOLID_MASK)                                       \
+                       mask += num_pixels;                                                     \
+               }                                                                               \
+           }                                                                                   \
+       }                                                                                       \
+       else                                                                                    \
+       {                                                                                       \
+           scanline_func (dst, mask, src_first_line + src_stride * y1,                         \
+                          src_first_line + src_stride * y2, width,                             \
+                          weight1, weight2, vx, unit_x, max_vx, FALSE);                        \
+       }                                                                                       \
+    }                                                                                          \
+}
+
+/* A workaround for old Sun Studio compilers, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+                                 dst_type_t, repeat_mode, flags)                               \
+       FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+                                 dst_type_t, repeat_mode, flags)
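+
+/* For illustration only (hypothetical names): an implementation file would
+ * instantiate the main loop once per scanline function, e.g.
+ *
+ *     FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_SRC,
+ *                                    scaled_bilinear_scanline_8888_8888_SRC,
+ *                                    uint32_t, uint32_t, uint32_t,
+ *                                    COVER, FLAG_NONE)
+ */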
+
+#define SCALED_BILINEAR_FLAGS                                          \
+    (FAST_PATH_SCALE_TRANSFORM |                                       \
+     FAST_PATH_NO_ALPHA_MAP    |                                       \
+     FAST_PATH_BILINEAR_FILTER |                                       \
+     FAST_PATH_NO_ACCESSORS    |                                       \
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)                     \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_PAD_REPEAT           |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,     \
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)                    \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_NONE_REPEAT          |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,    \
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)                   \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,  \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,   \
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)                  \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_NORMAL_REPEAT        |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_null, 0,                                                 \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,  \
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)             \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_PAD_REPEAT           |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,     \
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)            \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_NONE_REPEAT          |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,    \
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)           \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,  \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,   \
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)          \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_NORMAL_REPEAT        |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),            \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,  \
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)          \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_PAD_REPEAT           |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,     \
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)         \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_NONE_REPEAT          |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,    \
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)                \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,  \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,   \
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)       \
+    {   PIXMAN_OP_ ## op,                                              \
+       PIXMAN_ ## s,                                                   \
+       (SCALED_BILINEAR_FLAGS          |                               \
+        FAST_PATH_NORMAL_REPEAT        |                               \
+        FAST_PATH_X_UNIT_POSITIVE),                                    \
+       PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),      \
+       PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,                         \
+       fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,  \
+    }
+
+/* Prefer the 'cover' variant because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)                         \
+    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),                     \
+    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),                      \
+    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
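+
+/* For illustration only (assumed usage): a fast path table would typically
+ * contain entries such as
+ *
+ *     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+ *
+ * which expands to the cover/none/pad/normal variants declared above. */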
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)                 \
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),             \
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),              \
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),               \
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)              \
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),          \
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),           \
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),            \
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#endif
diff --git a/pixman/pixman-linear-gradient.c b/pixman/pixman-linear-gradient.c
new file mode 100644 (file)
index 0000000..6e1ea24
--- /dev/null
@@ -0,0 +1,286 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static pixman_bool_t
+linear_gradient_is_horizontal (pixman_image_t *image,
+                              int             x,
+                              int             y,
+                              int             width,
+                              int             height)
+{
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    pixman_vector_t v;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    double inc;
+
+    if (image->common.transform)
+    {
+       /* projective transformation */
+       if (image->common.transform->matrix[2][0] != 0 ||
+           image->common.transform->matrix[2][1] != 0 ||
+           image->common.transform->matrix[2][2] == 0)
+       {
+           return FALSE;
+       }
+
+       v.vector[0] = image->common.transform->matrix[0][1];
+       v.vector[1] = image->common.transform->matrix[1][1];
+       v.vector[2] = image->common.transform->matrix[2][2];
+    }
+    else
+    {
+       v.vector[0] = 0;
+       v.vector[1] = pixman_fixed_1;
+       v.vector[2] = pixman_fixed_1;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0)
+       return FALSE;
+
+    /*
+     * compute how much the input of the gradient walker changes
+     * when moving vertically through the whole image
+     */
+    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
+       (dx * v.vector[0] + dy * v.vector[1]) /
+       (v.vector[2] * (double) l);
+
+    /* check that casting inc to integer gives 0, i.e. that the walker
+     * input changes by less than one 16.16 unit over the whole image */
+    if (-1 < inc && inc < 1)
+       return TRUE;
+
+    return FALSE;
+}
+
+static uint32_t *
+linear_get_scanline_narrow (pixman_iter_t  *iter,
+                           const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+       if (!pixman_transform_point_3d (image->common.transform, &v))
+           return iter->buffer;
+
+       unit.vector[0] = image->common.transform->matrix[0][0];
+       unit.vector[1] = image->common.transform->matrix[1][0];
+       unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+       unit.vector[0] = pixman_fixed_1;
+       unit.vector[1] = 0;
+       unit.vector[2] = 0;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0 || unit.vector[2] == 0)
+    {
+       /* affine transformation only */
+        pixman_fixed_32_32_t t, next_inc;
+       double inc;
+
+       if (l == 0 || v.vector[2] == 0)
+       {
+           t = 0;
+           inc = 0;
+       }
+       else
+       {
+           double invden, v2;
+
+           invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+               (l * (double) v.vector[2]);
+           v2 = v.vector[2] * (1. / pixman_fixed_1);
+           t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+                (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+           inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
+       }
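+       /* t parameterizes the gradient: the projection of the sample point
+        * onto (dx, dy) in 16.16 fixed point, scaled so that t == 0 at p1
+        * and t == pixman_fixed_1 at p2 */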
+       next_inc = 0;
+
+       if (((pixman_fixed_32_32_t )(inc * width)) == 0)
+       {
+           register uint32_t color;
+
+           color = _pixman_gradient_walker_pixel (&walker, t);
+           while (buffer < end)
+               *buffer++ = color;
+       }
+       else
+       {
+           int i;
+
+           i = 0;
+           while (buffer < end)
+           {
+               if (!mask || *mask++)
+               {
+                   *buffer = _pixman_gradient_walker_pixel (&walker,
+                                                            t + next_inc);
+               }
+               i++;
+               next_inc = inc * i;
+               buffer++;
+           }
+       }
+    }
+    else
+    {
+       /* projective transformation */
+        double t;
+
+       t = 0;
+
+       while (buffer < end)
+       {
+           if (!mask || *mask++)
+           {
+               if (v.vector[2] != 0)
+               {
+                   double invden, v2;
+
+                   invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+                       (l * (double) v.vector[2]);
+                   v2 = v.vector[2] * (1. / pixman_fixed_1);
+                   t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+                        (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+               }
+
+               *buffer = _pixman_gradient_walker_pixel (&walker, t);
+           }
+
+           ++buffer;
+
+           v.vector[0] += unit.vector[0];
+           v.vector[1] += unit.vector[1];
+           v.vector[2] += unit.vector[2];
+       }
+    }
+
+    iter->y++;
+
+    return iter->buffer;
+}
+
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
+
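+    /* widen the 8 bpc a8r8g8b8 scanline to 16 bpc in place; a wide
+     * iterator's buffer is large enough to hold the 64-bit pixels */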
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
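+    /* a gradient that is effectively horizontal produces the same scanline
+     * for every y, so compute it once here and let the no-op get_scanline
+     * hand back the cached buffer on every call */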
+    if (linear_gradient_is_horizontal (
+           iter->image, iter->x, iter->y, iter->width, iter->height))
+    {
+       if (iter->flags & ITER_NARROW)
+           linear_get_scanline_narrow (iter, NULL);
+       else
+           linear_get_scanline_wide (iter, NULL);
+
+       iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else
+    {
+       if (iter->flags & ITER_NARROW)
+           iter->get_scanline = linear_get_scanline_narrow;
+       else
+           iter->get_scanline = linear_get_scanline_wide;
+    }
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_linear_gradient (pixman_point_fixed_t *        p1,
+                                     pixman_point_fixed_t *        p2,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    pixman_image_t *image;
+    linear_gradient_t *linear;
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+       return NULL;
+
+    linear = &image->linear;
+
+    if (!_pixman_init_gradient (&linear->common, stops, n_stops))
+    {
+       free (image);
+       return NULL;
+    }
+
+    linear->p1 = *p1;
+    linear->p2 = *p2;
+
+    image->type = LINEAR;
+
+    return image;
+}
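+
+/* For illustration only (assumed usage): a minimal red-to-blue gradient
+ * spanning 100 pixels horizontally:
+ *
+ *     pixman_point_fixed_t p1 = { 0, 0 };
+ *     pixman_point_fixed_t p2 = { pixman_int_to_fixed (100), 0 };
+ *     pixman_gradient_stop_t stops[2] = {
+ *         { 0,              { 0xffff, 0x0000, 0x0000, 0xffff } },
+ *         { pixman_fixed_1, { 0x0000, 0x0000, 0xffff, 0xffff } }
+ *     };
+ *     pixman_image_t *g =
+ *         pixman_image_create_linear_gradient (&p1, &p2, stops, 2);
+ */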
+
diff --git a/pixman/pixman-matrix.c b/pixman/pixman-matrix.c
new file mode 100644 (file)
index 0000000..8d0d973
--- /dev/null
@@ -0,0 +1,766 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+/*
+ * Matrix interfaces
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <string.h>
+#include "pixman-private.h"
+
+#define F(x)    pixman_int_to_fixed (x)
+
+PIXMAN_EXPORT void
+pixman_transform_init_identity (struct pixman_transform *matrix)
+{
+    int i;
+
+    memset (matrix, '\0', sizeof (struct pixman_transform));
+    for (i = 0; i < 3; i++)
+       matrix->matrix[i][i] = F (1);
+}
+
+typedef pixman_fixed_32_32_t pixman_fixed_34_30_t;
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_3d (const struct pixman_transform *transform,
+                           struct pixman_vector *         vector)
+{
+    struct pixman_vector result;
+    pixman_fixed_32_32_t partial;
+    pixman_fixed_48_16_t v;
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+       v = 0;
+       for (i = 0; i < 3; i++)
+       {
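+           /* matrix entries and vector components are 16.16; their 32.32
+            * product is shifted back to 48.16 before accumulating */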
+           partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
+                      (pixman_fixed_48_16_t) vector->vector[i]);
+           v += partial >> 16;
+       }
+       
+       if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
+           return FALSE;
+       
+       result.vector[j] = (pixman_fixed_t) v;
+    }
+    
+    *vector = result;
+
+    if (!result.vector[2])
+       return FALSE;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point (const struct pixman_transform *transform,
+                        struct pixman_vector *         vector)
+{
+    pixman_fixed_32_32_t partial;
+    pixman_fixed_34_30_t v[3];
+    pixman_fixed_48_16_t quo;
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+       v[j] = 0;
+       
+       for (i = 0; i < 3; i++)
+       {
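+           /* keep 30 fractional bits (pixman_fixed_34_30_t) instead of 16
+            * to preserve precision for the division by v[2] below */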
+           partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
+                      (pixman_fixed_32_32_t) vector->vector[i]);
+           v[j] += partial >> 2;
+       }
+    }
+    
+    if (!(v[2] >> 16))
+       return FALSE;
+
+    for (j = 0; j < 2; j++)
+    {
+       quo = v[j] / (v[2] >> 16);
+       if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
+           return FALSE;
+       vector->vector[j] = (pixman_fixed_t) quo;
+    }
+    
+    vector->vector[2] = pixman_fixed_1;
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_multiply (struct pixman_transform *      dst,
+                           const struct pixman_transform *l,
+                           const struct pixman_transform *r)
+{
+    struct pixman_transform d;
+    int dx, dy;
+    int o;
+
+    for (dy = 0; dy < 3; dy++)
+    {
+       for (dx = 0; dx < 3; dx++)
+       {
+           pixman_fixed_48_16_t v;
+           pixman_fixed_32_32_t partial;
+           
+           v = 0;
+           for (o = 0; o < 3; o++)
+           {
+               partial =
+                   (pixman_fixed_32_32_t) l->matrix[dy][o] *
+                   (pixman_fixed_32_32_t) r->matrix[o][dx];
+
+               v += partial >> 16;
+           }
+
+           if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
+               return FALSE;
+           
+           d.matrix[dy][dx] = (pixman_fixed_t) v;
+       }
+    }
+
+    *dst = d;
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_scale (struct pixman_transform *t,
+                             pixman_fixed_t           sx,
+                             pixman_fixed_t           sy)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = sx;
+    t->matrix[1][1] = sy;
+    t->matrix[2][2] = F (1);
+}
+
+static pixman_fixed_t
+fixed_inverse (pixman_fixed_t x)
+{
+    return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x);
+}
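+
+/* e.g. fixed_inverse (F (2)) == 0x8000, i.e. 0.5 in 16.16 fixed point;
+ * note the result is truncated rather than rounded */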
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_scale (struct pixman_transform *forward,
+                        struct pixman_transform *reverse,
+                        pixman_fixed_t           sx,
+                        pixman_fixed_t           sy)
+{
+    struct pixman_transform t;
+
+    if (sx == 0 || sy == 0)
+       return FALSE;
+
+    if (forward)
+    {
+       pixman_transform_init_scale (&t, sx, sy);
+       if (!pixman_transform_multiply (forward, &t, forward))
+           return FALSE;
+    }
+    
+    if (reverse)
+    {
+       pixman_transform_init_scale (&t, fixed_inverse (sx),
+                                    fixed_inverse (sy));
+       if (!pixman_transform_multiply (reverse, reverse, &t))
+           return FALSE;
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_rotate (struct pixman_transform *t,
+                              pixman_fixed_t           c,
+                              pixman_fixed_t           s)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = c;
+    t->matrix[0][1] = -s;
+    t->matrix[1][0] = s;
+    t->matrix[1][1] = c;
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_rotate (struct pixman_transform *forward,
+                         struct pixman_transform *reverse,
+                         pixman_fixed_t           c,
+                         pixman_fixed_t           s)
+{
+    struct pixman_transform t;
+
+    if (forward)
+    {
+       pixman_transform_init_rotate (&t, c, s);
+       if (!pixman_transform_multiply (forward, &t, forward))
+           return FALSE;
+    }
+
+    if (reverse)
+    {
+       pixman_transform_init_rotate (&t, c, -s);
+       if (!pixman_transform_multiply (reverse, reverse, &t))
+           return FALSE;
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_translate (struct pixman_transform *t,
+                                 pixman_fixed_t           tx,
+                                 pixman_fixed_t           ty)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = F (1);
+    t->matrix[0][2] = tx;
+    t->matrix[1][1] = F (1);
+    t->matrix[1][2] = ty;
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_translate (struct pixman_transform *forward,
+                            struct pixman_transform *reverse,
+                            pixman_fixed_t           tx,
+                            pixman_fixed_t           ty)
+{
+    struct pixman_transform t;
+
+    if (forward)
+    {
+       pixman_transform_init_translate (&t, tx, ty);
+
+       if (!pixman_transform_multiply (forward, &t, forward))
+           return FALSE;
+    }
+
+    if (reverse)
+    {
+       pixman_transform_init_translate (&t, -tx, -ty);
+
+       if (!pixman_transform_multiply (reverse, reverse, &t))
+           return FALSE;
+    }
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_bounds (const struct pixman_transform *matrix,
+                         struct pixman_box16 *          b)
+
+{
+    struct pixman_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+
+    v[0].vector[0] = F (b->x1);
+    v[0].vector[1] = F (b->y1);
+    v[0].vector[2] = F (1);
+
+    v[1].vector[0] = F (b->x2);
+    v[1].vector[1] = F (b->y1);
+    v[1].vector[2] = F (1);
+
+    v[2].vector[0] = F (b->x2);
+    v[2].vector[1] = F (b->y2);
+    v[2].vector[2] = F (1);
+
+    v[3].vector[0] = F (b->x1);
+    v[3].vector[1] = F (b->y2);
+    v[3].vector[2] = F (1);
+
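+    /* transform all four corners and take floors/ceilings of the results
+     * to get an axis-aligned box covering the transformed rectangle */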
+    for (i = 0; i < 4; i++)
+    {
+       if (!pixman_transform_point (matrix, &v[i]))
+           return FALSE;
+
+       x1 = pixman_fixed_to_int (v[i].vector[0]);
+       y1 = pixman_fixed_to_int (v[i].vector[1]);
+       x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0]));
+       y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1]));
+
+       if (i == 0)
+       {
+           b->x1 = x1;
+           b->y1 = y1;
+           b->x2 = x2;
+           b->y2 = y2;
+       }
+       else
+       {
+           if (x1 < b->x1) b->x1 = x1;
+           if (y1 < b->y1) b->y1 = y1;
+           if (x2 > b->x2) b->x2 = x2;
+           if (y2 > b->y2) b->y2 = y2;
+       }
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_invert (struct pixman_transform *      dst,
+                         const struct pixman_transform *src)
+{
+    struct pixman_f_transform m, r;
+
+    pixman_f_transform_from_pixman_transform (&m, src);
+
+    if (!pixman_f_transform_invert (&r, &m))
+       return FALSE;
+
+    if (!pixman_transform_from_pixman_f_transform (dst, &r))
+       return FALSE;
+
+    return TRUE;
+}
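+
+/* Sketch of intended use (assumed, for illustration): inverting a pure
+ * 2x scale yields a 0.5x scale:
+ *
+ *     struct pixman_transform s, inv;
+ *     pixman_transform_init_scale (&s, F (2), F (2));
+ *     if (pixman_transform_invert (&inv, &s))
+ *         assert (inv.matrix[0][0] == F (1) / 2);
+ */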
+
+static pixman_bool_t
+within_epsilon (pixman_fixed_t a,
+                pixman_fixed_t b,
+                pixman_fixed_t epsilon)
+{
+    pixman_fixed_t t = a - b;
+
+    if (t < 0)
+       t = -t;
+
+    return t <= epsilon;
+}
+
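+/* two units in the last place of 16.16 fixed point, i.e. 2/65536 */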
+#define EPSILON (pixman_fixed_t) (2)
+
+#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON))
+#define IS_ZERO(a)    (within_epsilon (a, 0, EPSILON))
+#define IS_ONE(a)     (within_epsilon (a, F (1), EPSILON))
+#define IS_UNIT(a)                         \
+    (within_epsilon (a, F (1), EPSILON) ||  \
+     within_epsilon (a, F (-1), EPSILON) || \
+     IS_ZERO (a))
+#define IS_INT(a)    (IS_ZERO (pixman_fixed_frac (a)))
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_identity (const struct pixman_transform *t)
+{
+    return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) &&
+           IS_SAME (t->matrix[0][0], t->matrix[2][2]) &&
+           !IS_ZERO (t->matrix[0][0]) &&
+           IS_ZERO (t->matrix[0][1]) &&
+           IS_ZERO (t->matrix[0][2]) &&
+           IS_ZERO (t->matrix[1][0]) &&
+           IS_ZERO (t->matrix[1][2]) &&
+           IS_ZERO (t->matrix[2][0]) &&
+           IS_ZERO (t->matrix[2][1]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_scale (const struct pixman_transform *t)
+{
+    return (!IS_ZERO (t->matrix[0][0]) &&
+            IS_ZERO (t->matrix[0][1]) &&
+            IS_ZERO (t->matrix[0][2]) &&
+
+            IS_ZERO (t->matrix[1][0]) &&
+            !IS_ZERO (t->matrix[1][1]) &&
+            IS_ZERO (t->matrix[1][2]) &&
+
+            IS_ZERO (t->matrix[2][0]) &&
+            IS_ZERO (t->matrix[2][1]) &&
+            !IS_ZERO (t->matrix[2][2]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_int_translate (const struct pixman_transform *t)
+{
+    return (IS_ONE (t->matrix[0][0]) &&
+            IS_ZERO (t->matrix[0][1]) &&
+            IS_INT (t->matrix[0][2]) &&
+
+            IS_ZERO (t->matrix[1][0]) &&
+            IS_ONE (t->matrix[1][1]) &&
+            IS_INT (t->matrix[1][2]) &&
+
+            IS_ZERO (t->matrix[2][0]) &&
+            IS_ZERO (t->matrix[2][1]) &&
+            IS_ONE (t->matrix[2][2]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_inverse (const struct pixman_transform *a,
+                             const struct pixman_transform *b)
+{
+    struct pixman_transform t;
+
+    if (!pixman_transform_multiply (&t, a, b))
+       return FALSE;
+
+    return pixman_transform_is_identity (&t);
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_from_pixman_transform (struct pixman_f_transform *    ft,
+                                          const struct pixman_transform *t)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+       for (i = 0; i < 3; i++)
+           ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]);
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_from_pixman_f_transform (struct pixman_transform *        t,
+                                          const struct pixman_f_transform *ft)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+       for (i = 0; i < 3; i++)
+       {
+           double d = ft->m[j][i];
+           if (d < -32767.0 || d > 32767.0)
+               return FALSE;
+           d = d * 65536.0 + 0.5;
+           t->matrix[j][i] = (pixman_fixed_t) floor (d);
+       }
+    }
+    
+    return TRUE;
+}
+
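+/*
+ * Invert a 3x3 matrix via the classical adjugate: each entry of the
+ * result is a signed 2x2 cofactor of the source divided by the
+ * determinant.  The a[] and b[] tables below pick, for index i, the
+ * two row/column indices other than i (0 -> {2,1}, 1 -> {2,0},
+ * 2 -> {1,0}), which is exactly what the cofactor expansion needs.
+ */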
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_invert (struct pixman_f_transform *      dst,
+                           const struct pixman_f_transform *src)
+{
+    double det;
+    int i, j;
+    static int a[3] = { 2, 2, 1 };
+    static int b[3] = { 1, 0, 0 };
+
+    det = 0;
+    for (i = 0; i < 3; i++)
+    {
+       double p;
+       int ai = a[i];
+       int bi = b[i];
+       p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] -
+                           src->m[ai][1] * src->m[bi][2]);
+       if (i == 1)
+           p = -p;
+       det += p;
+    }
+    
+    if (det == 0)
+       return FALSE;
+    
+    det = 1 / det;
+    for (j = 0; j < 3; j++)
+    {
+       for (i = 0; i < 3; i++)
+       {
+           double p;
+           int ai = a[i];
+           int aj = a[j];
+           int bi = b[i];
+           int bj = b[j];
+
+           p = (src->m[ai][aj] * src->m[bi][bj] -
+                src->m[ai][bj] * src->m[bi][aj]);
+           
+           if (((i + j) & 1) != 0)
+               p = -p;
+           
+           dst->m[j][i] = det * p;
+       }
+    }
+
+    return TRUE;
+}
+
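+/*
+ * Multiply the vector by the matrix, then divide x and y by the
+ * resulting homogeneous w component.  A w of zero means the point
+ * maps to infinity, which is reported as failure.
+ */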
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_point (const struct pixman_f_transform *t,
+                          struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector result;
+    int i, j;
+    double a;
+
+    for (j = 0; j < 3; j++)
+    {
+       a = 0;
+       for (i = 0; i < 3; i++)
+           a += t->m[j][i] * v->v[i];
+       result.v[j] = a;
+    }
+    
+    if (!result.v[2])
+       return FALSE;
+
+    for (j = 0; j < 2; j++)
+       v->v[j] = result.v[j] / result.v[2];
+
+    v->v[2] = 1;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_point_3d (const struct pixman_f_transform *t,
+                             struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector result;
+    int i, j;
+    double a;
+
+    for (j = 0; j < 3; j++)
+    {
+       a = 0;
+       for (i = 0; i < 3; i++)
+           a += t->m[j][i] * v->v[i];
+       result.v[j] = a;
+    }
+    
+    *v = result;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_multiply (struct pixman_f_transform *      dst,
+                             const struct pixman_f_transform *l,
+                             const struct pixman_f_transform *r)
+{
+    struct pixman_f_transform d;
+    int dx, dy;
+    int o;
+
+    for (dy = 0; dy < 3; dy++)
+    {
+       for (dx = 0; dx < 3; dx++)
+       {
+           double v = 0;
+           for (o = 0; o < 3; o++)
+               v += l->m[dy][o] * r->m[o][dx];
+           d.m[dy][dx] = v;
+       }
+    }
+    
+    *dst = d;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_scale (struct pixman_f_transform *t,
+                               double                     sx,
+                               double                     sy)
+{
+    t->m[0][0] = sx;
+    t->m[0][1] = 0;
+    t->m[0][2] = 0;
+    t->m[1][0] = 0;
+    t->m[1][1] = sy;
+    t->m[1][2] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_scale (struct pixman_f_transform *forward,
+                          struct pixman_f_transform *reverse,
+                          double                     sx,
+                          double                     sy)
+{
+    struct pixman_f_transform t;
+
+    if (sx == 0 || sy == 0)
+       return FALSE;
+
+    if (forward)
+    {
+       pixman_f_transform_init_scale (&t, sx, sy);
+       pixman_f_transform_multiply (forward, &t, forward);
+    }
+    
+    if (reverse)
+    {
+       pixman_f_transform_init_scale (&t, 1 / sx, 1 / sy);
+       pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_rotate (struct pixman_f_transform *t,
+                                double                     c,
+                                double                     s)
+{
+    t->m[0][0] = c;
+    t->m[0][1] = -s;
+    t->m[0][2] = 0;
+    t->m[1][0] = s;
+    t->m[1][1] = c;
+    t->m[1][2] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_rotate (struct pixman_f_transform *forward,
+                           struct pixman_f_transform *reverse,
+                           double                     c,
+                           double                     s)
+{
+    struct pixman_f_transform t;
+
+    if (forward)
+    {
+       pixman_f_transform_init_rotate (&t, c, s);
+       pixman_f_transform_multiply (forward, &t, forward);
+    }
+    
+    if (reverse)
+    {
+       pixman_f_transform_init_rotate (&t, c, -s);
+       pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_translate (struct pixman_f_transform *t,
+                                   double                     tx,
+                                   double                     ty)
+{
+    t->m[0][0] = 1;
+    t->m[0][1] = 0;
+    t->m[0][2] = tx;
+    t->m[1][0] = 0;
+    t->m[1][1] = 1;
+    t->m[1][2] = ty;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_translate (struct pixman_f_transform *forward,
+                              struct pixman_f_transform *reverse,
+                              double                     tx,
+                              double                     ty)
+{
+    struct pixman_f_transform t;
+
+    if (forward)
+    {
+       pixman_f_transform_init_translate (&t, tx, ty);
+       pixman_f_transform_multiply (forward, &t, forward);
+    }
+
+    if (reverse)
+    {
+       pixman_f_transform_init_translate (&t, -tx, -ty);
+       pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_bounds (const struct pixman_f_transform *t,
+                           struct pixman_box16 *            b)
+{
+    struct pixman_f_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+
+    v[0].v[0] = b->x1;
+    v[0].v[1] = b->y1;
+    v[0].v[2] = 1;
+    v[1].v[0] = b->x2;
+    v[1].v[1] = b->y1;
+    v[1].v[2] = 1;
+    v[2].v[0] = b->x2;
+    v[2].v[1] = b->y2;
+    v[2].v[2] = 1;
+    v[3].v[0] = b->x1;
+    v[3].v[1] = b->y2;
+    v[3].v[2] = 1;
+
+    for (i = 0; i < 4; i++)
+    {
+       if (!pixman_f_transform_point (t, &v[i]))
+           return FALSE;
+
+       x1 = floor (v[i].v[0]);
+       y1 = floor (v[i].v[1]);
+       x2 = ceil (v[i].v[0]);
+       y2 = ceil (v[i].v[1]);
+
+       if (i == 0)
+       {
+           b->x1 = x1;
+           b->y1 = y1;
+           b->x2 = x2;
+           b->y2 = y2;
+       }
+       else
+       {
+           if (x1 < b->x1) b->x1 = x1;
+           if (y1 < b->y1) b->y1 = y1;
+           if (x2 > b->x2) b->x2 = x2;
+           if (y2 > b->y2) b->y2 = y2;
+       }
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_identity (struct pixman_f_transform *t)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+       for (i = 0; i < 3; i++)
+           t->m[j][i] = i == j ? 1 : 0;
+    }
+}
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
new file mode 100644 (file)
index 0000000..f848ab4
--- /dev/null
@@ -0,0 +1,3237 @@
+/*
+ * Copyright © 2004, 2005 Red Hat, Inc.
+ * Copyright © 2004 Nicholas Miell
+ * Copyright © 2005 Trolltech AS
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Søren Sandmann (sandmann@redhat.com)
+ * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
+ * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
+ *
+ * Based on work by Owen Taylor
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+
+#include <mmintrin.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+#define no_vERBOSE /* rename to VERBOSE to enable the CHECKPOINT tracing below */
+
+#ifdef VERBOSE
+#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
+#else
+#define CHECKPOINT()
+#endif
+
+#ifdef USE_ARM_IWMMXT
+/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+#endif
+
+/* Notes about writing MMX code
+ *
+ * Give memory operands as the second operand.  If you give them as
+ * the first, gcc will first load them into a register and then use
+ * that register, i.e. use
+ *
+ *         _mm_mullo_pi16 (x, mmx_constant);
+ *
+ *   not
+ *
+ *         _mm_mullo_pi16 (mmx_constant, x);
+ *
+ * Also try to minimize dependencies: when you need a value, try to
+ * calculate it from a value that was computed as early as possible.
+ */
+
+/* --------------- MMX primitives ------------------------------------- */
+
+#ifdef __GNUC__
+typedef uint64_t mmxdatafield;
+#else
+typedef __m64 mmxdatafield;
+/* If __m64 is defined as a struct or union, define M64_MEMBER to be the
+   name of the member used to access the data */
+# ifdef _MSC_VER
+#  define M64_MEMBER m64_u64
+# elif defined(__SUNPRO_C)
+#  define M64_MEMBER l_
+# endif
+#endif
+
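+/*
+ * Table of 64-bit constants used by the primitives below.  Depending
+ * on the compiler, __m64 is either an integral type or a struct, so
+ * the constants are stored in whatever representation works and are
+ * converted to __m64 through the MC() macro, e.g. MC (4x00ff) for
+ * four lanes of 0x00ff.
+ */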
+typedef struct
+{
+    mmxdatafield mmx_4x00ff;
+    mmxdatafield mmx_4x0080;
+    mmxdatafield mmx_565_rgb;
+    mmxdatafield mmx_565_unpack_multiplier;
+    mmxdatafield mmx_565_r;
+    mmxdatafield mmx_565_g;
+    mmxdatafield mmx_565_b;
+    mmxdatafield mmx_mask_0;
+    mmxdatafield mmx_mask_1;
+    mmxdatafield mmx_mask_2;
+    mmxdatafield mmx_mask_3;
+    mmxdatafield mmx_full_alpha;
+    mmxdatafield mmx_ffff0000ffff0000;
+    mmxdatafield mmx_0000ffff00000000;
+    mmxdatafield mmx_000000000000ffff;
+} mmx_data_t;
+
+#if defined(_MSC_VER)
+# define MMXDATA_INIT(field, val) { val ## UI64 }
+#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
+# define MMXDATA_INIT(field, val) field =   { val ## ULL }
+#else                           /* __m64 is an integral type */
+# define MMXDATA_INIT(field, val) field =   val ## ULL
+#endif
+
+static const mmx_data_t c =
+{
+    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
+    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
+    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
+    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
+    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
+    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
+    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
+    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
+    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
+    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
+    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
+    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
+    MMXDATA_INIT (.mmx_ffff0000ffff0000,         0xffff0000ffff0000),
+    MMXDATA_INIT (.mmx_0000ffff00000000,         0x0000ffff00000000),
+    MMXDATA_INIT (.mmx_000000000000ffff,         0x000000000000ffff),
+};
+
+#ifdef __GNUC__
+#    ifdef __ICC
+#        define MC(x) to_m64 (c.mmx_ ## x)
+#    else
+#        define MC(x) ((__m64)c.mmx_ ## x)
+#    endif
+#else
+#    define MC(x) c.mmx_ ## x
+#endif
+
+static force_inline __m64
+to_m64 (uint64_t x)
+{
+#ifdef __ICC
+    return _mm_cvtsi64_m64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    __m64 res;
+
+    res.M64_MEMBER = x;
+    return res;
+#else                           /* __m64 is an integral type */
+    return (__m64)x;
+#endif
+}
+
+static force_inline uint64_t
+to_uint64 (__m64 x)
+{
+#ifdef __ICC
+    return _mm_cvtm64_si64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    uint64_t res = x.M64_MEMBER;
+    return res;
+#else                           /* __m64 is an integral type */
+    return (uint64_t)x;
+#endif
+}
+
+static force_inline __m64
+shift (__m64 v,
+       int   s)
+{
+    if (s > 0)
+       return _mm_slli_si64 (v, s);
+    else if (s < 0)
+       return _mm_srli_si64 (v, -s);
+    else
+       return v;
+}
+
+static force_inline __m64
+negate (__m64 mask)
+{
+    return _mm_xor_si64 (mask, MC (4x00ff));
+}
+
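+/*
+ * Per-lane approximation of (a * b) / 255 for 8-bit values held in
+ * 16-bit lanes: t = a * b + 0x80; result = (t + (t >> 8)) >> 8.
+ * This is exact for all 8-bit inputs, e.g. a = b = 0xff gives
+ * t = 0xfe81 and (0xfe81 + 0xfe) >> 8 = 0xff.
+ */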
+static force_inline __m64
+pix_multiply (__m64 a, __m64 b)
+{
+    __m64 res;
+
+    res = _mm_mullo_pi16 (a, b);
+    res = _mm_adds_pu16 (res, MC (4x0080));
+    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
+    res = _mm_srli_pi16 (res, 8);
+
+    return res;
+}
+
+static force_inline __m64
+pix_add (__m64 a, __m64 b)
+{
+    return _mm_adds_pu8 (a, b);
+}
+
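+/* Broadcast the alpha lane (the top 16 bits) into all four lanes,
+ * e.g. 0x00ff001100220033 -> 0x00ff00ff00ff00ff.
+ */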
+static force_inline __m64
+expand_alpha (__m64 pixel)
+{
+    __m64 t1, t2;
+
+    t1 = shift (pixel, -48);
+    t2 = shift (t1, 16);
+    t1 = _mm_or_si64 (t1, t2);
+    t2 = shift (t1, 32);
+    t1 = _mm_or_si64 (t1, t2);
+
+    return t1;
+}
+
+static force_inline __m64
+expand_alpha_rev (__m64 pixel)
+{
+    __m64 t1, t2;
+
+    /* move alpha to low 16 bits and zero the rest */
+    t1 = shift (pixel,  48);
+    t1 = shift (t1, -48);
+
+    t2 = shift (t1, 16);
+    t1 = _mm_or_si64 (t1, t2);
+    t2 = shift (t1, 32);
+    t1 = _mm_or_si64 (t1, t2);
+
+    return t1;
+}
+
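+/* Swap the red and blue lanes while leaving green and alpha in
+ * place, turning ARGB lane order into ABGR and vice versa.
+ */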
+static force_inline __m64
+invert_colors (__m64 pixel)
+{
+    __m64 x, y, z;
+
+    x = y = z = pixel;
+
+    x = _mm_and_si64 (x, MC (ffff0000ffff0000));
+    y = _mm_and_si64 (y, MC (000000000000ffff));
+    z = _mm_and_si64 (z, MC (0000ffff00000000));
+
+    y = shift (y, 32);
+    z = shift (z, -32);
+
+    x = _mm_or_si64 (x, y);
+    x = _mm_or_si64 (x, z);
+
+    return x;
+}
+
+static force_inline __m64
+over (__m64 src,
+      __m64 srca,
+      __m64 dest)
+{
+    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
+}
+
+static force_inline __m64
+over_rev_non_pre (__m64 src, __m64 dest)
+{
+    __m64 srca = expand_alpha (src);
+    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
+
+    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
+}
+
+static force_inline __m64
+in (__m64 src, __m64 mask)
+{
+    return pix_multiply (src, mask);
+}
+
+static force_inline __m64
+in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
+{
+    src = _mm_or_si64 (src, MC (full_alpha));
+
+    return over (in (src, mask), mask, dest);
+}
+
+#ifndef _MSC_VER
+static force_inline __m64
+in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
+{
+    return over (in (src, mask), pix_multiply (srca, mask), dest);
+}
+
+#else
+
+#define in_over(src, srca, mask, dest)                                 \
+    over (in (src, mask), pix_multiply (srca, mask), dest)
+
+#endif
+
+/* Elemental unaligned loads */
+
+static __inline__ uint64_t ldq_u(uint64_t *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *p;
+#elif defined USE_ARM_IWMMXT
+    int align = (uintptr_t)p & 7;
+    __m64 *aligned_p;
+    if (align == 0)
+       return *p;
+    aligned_p = (__m64 *)((uintptr_t)p & ~7);
+    return _mm_align_si64 (aligned_p[0], aligned_p[1], align);
+#else
+    struct __una_u64 { uint64_t x __attribute__((packed)); };
+    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
+    return ptr->x;
+#endif
+}
+
+static __inline__ uint32_t ldl_u(uint32_t *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *p;
+#else
+    struct __una_u32 { uint32_t x __attribute__((packed)); };
+    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
+    return ptr->x;
+#endif
+}
+
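+/* Convert between packed 32-bit ARGB and the unpacked 16-bit-lane
+ * form the arithmetic above expects: load8888 zero-extends each byte
+ * into its own lane; pack8888/store8888 saturate back down.
+ */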
+static force_inline __m64
+load8888 (uint32_t v)
+{
+    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
+}
+
+static force_inline __m64
+pack8888 (__m64 lo, __m64 hi)
+{
+    return _mm_packs_pu16 (lo, hi);
+}
+
+static force_inline uint32_t
+store8888 (__m64 v)
+{
+    return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
+}
+
+/* Expand 16 bits positioned at @pos (0-3) of an MMX register into
+ *
+ *    00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word.
+ */
+static force_inline __m64
+expand565 (__m64 pixel, int pos)
+{
+    __m64 p = pixel;
+    __m64 t1, t2;
+
+    /* move pixel to the low 16 bits and zero the rest */
+    p = shift (shift (p, (3 - pos) * 16), -48);
+
+    t1 = shift (p, 36 - 11);
+    t2 = shift (p, 16 - 5);
+
+    p = _mm_or_si64 (t1, p);
+    p = _mm_or_si64 (t2, p);
+    p = _mm_and_si64 (p, MC (565_rgb));
+
+    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
+    return _mm_srli_pi16 (pixel, 8);
+}
+
+static force_inline __m64
+expand8888 (__m64 in, int pos)
+{
+    if (pos == 0)
+       return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
+    else
+       return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
+}
+
+static force_inline __m64
+expandx888 (__m64 in, int pos)
+{
+    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
+}
+
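+/* The inverse of expand565: take the top bits of the r, g and b
+ * lanes of an expanded pixel, shift them into 5-6-5 position within
+ * 16-bit slot @pos of @target, and merge them with the other three
+ * slots, which are preserved via the mask_0..mask_3 constants.
+ */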
+static force_inline __m64
+pack_565 (__m64 pixel, __m64 target, int pos)
+{
+    __m64 p = pixel;
+    __m64 t = target;
+    __m64 r, g, b;
+
+    r = _mm_and_si64 (p, MC (565_r));
+    g = _mm_and_si64 (p, MC (565_g));
+    b = _mm_and_si64 (p, MC (565_b));
+
+    r = shift (r, -(32 - 8) + pos * 16);
+    g = shift (g, -(16 - 3) + pos * 16);
+    b = shift (b, -(0  + 3) + pos * 16);
+
+    if (pos == 0)
+       t = _mm_and_si64 (t, MC (mask_0));
+    else if (pos == 1)
+       t = _mm_and_si64 (t, MC (mask_1));
+    else if (pos == 2)
+       t = _mm_and_si64 (t, MC (mask_2));
+    else if (pos == 3)
+       t = _mm_and_si64 (t, MC (mask_3));
+
+    p = _mm_or_si64 (r, t);
+    p = _mm_or_si64 (g, p);
+
+    return _mm_or_si64 (b, p);
+}
+
+#ifndef _MSC_VER
+
+static force_inline __m64
+pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
+{
+    x = pix_multiply (x, a);
+    y = pix_multiply (y, b);
+
+    return pix_add (x, y);
+}
+
+#else
+
+#define pix_add_mul(x, a, y, b)         \
+    ( x = pix_multiply (x, a),  \
+      y = pix_multiply (y, b),  \
+      pix_add (x, y) )
+
+#endif
+
+/* --------------- MMX code patch for fbcompose.c --------------------- */
+
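+/* Apply a component mask to a source pixel: when a mask is present,
+ * the source is multiplied by the mask's alpha, which is what all of
+ * the *_u combiners below need before they run their operator.
+ */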
+static force_inline uint32_t
+combine (const uint32_t *src, const uint32_t *mask)
+{
+    uint32_t ssrc = *src;
+
+    if (mask)
+    {
+       __m64 m = load8888 (*mask);
+       __m64 s = load8888 (ssrc);
+
+       m = expand_alpha (m);
+       s = pix_multiply (s, m);
+
+       ssrc = store8888 (s);
+    }
+
+    return ssrc;
+}
+
+static void
+mmx_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       uint32_t ssrc = combine (src, mask);
+       uint32_t a = ssrc >> 24;
+
+       if (a == 0xff)
+       {
+           *dest = ssrc;
+       }
+       else if (ssrc)
+       {
+           __m64 s, sa;
+           s = load8888 (ssrc);
+           sa = expand_alpha (s);
+           *dest = store8888 (over (s, sa, load8888 (*dest)));
+       }
+
+       ++dest;
+       ++src;
+       if (mask)
+           ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 d, da;
+       uint32_t s = combine (src, mask);
+
+       d = load8888 (*dest);
+       da = expand_alpha (d);
+       *dest = store8888 (over (d, da, load8888 (s)));
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 x, a;
+
+       x = load8888 (combine (src, mask));
+       a = load8888 (*dest);
+       a = expand_alpha (a);
+       x = pix_multiply (x, a);
+
+       *dest = store8888 (x);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 x, a;
+
+       x = load8888 (*dest);
+       a = load8888 (combine (src, mask));
+       a = expand_alpha (a);
+       x = pix_multiply (x, a);
+       *dest = store8888 (x);
+
+       ++dest;
+       ++src;
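+/*
+ * Fuzzy fixed-point comparisons.  EPSILON is two 16.16 units, about
+ * 3e-5, so matrices that differ from the exact form only by rounding
+ * in the last couple of bits still qualify.
+ */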
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 x, a;
+
+       x = load8888 (combine (src, mask));
+       a = load8888 (*dest);
+       a = expand_alpha (a);
+       a = negate (a);
+       x = pix_multiply (x, a);
+       *dest = store8888 (x);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 x, a;
+
+       x = load8888 (*dest);
+       a = load8888 (combine (src, mask));
+       a = expand_alpha (a);
+       a = negate (a);
+       x = pix_multiply (x, a);
+
+       *dest = store8888 (x);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 s, da, d, sia;
+
+       s = load8888 (combine (src, mask));
+       d = load8888 (*dest);
+       sia = expand_alpha (s);
+       sia = negate (sia);
+       da = expand_alpha (d);
+       s = pix_add_mul (s, da, d, sia);
+       *dest = store8888 (s);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 s, dia, d, sa;
+
+       s = load8888 (combine (src, mask));
+       d = load8888 (*dest);
+       sa = expand_alpha (s);
+       dia = expand_alpha (d);
+       dia = negate (dia);
+       s = pix_add_mul (s, dia, d, sa);
+       *dest = store8888 (s);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 s, dia, d, sia;
+
+       s = load8888 (combine (src, mask));
+       d = load8888 (*dest);
+       sia = expand_alpha (s);
+       dia = expand_alpha (d);
+       sia = negate (sia);
+       dia = negate (dia);
+       s = pix_add_mul (s, dia, d, sia);
+       *dest = store8888 (s);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       __m64 s, d;
+
+       s = load8888 (combine (src, mask));
+       d = load8888 (*dest);
+       s = pix_add (s, d);
+       *dest = store8888 (s);
+
+       ++dest;
+       ++src;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_saturate_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+       uint32_t s = combine (src, mask);
+       uint32_t d = *dest;
+       __m64 ms = load8888 (s);
+       __m64 md = load8888 (d);
+       uint32_t sa = s >> 24;
+       uint32_t da = ~d >> 24;
+
+       if (sa > da)
+       {
+           __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
+           msa = expand_alpha (msa);
+           ms = pix_multiply (ms, msa);
+       }
+
+       md = pix_add (md, ms);
+       *dest = store8888 (md);
+
+       ++src;
+       ++dest;
+       if (mask)
+           mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+
+       s = pix_multiply (s, a);
+       *dest = store8888 (s);
+
+       ++src;
+       ++mask;
+       ++dest;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 sa = expand_alpha (s);
+
+       *dest = store8888 (in_over (s, sa, a, d));
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 da = expand_alpha (d);
+
+       *dest = store8888 (over (d, da, in (s, a)));
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 da = expand_alpha (d);
+
+       s = pix_multiply (s, a);
+       s = pix_multiply (s, da);
+       *dest = store8888 (s);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 sa = expand_alpha (s);
+
+       a = pix_multiply (a, sa);
+       d = pix_multiply (d, a);
+       *dest = store8888 (d);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 da = expand_alpha (d);
+
+       da = negate (da);
+       s = pix_multiply (s, a);
+       s = pix_multiply (s, da);
+       *dest = store8888 (s);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 sa = expand_alpha (s);
+
+       a = pix_multiply (a, sa);
+       a = negate (a);
+       d = pix_multiply (d, a);
+       *dest = store8888 (d);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 da = expand_alpha (d);
+       __m64 sa = expand_alpha (s);
+
+       s = pix_multiply (s, a);
+       a = pix_multiply (a, sa);
+       a = negate (a);
+       d = pix_add_mul (d, a, s, da);
+       *dest = store8888 (d);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 da = expand_alpha (d);
+       __m64 sa = expand_alpha (s);
+
+       s = pix_multiply (s, a);
+       a = pix_multiply (a, sa);
+       da = negate (da);
+       d = pix_add_mul (d, a, s, da);
+       *dest = store8888 (d);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+       __m64 da = expand_alpha (d);
+       __m64 sa = expand_alpha (s);
+
+       s = pix_multiply (s, a);
+       a = pix_multiply (a, sa);
+       da = negate (da);
+       a = negate (a);
+       d = pix_add_mul (d, a, s, da);
+       *dest = store8888 (d);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+       __m64 a = load8888 (*mask);
+       __m64 s = load8888 (*src);
+       __m64 d = load8888 (*dest);
+
+       s = pix_multiply (s, a);
+       d = pix_add (s, d);
+       *dest = store8888 (d);
+
+       ++src;
+       ++dest;
+       ++mask;
+    }
+    _mm_empty ();
+}
+
+/* ------------- MMX code paths called from fbpict.c -------------------- */
+
+static void
+mmx_composite_over_n_8888 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       w = width;
+
+       CHECKPOINT ();
+
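+       /* Composite the unaligned leading pixels one at a time until
+        * dst is 8-byte aligned, then two pixels per __m64 store, then
+        * the possible odd trailing pixel.  The other fast paths below
+        * follow the same head/body/tail pattern.
+        */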
+       while (w && (unsigned long)dst & 7)
+       {
+           *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 2)
+       {
+           __m64 vdest;
+           __m64 dest0, dest1;
+
+           vdest = *(__m64 *)dst;
+
+           dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
+           dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
+
+           *(__m64 *)dst = pack8888 (dest0, dest1);
+
+           dst += 2;
+           w -= 2;
+       }
+
+       CHECKPOINT ();
+
+       if (w)
+       {
+           *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_0565 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       w = width;
+
+       CHECKPOINT ();
+
+       while (w && (unsigned long)dst & 7)
+       {
+           uint64_t d = *dst;
+           __m64 vdest = expand565 (to_m64 (d), 0);
+
+           vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+           *dst = to_uint64 (vdest);
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 4)
+       {
+           __m64 vdest;
+
+           vdest = *(__m64 *)dst;
+
+           vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
+           vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
+           vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
+           vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
+
+           *(__m64 *)dst = vdest;
+
+           dst += 4;
+           w -= 4;
+       }
+
+       CHECKPOINT ();
+
+       while (w)
+       {
+           uint64_t d = *dst;
+           __m64 vdest = expand565 (to_m64 (d), 0);
+
+           vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+           *dst = to_uint64 (vdest);
+
+           w--;
+           dst++;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       int twidth = width;
+       uint32_t *p = (uint32_t *)mask_line;
+       uint32_t *q = (uint32_t *)dst_line;
+
+       while (twidth && (unsigned long)q & 7)
+       {
+           uint32_t m = *(uint32_t *)p;
+
+           if (m)
+           {
+               __m64 vdest = load8888 (*q);
+               vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
+               *q = store8888 (vdest);
+           }
+
+           twidth--;
+           p++;
+           q++;
+       }
+
+       while (twidth >= 2)
+       {
+           uint32_t m0, m1;
+           m0 = *p;
+           m1 = *(p + 1);
+
+           if (m0 | m1)
+           {
+               __m64 dest0, dest1;
+               __m64 vdest = *(__m64 *)q;
+
+               dest0 = in_over (vsrc, vsrca, load8888 (m0),
+                                expand8888 (vdest, 0));
+               dest1 = in_over (vsrc, vsrca, load8888 (m1),
+                                expand8888 (vdest, 1));
+
+               *(__m64 *)q = pack8888 (dest0, dest1);
+           }
+
+           p += 2;
+           q += 2;
+           twidth -= 2;
+       }
+
+       while (twidth)
+       {
+           uint32_t m = *(uint32_t *)p;
+
+           if (m)
+           {
+               __m64 vdest = load8888 (*q);
+               vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
+               *q = store8888 (vdest);
+           }
+
+           twidth--;
+           p++;
+           q++;
+       }
+
+       dst_line += dst_stride;
+       mask_line += mask_stride;
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+    mask &= 0xff000000;
+    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+    vmask = load8888 (mask);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 7)
+       {
+           __m64 s = load8888 (*src);
+           __m64 d = load8888 (*dst);
+
+           *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
+
+           w--;
+           dst++;
+           src++;
+       }
+
+       while (w >= 2)
+       {
+           __m64 vs = (__m64)ldq_u((uint64_t *)src);
+           __m64 vd = *(__m64 *)dst;
+           __m64 vsrc0 = expand8888 (vs, 0);
+           __m64 vsrc1 = expand8888 (vs, 1);
+
+           *(__m64 *)dst = pack8888 (
+               in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
+               in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
+
+           w -= 2;
+           dst += 2;
+           src += 2;
+       }
+
+       if (w)
+       {
+           __m64 s = load8888 (*src);
+           __m64 d = load8888 (*dst);
+
+           *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+    __m64 srca;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+
+    mask &= 0xff000000;
+    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+    vmask = load8888 (mask);
+    srca = MC (4x00ff);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 7)
+       {
+           __m64 s = load8888 (*src | 0xff000000);
+           __m64 d = load8888 (*dst);
+
+           *dst = store8888 (in_over (s, srca, vmask, d));
+
+           w--;
+           dst++;
+           src++;
+       }
+
+       while (w >= 16)
+       {
+           __m64 vd0 = *(__m64 *)(dst + 0);
+           __m64 vd1 = *(__m64 *)(dst + 2);
+           __m64 vd2 = *(__m64 *)(dst + 4);
+           __m64 vd3 = *(__m64 *)(dst + 6);
+           __m64 vd4 = *(__m64 *)(dst + 8);
+           __m64 vd5 = *(__m64 *)(dst + 10);
+           __m64 vd6 = *(__m64 *)(dst + 12);
+           __m64 vd7 = *(__m64 *)(dst + 14);
+
+           __m64 vs0 = (__m64)ldq_u((uint64_t *)(src + 0));
+           __m64 vs1 = (__m64)ldq_u((uint64_t *)(src + 2));
+           __m64 vs2 = (__m64)ldq_u((uint64_t *)(src + 4));
+           __m64 vs3 = (__m64)ldq_u((uint64_t *)(src + 6));
+           __m64 vs4 = (__m64)ldq_u((uint64_t *)(src + 8));
+           __m64 vs5 = (__m64)ldq_u((uint64_t *)(src + 10));
+           __m64 vs6 = (__m64)ldq_u((uint64_t *)(src + 12));
+           __m64 vs7 = (__m64)ldq_u((uint64_t *)(src + 14));
+
+           vd0 = pack8888 (
+               in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
+               in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
+
+           vd1 = pack8888 (
+               in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
+               in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
+
+           vd2 = pack8888 (
+               in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
+               in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
+
+           vd3 = pack8888 (
+               in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
+               in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
+
+           vd4 = pack8888 (
+               in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
+               in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
+
+           vd5 = pack8888 (
+               in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
+               in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
+
+           vd6 = pack8888 (
+               in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
+               in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
+
+           vd7 = pack8888 (
+               in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
+               in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
+
+           *(__m64 *)(dst + 0) = vd0;
+           *(__m64 *)(dst + 2) = vd1;
+           *(__m64 *)(dst + 4) = vd2;
+           *(__m64 *)(dst + 6) = vd3;
+           *(__m64 *)(dst + 8) = vd4;
+           *(__m64 *)(dst + 10) = vd5;
+           *(__m64 *)(dst + 12) = vd6;
+           *(__m64 *)(dst + 14) = vd7;
+
+           w -= 16;
+           dst += 16;
+           src += 16;
+       }
+
+       while (w)
+       {
+           __m64 s = load8888 (*src | 0xff000000);
+           __m64 d = load8888 (*dst);
+
+           *dst = store8888 (in_over (s, srca, vmask, d));
+
+           w--;
+           dst++;
+           src++;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w--)
+       {
+           s = *src++;
+           a = s >> 24;
+
+           if (a == 0xff)
+           {
+               *dst = s;
+           }
+           else if (s)
+           {
+               __m64 ms, sa;
+               ms = load8888 (s);
+               sa = expand_alpha (ms);
+               *dst = store8888 (over (ms, sa, load8888 (*dst)));
+           }
+
+           dst++;
+       }
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       CHECKPOINT ();
+
+       while (w && (unsigned long)dst & 7)
+       {
+           __m64 vsrc = load8888 (*src);
+           uint64_t d = *dst;
+           __m64 vdest = expand565 (to_m64 (d), 0);
+
+           vdest = pack_565 (
+               over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+           *dst = to_uint64 (vdest);
+
+           w--;
+           dst++;
+           src++;
+       }
+
+       CHECKPOINT ();
+
+       while (w >= 4)
+       {
+           __m64 vsrc0, vsrc1, vsrc2, vsrc3;
+           __m64 vdest;
+
+           vsrc0 = load8888 (*(src + 0));
+           vsrc1 = load8888 (*(src + 1));
+           vsrc2 = load8888 (*(src + 2));
+           vsrc3 = load8888 (*(src + 3));
+
+           vdest = *(__m64 *)dst;
+
+           vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
+           vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
+           vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
+           vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
+
+           *(__m64 *)dst = vdest;
+
+           w -= 4;
+           dst += 4;
+           src += 4;
+       }
+
+       CHECKPOINT ();
+
+       while (w)
+       {
+           __m64 vsrc = load8888 (*src);
+           uint64_t d = *dst;
+           __m64 vdest = expand565 (to_m64 (d), 0);
+
+           vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+           *dst = to_uint64 (vdest);
+
+           w--;
+           dst++;
+           src++;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca;
+    uint64_t srcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       CHECKPOINT ();
+
+       while (w && (unsigned long)dst & 7)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               __m64 vdest = in_over (vsrc, vsrca,
+                                      expand_alpha_rev (to_m64 (m)),
+                                      load8888 (*dst));
+
+               *dst = store8888 (vdest);
+           }
+
+           w--;
+           mask++;
+           dst++;
+       }
+
+       CHECKPOINT ();
+
+       while (w >= 2)
+       {
+           uint64_t m0, m1;
+
+           m0 = *mask;
+           m1 = *(mask + 1);
+
+           if (srca == 0xff && (m0 & m1) == 0xff)
+           {
+               *(uint64_t *)dst = srcsrc;
+           }
+           else if (m0 | m1)
+           {
+               __m64 vdest;
+               __m64 dest0, dest1;
+
+               vdest = *(__m64 *)dst;
+
+               dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
+                                expand8888 (vdest, 0));
+               dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
+                                expand8888 (vdest, 1));
+
+               *(__m64 *)dst = pack8888 (dest0, dest1);
+           }
+
+           mask += 2;
+           dst += 2;
+           w -= 2;
+       }
+
+       CHECKPOINT ();
+
+       if (w)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               __m64 vdest = load8888 (*dst);
+
+               vdest = in_over (
+                   vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
+               *dst = store8888 (vdest);
+           }
+       }
+    }
+
+    _mm_empty ();
+}
+
+pixman_bool_t
+pixman_fill_mmx (uint32_t *bits,
+                 int       stride,
+                 int       bpp,
+                 int       x,
+                 int       y,
+                 int       width,
+                 int       height,
+                 uint32_t xor)
+{
+    uint64_t fill;
+    __m64 vfill;
+    uint32_t byte_width;
+    uint8_t     *byte_line;
+
+#if defined __GNUC__ && defined USE_X86_MMX
+    __m64 v1, v2, v3, v4, v5, v6, v7;
+#endif
+
+    if (bpp != 16 && bpp != 32 && bpp != 8)
+       return FALSE;
+
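+    /* Each branch rescales the uint32_t rowstride to the pixel size,
+     * locates the first byte of the fill rectangle, converts the stride
+     * to bytes, and (for 8 and 16 bpp) replicates the fill value so the
+     * same pattern repeats in every byte of a wider store.
+     */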
+    if (bpp == 8)
+    {
+       stride = stride * (int) sizeof (uint32_t) / 1;
+       byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+       byte_width = width;
+       stride *= 1;
+        xor = (xor & 0xff) * 0x01010101;
+    }
+    else if (bpp == 16)
+    {
+       stride = stride * (int) sizeof (uint32_t) / 2;
+       byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+       byte_width = 2 * width;
+       stride *= 2;
+        xor = (xor & 0xffff) * 0x00010001;
+    }
+    else
+    {
+       stride = stride * (int) sizeof (uint32_t) / 4;
+       byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+       byte_width = 4 * width;
+       stride *= 4;
+    }
+
+    fill = ((uint64_t)xor << 32) | xor;
+    vfill = to_m64 (fill);
+
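+    /* With GCC on x86 MMX, preload the fill pattern into seven more MMX
+     * registers so that the 64-byte inner loop below issues nothing but
+     * eight back-to-back movq stores.
+     */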
+#if defined __GNUC__ && defined USE_X86_MMX
+    __asm__ (
+        "movq          %7,     %0\n"
+        "movq          %7,     %1\n"
+        "movq          %7,     %2\n"
+        "movq          %7,     %3\n"
+        "movq          %7,     %4\n"
+        "movq          %7,     %5\n"
+        "movq          %7,     %6\n"
+       : "=&y" (v1), "=&y" (v2), "=&y" (v3),
+         "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
+       : "y" (vfill));
+#endif
+
+    while (height--)
+    {
+       int w;
+       uint8_t *d = byte_line;
+
+       byte_line += stride;
+       w = byte_width;
+
+       while (w >= 1 && ((unsigned long)d & 1))
+       {
+           *(uint8_t *)d = (xor & 0xff);
+           w--;
+           d++;
+       }
+
+       while (w >= 2 && ((unsigned long)d & 3))
+       {
+           *(uint16_t *)d = xor;
+           w -= 2;
+           d += 2;
+       }
+
+       while (w >= 4 && ((unsigned long)d & 7))
+       {
+           *(uint32_t *)d = xor;
+
+           w -= 4;
+           d += 4;
+       }
+
+       while (w >= 64)
+       {
+#if defined __GNUC__ && defined USE_X86_MMX
+           __asm__ (
+               "movq   %1,       (%0)\n"
+               "movq   %2,      8(%0)\n"
+               "movq   %3,     16(%0)\n"
+               "movq   %4,     24(%0)\n"
+               "movq   %5,     32(%0)\n"
+               "movq   %6,     40(%0)\n"
+               "movq   %7,     48(%0)\n"
+               "movq   %8,     56(%0)\n"
+               :
+               : "r" (d),
+                 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
+                 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
+               : "memory");
+#else
+           *(__m64*) (d +  0) = vfill;
+           *(__m64*) (d +  8) = vfill;
+           *(__m64*) (d + 16) = vfill;
+           *(__m64*) (d + 24) = vfill;
+           *(__m64*) (d + 32) = vfill;
+           *(__m64*) (d + 40) = vfill;
+           *(__m64*) (d + 48) = vfill;
+           *(__m64*) (d + 56) = vfill;
+#endif
+           w -= 64;
+           d += 64;
+       }
+
+       while (w >= 4)
+       {
+           *(uint32_t *)d = xor;
+
+           w -= 4;
+           d += 4;
+       }
+       while (w >= 2)
+       {
+           *(uint16_t *)d = xor;
+           w -= 2;
+           d += 2;
+       }
+       while (w >= 1)
+       {
+           *(uint8_t *)d = (xor & 0xff);
+           w--;
+           d++;
+       }
+
+    }
+
+    _mm_empty ();
+    return TRUE;
+}
+
+static void
+mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc;
+    uint64_t srcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+       pixman_fill_mmx (dest_image->bits.bits, dest_image->bits.rowstride,
+                        PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                        dest_x, dest_y, width, height, 0);
+       return;
+    }
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       CHECKPOINT ();
+
+       while (w && (unsigned long)dst & 7)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
+
+               *dst = store8888 (vdest);
+           }
+           else
+           {
+               *dst = 0;
+           }
+
+           w--;
+           mask++;
+           dst++;
+       }
+
+       CHECKPOINT ();
+
+       while (w >= 2)
+       {
+           uint64_t m0, m1;
+           m0 = *mask;
+           m1 = *(mask + 1);
+
+           if (srca == 0xff && (m0 & m1) == 0xff)
+           {
+               *(uint64_t *)dst = srcsrc;
+           }
+           else if (m0 | m1)
+           {
+               __m64 dest0, dest1;
+
+               dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
+               dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
+
+               *(__m64 *)dst = pack8888 (dest0, dest1);
+           }
+           else
+           {
+               *(uint64_t *)dst = 0;
+           }
+
+           mask += 2;
+           dst += 2;
+           w -= 2;
+       }
+
+       CHECKPOINT ();
+
+       if (w)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
+
+               *dst = store8888 (vdest);
+           }
+           else
+           {
+               *dst = 0;
+           }
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca, tmp;
+    uint64_t srcsrcsrcsrc, src16;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
+    src16 = to_uint64 (tmp);
+
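+    /* Replicate the packed r5g6b5 source into all four 16-bit lanes so
+     * that four fully opaque pixels can be stored at once.
+     */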
+    srcsrcsrcsrc =
+       (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
+       (uint64_t)src16 << 16 | (uint64_t)src16;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       CHECKPOINT ();
+
+       while (w && (unsigned long)dst & 7)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               uint64_t d = *dst;
+               __m64 vd = to_m64 (d);
+               __m64 vdest = in_over (
+                   vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
+
+               vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
+               *dst = to_uint64 (vd);
+           }
+
+           w--;
+           mask++;
+           dst++;
+       }
+
+       CHECKPOINT ();
+
+       while (w >= 4)
+       {
+           uint64_t m0, m1, m2, m3;
+           m0 = *mask;
+           m1 = *(mask + 1);
+           m2 = *(mask + 2);
+           m3 = *(mask + 3);
+
+           if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
+           {
+               *(uint64_t *)dst = srcsrcsrcsrc;
+           }
+           else if (m0 | m1 | m2 | m3)
+           {
+               __m64 vdest;
+               __m64 vm0, vm1, vm2, vm3;
+
+               vdest = *(__m64 *)dst;
+
+               vm0 = to_m64 (m0);
+               vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
+                                          expand565 (vdest, 0)), vdest, 0);
+               vm1 = to_m64 (m1);
+               vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
+                                          expand565 (vdest, 1)), vdest, 1);
+               vm2 = to_m64 (m2);
+               vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
+                                          expand565 (vdest, 2)), vdest, 2);
+               vm3 = to_m64 (m3);
+               vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
+                                          expand565 (vdest, 3)), vdest, 3);
+
+               *(__m64 *)dst = vdest;
+           }
+
+           w -= 4;
+           mask += 4;
+           dst += 4;
+       }
+
+       CHECKPOINT ();
+
+       while (w)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               uint64_t d = *dst;
+               __m64 vd = to_m64 (d);
+               __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
+                                      expand565 (vd, 0));
+               vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
+               *dst = to_uint64 (vd);
+           }
+
+           w--;
+           mask++;
+           dst++;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
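+    /* Pixbuf sources are non-premultiplied, with their color channels in
+     * the opposite order from the destination: fully opaque pixels only
+     * need invert_colors (), everything else goes through
+     * over_rev_non_pre ().
+     */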
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       CHECKPOINT ();
+
+       while (w && (unsigned long)dst & 7)
+       {
+           __m64 vsrc = load8888 (*src);
+           uint64_t d = *dst;
+           __m64 vdest = expand565 (to_m64 (d), 0);
+
+           vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
+
+           *dst = to_uint64 (vdest);
+
+           w--;
+           dst++;
+           src++;
+       }
+
+       CHECKPOINT ();
+
+       while (w >= 4)
+       {
+           uint32_t s0, s1, s2, s3;
+           unsigned char a0, a1, a2, a3;
+
+           s0 = *src;
+           s1 = *(src + 1);
+           s2 = *(src + 2);
+           s3 = *(src + 3);
+
+           a0 = (s0 >> 24);
+           a1 = (s1 >> 24);
+           a2 = (s2 >> 24);
+           a3 = (s3 >> 24);
+
+           if ((a0 & a1 & a2 & a3) == 0xFF)
+           {
+               __m64 vdest;
+               vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
+               vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
+               vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
+               vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);
+
+               *(__m64 *)dst = vdest;
+           }
+           else if (s0 | s1 | s2 | s3)
+           {
+               __m64 vdest = *(__m64 *)dst;
+
+               vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
+               vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
+               vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
+               vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);
+
+               *(__m64 *)dst = vdest;
+           }
+
+           w -= 4;
+           dst += 4;
+           src += 4;
+       }
+
+       CHECKPOINT ();
+
+       while (w)
+       {
+           __m64 vsrc = load8888 (*src);
+           uint64_t d = *dst;
+           __m64 vdest = expand565 (to_m64 (d), 0);
+
+           vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
+
+           *dst = to_uint64 (vdest);
+
+           w--;
+           dst++;
+           src++;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 7)
+       {
+           __m64 s = load8888 (*src);
+           __m64 d = load8888 (*dst);
+
+           *dst = store8888 (over_rev_non_pre (s, d));
+
+           w--;
+           dst++;
+           src++;
+       }
+
+       while (w >= 2)
+       {
+           uint64_t s0, s1;
+           unsigned char a0, a1;
+           __m64 d0, d1;
+
+           s0 = *src;
+           s1 = *(src + 1);
+
+           a0 = (s0 >> 24);
+           a1 = (s1 >> 24);
+
+           if ((a0 & a1) == 0xFF)
+           {
+               d0 = invert_colors (load8888 (s0));
+               d1 = invert_colors (load8888 (s1));
+
+               *(__m64 *)dst = pack8888 (d0, d1);
+           }
+           else if (s0 | s1)
+           {
+               __m64 vdest = *(__m64 *)dst;
+
+               d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
+               d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));
+
+               *(__m64 *)dst = pack8888 (d0, d1);
+           }
+
+           w -= 2;
+           dst += 2;
+           src += 2;
+       }
+
+       if (w)
+       {
+           __m64 s = load8888 (*src);
+           __m64 d = load8888 (*dst);
+
+           *dst = store8888 (over_rev_non_pre (s, d));
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       int twidth = width;
+       uint32_t *p = (uint32_t *)mask_line;
+       uint16_t *q = (uint16_t *)dst_line;
+
+       while (twidth && ((unsigned long)q & 7))
+       {
+           uint32_t m = *(uint32_t *)p;
+
+           if (m)
+           {
+               uint64_t d = *q;
+               __m64 vdest = expand565 (to_m64 (d), 0);
+               vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
+               *q = to_uint64 (vdest);
+           }
+
+           twidth--;
+           p++;
+           q++;
+       }
+
+       while (twidth >= 4)
+       {
+           uint32_t m0, m1, m2, m3;
+
+           m0 = *p;
+           m1 = *(p + 1);
+           m2 = *(p + 2);
+           m3 = *(p + 3);
+
+           if ((m0 | m1 | m2 | m3))
+           {
+               __m64 vdest = *(__m64 *)q;
+
+               vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
+               vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
+               vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
+               vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
+
+               *(__m64 *)q = vdest;
+           }
+           twidth -= 4;
+           p += 4;
+           q += 4;
+       }
+
+       while (twidth)
+       {
+           uint32_t m;
+
+           m = *(uint32_t *)p;
+           if (m)
+           {
+               uint64_t d = *q;
+               __m64 vdest = expand565 (to_m64 (d), 0);
+               vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
+               *q = to_uint64 (vdest);
+           }
+
+           twidth--;
+           p++;
+           q++;
+       }
+
+       mask_line += mask_stride;
+       dst_line += dst_stride;
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
+                        pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+    __m64 vsrc, vsrca;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    sa = src >> 24;
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 7)
+       {
+           uint16_t tmp;
+           uint8_t a;
+           uint32_t m, d;
+
+           a = *mask++;
+           d = *dst;
+
+           m = MUL_UN8 (sa, a, tmp);
+           d = MUL_UN8 (m, d, tmp);
+
+           *dst++ = d;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           __m64 vmask;
+           __m64 vdest;
+
+           vmask = load8888 (ldl_u((uint32_t *)mask));
+           vdest = load8888 (*(uint32_t *)dst);
+
+           *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
+
+           dst += 4;
+           mask += 4;
+           w -= 4;
+       }
+
+       while (w--)
+       {
+           uint16_t tmp;
+           uint8_t a;
+           uint32_t m, d;
+
+           a = *mask++;
+           d = *dst;
+
+           m = MUL_UN8 (sa, a, tmp);
+           d = MUL_UN8 (m, d, tmp);
+
+           *dst++ = d;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_in_8_8 (pixman_implementation_t *imp,
+                      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 3)
+       {
+           uint8_t s, d;
+           uint16_t tmp;
+
+           s = *src;
+           d = *dst;
+
+           *dst = MUL_UN8 (s, d, tmp);
+
+           src++;
+           dst++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           uint32_t *s = (uint32_t *)src;
+           uint32_t *d = (uint32_t *)dst;
+
+           *d = store8888 (in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d)));
+
+           w -= 4;
+           dst += 4;
+           src += 4;
+       }
+
+       while (w--)
+       {
+           uint8_t s, d;
+           uint16_t tmp;
+
+           s = *src;
+           d = *dst;
+
+           *dst = MUL_UN8 (s, d, tmp);
+
+           src++;
+           dst++;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
+                        pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+    __m64 vsrc, vsrca;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    sa = src >> 24;
+
+    if (src == 0)
+       return;
+
+    vsrc = load8888 (src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 3)
+       {
+           uint16_t tmp;
+           uint16_t a;
+           uint32_t m, d;
+           uint32_t r;
+
+           a = *mask++;
+           d = *dst;
+
+           m = MUL_UN8 (sa, a, tmp);
+           r = ADD_UN8 (m, d, tmp);
+
+           *dst++ = r;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           __m64 vmask;
+           __m64 vdest;
+
+           vmask = load8888 (ldl_u((uint32_t *)mask));
+           vdest = load8888 (*(uint32_t *)dst);
+
+           *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
+
+           dst += 4;
+           mask += 4;
+           w -= 4;
+       }
+
+       while (w--)
+       {
+           uint16_t tmp;
+           uint16_t a;
+           uint32_t m, d;
+           uint32_t r;
+
+           a = *mask++;
+           d = *dst;
+
+           m = MUL_UN8 (sa, a, tmp);
+           r = ADD_UN8 (m, d, tmp);
+
+           *dst++ = r;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_8_8 (pixman_implementation_t *imp,
+                      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
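+       /* Scalar saturating add: when t = s + d exceeds 0xff, (t >> 8)
+        * is 1, so (0 - (t >> 8)) is all ones and the stored byte clamps
+        * to 0xff.
+        */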
+       while (w && (unsigned long)dst & 7)
+       {
+           s = *src;
+           d = *dst;
+           t = d + s;
+           s = t | (0 - (t >> 8));
+           *dst = s;
+
+           dst++;
+           src++;
+           w--;
+       }
+
+       while (w >= 8)
+       {
+           *(__m64*)dst = _mm_adds_pu8 ((__m64)ldq_u((uint64_t *)src), *(__m64*)dst);
+           dst += 8;
+           src += 8;
+           w -= 8;
+       }
+
+       while (w)
+       {
+           s = *src;
+           d = *dst;
+           t = d + s;
+           s = t | (0 - (t >> 8));
+           *dst = s;
+
+           dst++;
+           src++;
+           w--;
+       }
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    __m64 dst64;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 7)
+       {
+           *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
+                                                  _mm_cvtsi32_si64 (*dst)));
+           dst++;
+           src++;
+           w--;
+       }
+
+       while (w >= 2)
+       {
+           dst64 = _mm_adds_pu8 ((__m64)ldq_u((uint64_t *)src), *(__m64*)dst);
+           *(uint64_t*)dst = to_uint64 (dst64);
+           dst += 2;
+           src += 2;
+           w -= 2;
+       }
+
+       if (w)
+       {
+           *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
+                                                  _mm_cvtsi32_si64 (*dst)));
+       }
+    }
+
+    _mm_empty ();
+}
+
+static pixman_bool_t
+pixman_blt_mmx (uint32_t *src_bits,
+                uint32_t *dst_bits,
+                int       src_stride,
+                int       dst_stride,
+                int       src_bpp,
+                int       dst_bpp,
+                int       src_x,
+                int       src_y,
+                int       dest_x,
+                int       dest_y,
+                int       width,
+                int       height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+
+    if (src_bpp != dst_bpp)
+       return FALSE;
+
+    if (src_bpp == 16)
+    {
+       src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+       dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+       src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+       byte_width = 2 * width;
+       src_stride *= 2;
+       dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+       src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+       dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+       src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+       byte_width = 4 * width;
+       src_stride *= 4;
+       dst_stride *= 4;
+    }
+    else
+    {
+       return FALSE;
+    }
+
+    while (height--)
+    {
+       int w;
+       uint8_t *s = src_bytes;
+       uint8_t *d = dst_bytes;
+       src_bytes += src_stride;
+       dst_bytes += dst_stride;
+       w = byte_width;
+
+       while (w >= 1 && ((unsigned long)d & 1))
+       {
+           *(uint8_t *)d = *(uint8_t *)s;
+           w -= 1;
+           s += 1;
+           d += 1;
+       }
+
+       while (w >= 2 && ((unsigned long)d & 3))
+       {
+           *(uint16_t *)d = *(uint16_t *)s;
+           w -= 2;
+           s += 2;
+           d += 2;
+       }
+
+       while (w >= 4 && ((unsigned long)d & 7))
+       {
+           *(uint32_t *)d = ldl_u((uint32_t *)s);
+
+           w -= 4;
+           s += 4;
+           d += 4;
+       }
+
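+       /* Main loop: copy 64 bytes per iteration.  With GCC or a recent
+        * Sun Studio compiler the data is staged through all eight MMX
+        * registers; otherwise plain __m64 loads and stores are used.
+        */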
+       while (w >= 64)
+       {
+#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
+           __asm__ (
+               "movq     (%1),   %%mm0\n"
+               "movq    8(%1),   %%mm1\n"
+               "movq   16(%1),   %%mm2\n"
+               "movq   24(%1),   %%mm3\n"
+               "movq   32(%1),   %%mm4\n"
+               "movq   40(%1),   %%mm5\n"
+               "movq   48(%1),   %%mm6\n"
+               "movq   56(%1),   %%mm7\n"
+
+               "movq   %%mm0,    (%0)\n"
+               "movq   %%mm1,   8(%0)\n"
+               "movq   %%mm2,  16(%0)\n"
+               "movq   %%mm3,  24(%0)\n"
+               "movq   %%mm4,  32(%0)\n"
+               "movq   %%mm5,  40(%0)\n"
+               "movq   %%mm6,  48(%0)\n"
+               "movq   %%mm7,  56(%0)\n"
+               :
+               : "r" (d), "r" (s)
+               : "memory",
+                 "%mm0", "%mm1", "%mm2", "%mm3",
+                 "%mm4", "%mm5", "%mm6", "%mm7");
+#else
+           __m64 v0 = ldq_u((uint64_t *)(s + 0));
+           __m64 v1 = ldq_u((uint64_t *)(s + 8));
+           __m64 v2 = ldq_u((uint64_t *)(s + 16));
+           __m64 v3 = ldq_u((uint64_t *)(s + 24));
+           __m64 v4 = ldq_u((uint64_t *)(s + 32));
+           __m64 v5 = ldq_u((uint64_t *)(s + 40));
+           __m64 v6 = ldq_u((uint64_t *)(s + 48));
+           __m64 v7 = ldq_u((uint64_t *)(s + 56));
+           *(__m64 *)(d + 0)  = v0;
+           *(__m64 *)(d + 8)  = v1;
+           *(__m64 *)(d + 16) = v2;
+           *(__m64 *)(d + 24) = v3;
+           *(__m64 *)(d + 32) = v4;
+           *(__m64 *)(d + 40) = v5;
+           *(__m64 *)(d + 48) = v6;
+           *(__m64 *)(d + 56) = v7;
+#endif
+
+           w -= 64;
+           s += 64;
+           d += 64;
+       }
+       while (w >= 4)
+       {
+           *(uint32_t *)d = ldl_u((uint32_t *)s);
+
+           w -= 4;
+           s += 4;
+           d += 4;
+       }
+       if (w >= 2)
+       {
+           *(uint16_t *)d = *(uint16_t *)s;
+           w -= 2;
+           s += 2;
+           d += 2;
+       }
+    }
+
+    _mm_empty ();
+
+    return TRUE;
+}
+
+static void
+mmx_composite_copy_area (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+
+    pixman_blt_mmx (src_image->bits.bits,
+                    dest_image->bits.bits,
+                    src_image->bits.rowstride,
+                    dest_image->bits.rowstride,
+                    PIXMAN_FORMAT_BPP (src_image->bits.format),
+                    PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                    src_x, src_y, dest_x, dest_y, width, height);
+}
+
+#if 0
+static void
+mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  *src, *src_line;
+    uint32_t  *dst, *dst_line;
+    uint8_t  *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       src = src_line;
+       src_line += src_stride;
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+
+       w = width;
+
+       while (w--)
+       {
+           uint64_t m = *mask;
+
+           if (m)
+           {
+               __m64 s = load8888 (*src | 0xff000000);
+
+               if (m == 0xff)
+               {
+                   *dst = store8888 (s);
+               }
+               else
+               {
+                   __m64 sa = expand_alpha (s);
+                   __m64 vm = expand_alpha_rev (to_m64 (m));
+                   __m64 vdest = in_over (s, sa, vm, load8888 (*dst));
+
+                   *dst = store8888 (vdest);
+               }
+           }
+
+           mask++;
+           dst++;
+           src++;
+       }
+    }
+
+    _mm_empty ();
+}
+#endif
+
+static const pixman_fast_path_t mmx_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
+#if 0
+    /* FIXME: This code is commented out since it's apparently
+     * not actually faster than the generic code.
+     */
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
+#endif
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
+
+    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
+    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
+
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
+
+    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
+    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
+
+    { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+mmx_blt (pixman_implementation_t *imp,
+         uint32_t *               src_bits,
+         uint32_t *               dst_bits,
+         int                      src_stride,
+         int                      dst_stride,
+         int                      src_bpp,
+         int                      dst_bpp,
+         int                      src_x,
+         int                      src_y,
+         int                      dest_x,
+         int                      dest_y,
+         int                      width,
+         int                      height)
+{
+    if (!pixman_blt_mmx (
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+
+    {
+       return _pixman_implementation_blt (
+           imp->delegate,
+           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+           src_x, src_y, dest_x, dest_y, width, height);
+    }
+
+    return TRUE;
+}
+
+static pixman_bool_t
+mmx_fill (pixman_implementation_t *imp,
+          uint32_t *               bits,
+          int                      stride,
+          int                      bpp,
+          int                      x,
+          int                      y,
+          int                      width,
+          int                      height,
+          uint32_t xor)
+{
+    if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
+    {
+       return _pixman_implementation_fill (
+           imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    }
+
+    return TRUE;
+}
+
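+/* Build the MMX implementation on top of a fallback: the fast paths and
+ * combiners are overridden here, while mmx_blt and mmx_fill above hand
+ * anything they cannot handle to the delegate.
+ */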
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
+
+    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
+
+    imp->blt = mmx_blt;
+    imp->fill = mmx_fill;
+
+    return imp;
+}
+
+#endif /* USE_X86_MMX || USE_ARM_IWMMXT */
diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
new file mode 100644 (file)
index 0000000..906a491
--- /dev/null
@@ -0,0 +1,137 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2011 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static void
+noop_composite (pixman_implementation_t *imp,
+               pixman_composite_info_t *info)
+{
+    return;
+}
+
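+/* When an iterator can point straight at the image bits, fetching a
+ * scanline just returns the current pointer and writing back is a
+ * rowstride bump; no format conversion or copying happens.
+ */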
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    iter->buffer += iter->image->bits.rowstride;
+}
+
+static uint32_t *
+noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *result = iter->buffer;
+
+    iter->buffer += iter->image->bits.rowstride;
+
+    return result;
+}
+
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return NULL;
+}
+
+static void
+noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+#define FLAGS                                          \
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+
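+    /* A missing image yields NULL scanlines; a source whose alpha and
+     * RGB are both ignored needs no fetching at all; an untransformed,
+     * fully in-bounds a8r8g8b8 image can be scanned in place.  Every
+     * other case is delegated.
+     */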
+    if (!image)
+    {
+       iter->get_scanline = get_scanline_null;
+    }
+    else if ((iter->flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+            (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+       iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else if ((iter->flags & ITER_NARROW)                               &&
+            (image->common.flags & FLAGS) == FLAGS                     &&
+            iter->x >= 0 && iter->y >= 0                               &&
+            iter->x + iter->width <= image->bits.width                 &&
+            iter->y + iter->height <= image->bits.height               &&
+            image->common.extended_format_code == PIXMAN_a8r8g8b8)
+    {
+       iter->buffer =
+           image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+
+       iter->get_scanline = noop_get_scanline;
+    }
+    else
+    {
+       (* imp->delegate->src_iter_init) (imp->delegate, iter);
+    }
+}
+
+static void
+noop_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    uint32_t image_flags = image->common.flags;
+    uint32_t iter_flags = iter->flags;
+    
+    if ((image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS   &&
+       (iter_flags & ITER_NARROW) == ITER_NARROW                               &&
+       ((image->common.extended_format_code == PIXMAN_a8r8g8b8)        ||
+        (image->common.extended_format_code == PIXMAN_x8r8g8b8 &&
+         (iter_flags & (ITER_LOCALIZED_ALPHA)))))
+    {
+       iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+
+       iter->get_scanline = _pixman_iter_get_scanline_noop;
+       iter->write_back = dest_write_back_direct;
+    }
+    else
+    {
+       (* imp->delegate->dest_iter_init) (imp->delegate, iter);
+    }
+}
+
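+/* PIXMAN_OP_DST leaves the destination untouched for any format
+ * combination, so the composite routine can simply do nothing.
+ */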
+static const pixman_fast_path_t noop_fast_paths[] =
+{
+    { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite },
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+       _pixman_implementation_create (fallback, noop_fast_paths);
+
+    imp->src_iter_init = noop_src_iter_init;
+    imp->dest_iter_init = noop_dest_iter_init;
+
+    return imp;
+}
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
new file mode 100644 (file)
index 0000000..cbd48f3
--- /dev/null
@@ -0,0 +1,1001 @@
+#ifndef PACKAGE
+#  error config.h must be included before pixman-private.h
+#endif
+
+#ifndef PIXMAN_PRIVATE_H
+#define PIXMAN_PRIVATE_H
+
+#define PIXMAN_DISABLE_DEPRECATED
+#define PIXMAN_USE_INTERNAL_API
+
+#include "pixman.h"
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "pixman-compiler.h"
+
+/*
+ * Images
+ */
+typedef struct image_common image_common_t;
+typedef struct solid_fill solid_fill_t;
+typedef struct gradient gradient_t;
+typedef struct linear_gradient linear_gradient_t;
+typedef struct horizontal_gradient horizontal_gradient_t;
+typedef struct vertical_gradient vertical_gradient_t;
+typedef struct conical_gradient conical_gradient_t;
+typedef struct radial_gradient radial_gradient_t;
+typedef struct bits_image bits_image_t;
+typedef struct circle circle_t;
+
+typedef void (*fetch_scanline_t) (pixman_image_t *image,
+                                 int             x,
+                                 int             y,
+                                 int             width,
+                                 uint32_t       *buffer,
+                                 const uint32_t *mask);
+
+typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image,
+                                     int           x,
+                                     int           y);
+
+typedef uint64_t (*fetch_pixel_64_t) (bits_image_t *image,
+                                     int           x,
+                                     int           y);
+
+typedef void (*store_scanline_t) (bits_image_t *  image,
+                                 int             x,
+                                 int             y,
+                                 int             width,
+                                 const uint32_t *values);
+
+typedef enum
+{
+    BITS,
+    LINEAR,
+    CONICAL,
+    RADIAL,
+    SOLID
+} image_type_t;
+
+typedef void (*property_changed_func_t) (pixman_image_t *image);
+
+struct image_common
+{
+    image_type_t                type;
+    int32_t                     ref_count;
+    pixman_region32_t           clip_region;
+    int32_t                    alpha_count;        /* How many times this image is being used as an alpha map */
+    pixman_bool_t               have_clip_region;   /* FALSE if there is no clip */
+    pixman_bool_t               client_clip;        /* Whether the source clip was
+                                                      set by a client */
+    pixman_bool_t               clip_sources;       /* Whether the clip applies when
+                                                    * the image is used as a source
+                                                    */
+    pixman_bool_t              dirty;
+    pixman_transform_t *        transform;
+    pixman_repeat_t             repeat;
+    pixman_filter_t             filter;
+    pixman_fixed_t *            filter_params;
+    int                         n_filter_params;
+    bits_image_t *              alpha_map;
+    int                         alpha_origin_x;
+    int                         alpha_origin_y;
+    pixman_bool_t               component_alpha;
+    property_changed_func_t     property_changed;
+
+    pixman_image_destroy_func_t destroy_func;
+    void *                      destroy_data;
+
+    uint32_t                   flags;
+    pixman_format_code_t       extended_format_code;
+};
+
+struct solid_fill
+{
+    image_common_t common;
+    pixman_color_t color;
+    
+    uint32_t      color_32;
+    uint64_t      color_64;
+};
+
+struct gradient
+{
+    image_common_t         common;
+    int                     n_stops;
+    pixman_gradient_stop_t *stops;
+};
+
+struct linear_gradient
+{
+    gradient_t           common;
+    pixman_point_fixed_t p1;
+    pixman_point_fixed_t p2;
+};
+
+struct circle
+{
+    pixman_fixed_t x;
+    pixman_fixed_t y;
+    pixman_fixed_t radius;
+};
+
+struct radial_gradient
+{
+    gradient_t common;
+
+    circle_t   c1;
+    circle_t   c2;
+
+    circle_t   delta;
+    double     a;
+    double     inva;
+    double     mindr;
+};
+
+struct conical_gradient
+{
+    gradient_t           common;
+    pixman_point_fixed_t center;
+    double              angle;
+};
+
+struct bits_image
+{
+    image_common_t             common;
+    pixman_format_code_t       format;
+    const pixman_indexed_t *   indexed;
+    int                        width;
+    int                        height;
+    uint32_t *                 bits;
+    uint32_t *                 free_me;
+    int                        rowstride;  /* in number of uint32_t's */
+
+    fetch_scanline_t           get_scanline_32;
+    fetch_scanline_t           get_scanline_64;
+
+    fetch_scanline_t           fetch_scanline_32;
+    fetch_pixel_32_t          fetch_pixel_32;
+    store_scanline_t           store_scanline_32;
+
+    fetch_scanline_t           fetch_scanline_64;
+    fetch_pixel_64_t          fetch_pixel_64;
+    store_scanline_t           store_scanline_64;
+
+    /* Used for indirect access to the bits */
+    pixman_read_memory_func_t  read_func;
+    pixman_write_memory_func_t write_func;
+};
+
+union pixman_image
+{
+    image_type_t       type;
+    image_common_t     common;
+    bits_image_t       bits;
+    gradient_t         gradient;
+    linear_gradient_t  linear;
+    conical_gradient_t conical;
+    radial_gradient_t  radial;
+    solid_fill_t       solid;
+};
+
+typedef struct pixman_iter_t pixman_iter_t;
+typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask);
+typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
+
+typedef enum
+{
+    ITER_NARROW =              (1 << 0),
+
+    /* "Localized alpha" is when the alpha channel is used only to compute
+     * the alpha value of the destination. This means that the computation
+     * of the RGB values of the result is independent of the alpha value.
+     *
+     * For example, the OVER operator has localized alpha for the
+     * destination, because the RGB values of the result can be computed
+     * without knowing the destination alpha. Similarly, ADD has localized
+     * alpha for both source and destination because the RGB values of the
+     * result can be computed without knowing the alpha value of source or
+     * destination.
+     *
+     * When the destination is xRGB, this is useful knowledge, because then
+     * we can treat it as if it were ARGB, which means in some cases we can
+     * avoid copying it to a temporary buffer.
+     */
+    ITER_LOCALIZED_ALPHA =     (1 << 1),
+    ITER_IGNORE_ALPHA =                (1 << 2),
+    ITER_IGNORE_RGB =          (1 << 3)
+} iter_flags_t;
+
+struct pixman_iter_t
+{
+    /* These are initialized by _pixman_implementation_{src,dest}_init */
+    pixman_image_t *           image;
+    uint32_t *                 buffer;
+    int                                x, y;
+    int                                width;
+    int                                height;
+    iter_flags_t               flags;
+
+    /* These function pointers are initialized by the implementation */
+    pixman_iter_get_scanline_t get_scanline;
+    pixman_iter_write_back_t   write_back;
+
+    /* These fields are scratch data that implementations can use */
+    uint8_t *                  bits;
+    int                                stride;
+};
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_image_init (pixman_image_t *image);
+
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t *     image,
+                         pixman_format_code_t format,
+                         int                  width,
+                         int                  height,
+                         uint32_t *           bits,
+                         int                  rowstride);
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image);
+
+pixman_image_t *
+_pixman_image_allocate (void);
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops);
+void
+_pixman_image_reset_clip_region (pixman_image_t *image);
+
+void
+_pixman_image_validate (pixman_image_t *image);
+
+#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul)        \
+    do                                                                 \
+    {                                                                  \
+       uint32_t *__bits__;                                             \
+       int       __stride__;                                           \
+                                                                       \
+       __bits__ = image->bits.bits;                                    \
+       __stride__ = image->bits.rowstride;                             \
+       (out_stride) =                                                  \
+           __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type); \
+       (line) =                                                        \
+           ((type *) __bits__) + (out_stride) * (y) + (mul) * (x);     \
+    } while (0)
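+/* A sketch of typical use, as in the MMX paths above:
+ *
+ *     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
+ *                            dst_stride, dst_line, 1);
+ *
+ * leaves dst_line pointing at pixel (dest_x, dest_y) and dst_stride
+ * expressed in uint16_t units.
+ */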
+
+/*
+ * Gradient walker
+ */
+typedef struct
+{
+    uint32_t                left_ag;
+    uint32_t                left_rb;
+    uint32_t                right_ag;
+    uint32_t                right_rb;
+    int32_t                 left_x;
+    int32_t                 right_x;
+    int32_t                 stepper;
+
+    pixman_gradient_stop_t *stops;
+    int                     num_stops;
+    unsigned int            spread;
+
+    int                     need_reset;
+} pixman_gradient_walker_t;
+
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+                              gradient_t *              gradient,
+                              unsigned int              spread);
+
+void
+_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      pos);
+
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+                               pixman_fixed_32_32_t      x);
+
+/*
+ * Edges
+ */
+
+#define MAX_ALPHA(n)    ((1 << (n)) - 1)
+#define N_Y_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) - 1)
+#define N_X_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) + 1)
+
+#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n))
+#define STEP_Y_BIG(n)   (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define Y_FRAC_FIRST(n) (STEP_Y_BIG (n) / 2)
+#define Y_FRAC_LAST(n)  (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n))
+#define STEP_X_BIG(n)   (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define X_FRAC_FIRST(n) (STEP_X_BIG (n) / 2)
+#define X_FRAC_LAST(n)  (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define RENDER_SAMPLES_X(x, n)                                         \
+    ((n) == 1? 0 : (pixman_fixed_frac (x) +                            \
+                   X_FRAC_FIRST (n)) / STEP_X_SMALL (n))
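A consistency check worth spelling out (our observation, not in the source): for n = 8 bits of alpha, N_Y_FRAC (8) = 2^4 - 1 = 15 and N_X_FRAC (8) = 2^4 + 1 = 17, so a fully covered pixel accumulates 15 · 17 = 255 = MAX_ALPHA (8) samples. In general (2^(n/2) - 1) · (2^(n/2) + 1) = 2^n - 1, which is exactly why the Y and X sample counts sit one below and one above 2^(n/2).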
+
+void
+pixman_rasterize_edges_accessors (pixman_image_t *image,
+                                  pixman_edge_t * l,
+                                  pixman_edge_t * r,
+                                  pixman_fixed_t  t,
+                                  pixman_fixed_t  b);
+
+/*
+ * Implementations
+ */
+typedef struct pixman_implementation_t pixman_implementation_t;
+
+typedef struct
+{
+    pixman_op_t              op;
+    pixman_image_t *         src_image;
+    pixman_image_t *         mask_image;
+    pixman_image_t *         dest_image;
+    int32_t                  src_x;
+    int32_t                  src_y;
+    int32_t                  mask_x;
+    int32_t                  mask_y;
+    int32_t                  dest_x;
+    int32_t                  dest_y;
+    int32_t                  width;
+    int32_t                  height;
+
+    uint32_t                 src_flags;
+    uint32_t                 mask_flags;
+    uint32_t                 dest_flags;
+} pixman_composite_info_t;
+
+#define PIXMAN_COMPOSITE_ARGS(info)                                    \
+    MAYBE_UNUSED pixman_op_t        op = info->op;                     \
+    MAYBE_UNUSED pixman_image_t *   src_image = info->src_image;       \
+    MAYBE_UNUSED pixman_image_t *   mask_image = info->mask_image;     \
+    MAYBE_UNUSED pixman_image_t *   dest_image = info->dest_image;     \
+    MAYBE_UNUSED int32_t            src_x = info->src_x;               \
+    MAYBE_UNUSED int32_t            src_y = info->src_y;               \
+    MAYBE_UNUSED int32_t            mask_x = info->mask_x;             \
+    MAYBE_UNUSED int32_t            mask_y = info->mask_y;             \
+    MAYBE_UNUSED int32_t            dest_x = info->dest_x;             \
+    MAYBE_UNUSED int32_t            dest_y = info->dest_y;             \
+    MAYBE_UNUSED int32_t            width = info->width;               \
+    MAYBE_UNUSED int32_t            height = info->height
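A sketch of the intended use (the routine below is illustrative, not a real fast path): PIXMAN_COMPOSITE_ARGS unpacks the info structure into locals at the top of a pixman_composite_func_t, after which the body reads like the older long-argument-list style:

/* Illustrative only: a trivial composite routine that clears the
 * destination rectangle; unused locals are covered by MAYBE_UNUSED. */
static void
example_composite_clear (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line;
    int       dst_stride, i;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y,
                           uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        for (i = 0; i < width; i++)
            dst_line[i] = 0;

        dst_line += dst_stride;
    }
}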
+
+typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp,
+                                         pixman_op_t              op,
+                                         uint32_t *               dest,
+                                         const uint32_t *         src,
+                                         const uint32_t *         mask,
+                                         int                      width);
+
+typedef void (*pixman_combine_64_func_t) (pixman_implementation_t *imp,
+                                         pixman_op_t              op,
+                                         uint64_t *               dest,
+                                         const uint64_t *         src,
+                                         const uint64_t *         mask,
+                                         int                      width);
+
+typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp,
+                                        pixman_composite_info_t *info);
+typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp,
+                                           uint32_t *               src_bits,
+                                           uint32_t *               dst_bits,
+                                           int                      src_stride,
+                                           int                      dst_stride,
+                                           int                      src_bpp,
+                                           int                      dst_bpp,
+                                           int                      src_x,
+                                           int                      src_y,
+                                           int                      dest_x,
+                                           int                      dest_y,
+                                           int                      width,
+                                           int                      height);
+typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
+                                            uint32_t *               bits,
+                                            int                      stride,
+                                            int                      bpp,
+                                            int                      x,
+                                            int                      y,
+                                            int                      width,
+                                            int                      height,
+                                            uint32_t                 xor);
+typedef void (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
+                                         pixman_iter_t           *iter);
+
+void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
+void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp);
+
+typedef struct
+{
+    pixman_op_t             op;
+    pixman_format_code_t    src_format;
+    uint32_t               src_flags;
+    pixman_format_code_t    mask_format;
+    uint32_t               mask_flags;
+    pixman_format_code_t    dest_format;
+    uint32_t               dest_flags;
+    pixman_composite_func_t func;
+} pixman_fast_path_t;
+
+struct pixman_implementation_t
+{
+    pixman_implementation_t *  toplevel;
+    pixman_implementation_t *  delegate;
+    const pixman_fast_path_t * fast_paths;
+
+    pixman_blt_func_t          blt;
+    pixman_fill_func_t         fill;
+    pixman_iter_init_func_t     src_iter_init;
+    pixman_iter_init_func_t     dest_iter_init;
+
+    pixman_combine_32_func_t   combine_32[PIXMAN_N_OPERATORS];
+    pixman_combine_32_func_t   combine_32_ca[PIXMAN_N_OPERATORS];
+    pixman_combine_64_func_t   combine_64[PIXMAN_N_OPERATORS];
+    pixman_combine_64_func_t   combine_64_ca[PIXMAN_N_OPERATORS];
+};
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+                        pixman_image_t *         image,
+                         pixman_format_code_t     format);
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *delegate,
+                              const pixman_fast_path_t *fast_paths);
+
+void
+_pixman_implementation_combine_32 (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint32_t *               dest,
+                                   const uint32_t *         src,
+                                   const uint32_t *         mask,
+                                   int                      width);
+void
+_pixman_implementation_combine_64 (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint64_t *               dest,
+                                   const uint64_t *         src,
+                                   const uint64_t *         mask,
+                                   int                      width);
+void
+_pixman_implementation_combine_32_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint32_t *               dest,
+                                      const uint32_t *         src,
+                                      const uint32_t *         mask,
+                                      int                      width);
+void
+_pixman_implementation_combine_64_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint64_t *               dest,
+                                      const uint64_t *         src,
+                                      const uint64_t *         mask,
+                                      int                      width);
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t *imp,
+                            uint32_t *               src_bits,
+                            uint32_t *               dst_bits,
+                            int                      src_stride,
+                            int                      dst_stride,
+                            int                      src_bpp,
+                            int                      dst_bpp,
+                            int                      src_x,
+                            int                      src_y,
+                            int                      dest_x,
+                            int                      dest_y,
+                            int                      width,
+                            int                      height);
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int                      stride,
+                             int                      bpp,
+                             int                      x,
+                             int                      y,
+                             int                      width,
+                             int                      height,
+                             uint32_t                 xor);
+
+void
+_pixman_implementation_src_iter_init (pixman_implementation_t       *imp,
+                                     pixman_iter_t                 *iter,
+                                     pixman_image_t                *image,
+                                     int                            x,
+                                     int                            y,
+                                     int                            width,
+                                     int                            height,
+                                     uint8_t                       *buffer,
+                                     iter_flags_t                   flags);
+
+void
+_pixman_implementation_dest_iter_init (pixman_implementation_t       *imp,
+                                      pixman_iter_t                 *iter,
+                                      pixman_image_t                *image,
+                                      int                            x,
+                                      int                            y,
+                                      int                            width,
+                                      int                            height,
+                                      uint8_t                       *buffer,
+                                      iter_flags_t                   flags);
+
+/* Specific implementations */
+pixman_implementation_t *
+_pixman_implementation_create_general (void);
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback);
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_SSE2
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_SIMD
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_VMX
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback);
+#endif
+
+pixman_implementation_t *
+_pixman_choose_implementation (void);
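The delegate pointer is what ties these constructors together. A sketch of how a chain might be assembled (the real decision logic lives in pixman-cpu.c and checks CPU features at runtime; this function is illustrative only):

/* Each implementation wraps a more general fallback: an operation the
 * SSE2 code cannot handle is delegated to the fast-path C code, which
 * in turn delegates to the general implementation that handles
 * everything. */
static pixman_implementation_t *
example_choose_implementation (void)
{
    pixman_implementation_t *imp;

    imp = _pixman_implementation_create_general ();
    imp = _pixman_implementation_create_fast_path (imp);
#ifdef USE_SSE2
    imp = _pixman_implementation_create_sse2 (imp);
#endif

    return _pixman_implementation_create_noop (imp);
}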
+
+
+
+/*
+ * Utilities
+ */
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
+
+/* These "formats" all have depth 0, so they
+ * will never clash with any real ones
+ */
+#define PIXMAN_null             PIXMAN_FORMAT (0, 0, 0, 0, 0, 0)
+#define PIXMAN_solid            PIXMAN_FORMAT (0, 1, 0, 0, 0, 0)
+#define PIXMAN_pixbuf           PIXMAN_FORMAT (0, 2, 0, 0, 0, 0)
+#define PIXMAN_rpixbuf          PIXMAN_FORMAT (0, 3, 0, 0, 0, 0)
+#define PIXMAN_unknown          PIXMAN_FORMAT (0, 4, 0, 0, 0, 0)
+#define PIXMAN_any              PIXMAN_FORMAT (0, 5, 0, 0, 0, 0)
+
+#define PIXMAN_OP_any           (PIXMAN_N_OPERATORS + 1)
+
+#define FAST_PATH_ID_TRANSFORM                 (1 <<  0)
+#define FAST_PATH_NO_ALPHA_MAP                 (1 <<  1)
+#define FAST_PATH_NO_CONVOLUTION_FILTER        (1 <<  2)
+#define FAST_PATH_NO_PAD_REPEAT                (1 <<  3)
+#define FAST_PATH_NO_REFLECT_REPEAT            (1 <<  4)
+#define FAST_PATH_NO_ACCESSORS                 (1 <<  5)
+#define FAST_PATH_NARROW_FORMAT                (1 <<  6)
+#define FAST_PATH_SAMPLES_OPAQUE               (1 <<  7)
+#define FAST_PATH_COMPONENT_ALPHA              (1 <<  8)
+#define FAST_PATH_UNIFIED_ALPHA                (1 <<  9)
+#define FAST_PATH_SCALE_TRANSFORM              (1 << 10)
+#define FAST_PATH_NEAREST_FILTER               (1 << 11)
+#define FAST_PATH_HAS_TRANSFORM                (1 << 12)
+#define FAST_PATH_IS_OPAQUE                    (1 << 13)
+#define FAST_PATH_NO_NORMAL_REPEAT             (1 << 14)
+#define FAST_PATH_NO_NONE_REPEAT               (1 << 15)
+#define FAST_PATH_X_UNIT_POSITIVE              (1 << 16)
+#define FAST_PATH_AFFINE_TRANSFORM             (1 << 17)
+#define FAST_PATH_Y_UNIT_ZERO                  (1 << 18)
+#define FAST_PATH_BILINEAR_FILTER              (1 << 19)
+#define FAST_PATH_ROTATE_90_TRANSFORM          (1 << 20)
+#define FAST_PATH_ROTATE_180_TRANSFORM         (1 << 21)
+#define FAST_PATH_ROTATE_270_TRANSFORM         (1 << 22)
+#define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST   (1 << 23)
+#define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR  (1 << 24)
+#define FAST_PATH_BITS_IMAGE                   (1 << 25)
+
+#define FAST_PATH_PAD_REPEAT                                           \
+    (FAST_PATH_NO_NONE_REPEAT          |                               \
+     FAST_PATH_NO_NORMAL_REPEAT                |                               \
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NORMAL_REPEAT                                                \
+    (FAST_PATH_NO_NONE_REPEAT          |                               \
+     FAST_PATH_NO_PAD_REPEAT           |                               \
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NONE_REPEAT                                          \
+    (FAST_PATH_NO_NORMAL_REPEAT                |                               \
+     FAST_PATH_NO_PAD_REPEAT           |                               \
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_REFLECT_REPEAT                                       \
+    (FAST_PATH_NO_NONE_REPEAT          |                               \
+     FAST_PATH_NO_NORMAL_REPEAT                |                               \
+     FAST_PATH_NO_PAD_REPEAT)
+
+#define FAST_PATH_STANDARD_FLAGS                                       \
+    (FAST_PATH_NO_CONVOLUTION_FILTER   |                               \
+     FAST_PATH_NO_ACCESSORS            |                               \
+     FAST_PATH_NO_ALPHA_MAP            |                               \
+     FAST_PATH_NARROW_FORMAT)
+
+#define FAST_PATH_STD_DEST_FLAGS                                       \
+    (FAST_PATH_NO_ACCESSORS            |                               \
+     FAST_PATH_NO_ALPHA_MAP            |                               \
+     FAST_PATH_NARROW_FORMAT)
+
+#define SOURCE_FLAGS(format)                                           \
+    (FAST_PATH_STANDARD_FLAGS |                                                \
+     ((PIXMAN_ ## format == PIXMAN_solid) ?                            \
+      0 : (FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | FAST_PATH_NEAREST_FILTER | FAST_PATH_ID_TRANSFORM)))
+
+#define MASK_FLAGS(format, extra)                                      \
+    ((PIXMAN_ ## format == PIXMAN_null) ? 0 : (SOURCE_FLAGS (format) | extra))
+
+#define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \
+    PIXMAN_OP_ ## op,                                                  \
+    PIXMAN_ ## src,                                                    \
+    src_flags,                                                         \
+    PIXMAN_ ## mask,                                                   \
+    mask_flags,                                                                \
+    PIXMAN_ ## dest,                                                   \
+    dest_flags,                                                                \
+    func
+
+#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)                        \
+    { FAST_PATH (                                                      \
+           op,                                                         \
+           src,  SOURCE_FLAGS (src),                                   \
+           mask, MASK_FLAGS (mask, FAST_PATH_UNIFIED_ALPHA),           \
+           dest, FAST_PATH_STD_DEST_FLAGS,                             \
+           func) }
+
+#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)             \
+    { FAST_PATH (                                                      \
+           op,                                                         \
+           src,  SOURCE_FLAGS (src),                                   \
+           mask, MASK_FLAGS (mask, FAST_PATH_COMPONENT_ALPHA),         \
+           dest, FAST_PATH_STD_DEST_FLAGS,                             \
+           func) }
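Together these macros let a backend declare its fast paths as a table. A hedged sketch of what an entry list looks like (the example_* functions are placeholders of type pixman_composite_func_t, not functions from this commit):

static void example_over_8888_8888      (pixman_implementation_t *imp,
                                         pixman_composite_info_t *info);
static void example_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                         pixman_composite_info_t *info);

static const pixman_fast_path_t example_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, example_over_8888_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, example_over_n_8888_8888_ca),
};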
+
+/* Memory allocation helpers */
+void *
+pixman_malloc_ab (unsigned int n, unsigned int b);
+
+void *
+pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b);
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b);
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b);
+
+/* Compositing utilities */
+void
+pixman_expand (uint64_t *           dst,
+               const uint32_t *     src,
+               pixman_format_code_t format,
+               int                  width);
+
+void
+pixman_contract (uint32_t *      dst,
+                 const uint64_t *src,
+                 int             width);
+
+pixman_bool_t
+_pixman_lookup_composite_function (pixman_implementation_t     *toplevel,
+                                  pixman_op_t                  op,
+                                  pixman_format_code_t         src_format,
+                                  uint32_t                     src_flags,
+                                  pixman_format_code_t         mask_format,
+                                  uint32_t                     mask_flags,
+                                  pixman_format_code_t         dest_format,
+                                  uint32_t                     dest_flags,
+                                  pixman_implementation_t    **out_imp,
+                                  pixman_composite_func_t     *out_func);
+
+/* Region Helpers */
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+                                    pixman_region16_t *src);
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+                                    pixman_region32_t *src);
+
+
+/* Misc macros */
+
+#ifndef FALSE
+#   define FALSE 0
+#endif
+
+#ifndef TRUE
+#   define TRUE 1
+#endif
+
+#ifndef MIN
+#  define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#ifndef MAX
+#  define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+/* Integer division that rounds towards -infinity */
+#define DIV(a, b)                                         \
+    ((((a) < 0) == ((b) < 0)) ? (a) / (b) :                \
+     ((a) - (b) + 1 - (((b) < 0) << 1)) / (b))
+
+/* Modulus that produces the remainder wrt. DIV */
+#define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b))
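A worked check of the rounding convention (ours): DIV (-7, 3) takes the second branch, giving (-7 - 3 + 1 - 0) / 3 = -9 / 3 = -3, and MOD (-7, 3) gives (3 - ((7 - 1) % 3)) - 1 = 2; indeed -3 · 3 + 2 = -7, and the remainder always lands in [0, b) for b > 0, whereas plain C truncating division would yield quotient -2 and remainder -1.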
+
+#define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v)))
+
+/* Conversion between 8888 and 0565 */
+
+#define CONVERT_8888_TO_0565(s)                                                \
+    ((((s) >> 3) & 0x001f) |                                           \
+     (((s) >> 5) & 0x07e0) |                                           \
+     (((s) >> 8) & 0xf800))
+
+#define CONVERT_0565_TO_0888(s)                                                \
+    (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |                      \
+     ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |                  \
+     ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)))
+
+#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000)
+
+/* Trivial versions that are useful in macros */
+#define CONVERT_8888_TO_8888(s) (s)
+#define CONVERT_x888_TO_8888(s) ((s) | 0xff000000)
+#define CONVERT_0565_TO_0565(s) (s)
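A worked example (ours): for s = 0x1E90FF (r = 0x1E, g = 0x90, b = 0xFF), CONVERT_8888_TO_0565 keeps the top 5/6/5 bits of each channel and packs them as 0x1800 | 0x0480 | 0x001F = 0x1C9F. CONVERT_0565_TO_0888 widens by replicating each channel's high bits into its low bits, so the extremes survive a round trip: 0xFFFFFF → 0xFFFF → 0xFFFFFF and 0x000000 → 0x0000 → 0x000000.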
+
+#define PIXMAN_FORMAT_IS_WIDE(f)                                       \
+    (PIXMAN_FORMAT_A (f) > 8 ||                                                \
+     PIXMAN_FORMAT_R (f) > 8 ||                                                \
+     PIXMAN_FORMAT_G (f) > 8 ||                                                \
+     PIXMAN_FORMAT_B (f) > 8)
+
+#ifdef WORDS_BIGENDIAN
+#   define SCREEN_SHIFT_LEFT(x,n)      ((x) << (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)     ((x) >> (n))
+#else
+#   define SCREEN_SHIFT_LEFT(x,n)      ((x) >> (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)     ((x) << (n))
+#endif
+
+static force_inline uint32_t
+unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
+{
+    uint32_t result;
+
+    if (from_bits == 0)
+       return 0;
+
+    /* Delete any extra bits */
+    val &= ((1 << from_bits) - 1);
+
+    if (from_bits >= to_bits)
+       return val >> (from_bits - to_bits);
+
+    /* Start out with the high bit of val in the high bit of result. */
+    result = val << (to_bits - from_bits);
+
+    /* Copy the bits in result, doubling the number of bits each time, until
+     * we fill all to_bits. Unrolled manually because from_bits and to_bits
+     * are usually known statically, so the compiler can turn all of this
+     * into a few shifts.
+     */
+#define REPLICATE()                                                    \
+    do                                                                 \
+    {                                                                  \
+       if (from_bits < to_bits)                                        \
+       {                                                               \
+           result |= result >> from_bits;                              \
+                                                                       \
+           from_bits *= 2;                                             \
+       }                                                               \
+    }                                                                  \
+    while (0)
+
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+
+    return result;
+}
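Worked by hand (our example): unorm_to_unorm (0x1F, 5, 8) shifts to get 0xF8, and the first REPLICATE () ors in 0xF8 >> 5 = 0x07, giving 0xFF, so a maximal 5-bit value maps to a maximal 8-bit value. Likewise 0x10 maps to 0x84 rather than a bare 0x80, which is what keeps the expansion linear over the whole range.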
+
+/*
+ * Various debugging code
+ */
+
+#undef DEBUG
+
+#define COMPILE_TIME_ASSERT(x)                                         \
+    do { typedef int compile_time_assertion [(x)?1:-1]; } while (0)
+
+/* Turn on debugging depending on what type of release this is
+ */
+#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
+
+/* Debugging gets turned on for development releases because these
+ * are the things that end up in bleeding edge distributions such
+ * as Rawhide etc.
+ *
+ * For performance reasons we don't turn it on for stable releases or
+ * random git checkouts. (Random git checkouts are often used for
+ * performance work).
+ */
+
+#    define DEBUG
+
+#endif
+
+#ifdef DEBUG
+
+void
+_pixman_log_error (const char *function, const char *message);
+
+#define return_if_fail(expr)                                            \
+    do                                                                  \
+    {                                                                   \
+       if (!(expr))                                                    \
+       {                                                               \
+           _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+           return;                                                     \
+       }                                                               \
+    }                                                                   \
+    while (0)
+
+#define return_val_if_fail(expr, retval)                                \
+    do                                                                  \
+    {                                                                   \
+       if (!(expr))                                                    \
+       {                                                               \
+           _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+           return (retval);                                            \
+       }                                                               \
+    }                                                                   \
+    while (0)
+
+#define critical_if_fail(expr)                                         \
+    do                                                                 \
+    {                                                                  \
+       if (!(expr))                                                    \
+           _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+    }                                                                  \
+    while (0)
+
+
+#else
+
+#define _pixman_log_error(f,m) do { } while (0)
+
+#define return_if_fail(expr)                                           \
+    do                                                                  \
+    {                                                                   \
+       if (!(expr))                                                    \
+           return;                                                     \
+    }                                                                   \
+    while (0)
+
+#define return_val_if_fail(expr, retval)                                \
+    do                                                                  \
+    {                                                                   \
+       if (!(expr))                                                    \
+           return (retval);                                            \
+    }                                                                   \
+    while (0)
+
+#define critical_if_fail(expr)                                         \
+    do                                                                 \
+    {                                                                  \
+    }                                                                  \
+    while (0)
+#endif
+
+/*
+ * Timers
+ */
+
+#ifdef PIXMAN_TIMERS
+
+static inline uint64_t
+oil_profile_stamp_rdtsc (void)
+{
+    uint64_t ts;
+
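+    /* Note: the "=A" constraint returns the time stamp in edx:eax,
+     * so this helper assumes a 32-bit x86 target.
+     */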
+    __asm__ __volatile__ ("rdtsc\n" : "=A" (ts));
+    return ts;
+}
+
+#define OIL_STAMP oil_profile_stamp_rdtsc
+
+typedef struct pixman_timer_t pixman_timer_t;
+
+struct pixman_timer_t
+{
+    int             initialized;
+    const char *    name;
+    uint64_t        n_times;
+    uint64_t        total;
+    pixman_timer_t *next;
+};
+
+extern int timer_defined;
+
+void pixman_timer_register (pixman_timer_t *timer);
+
+#define TIMER_BEGIN(tname)                                              \
+    {                                                                   \
+       static pixman_timer_t timer ## tname;                           \
+       uint64_t              begin ## tname;                           \
+                                                                       \
+       if (!timer ## tname.initialized)                                \
+       {                                                               \
+           timer ## tname.initialized = 1;                             \
+           timer ## tname.name = # tname;                              \
+           pixman_timer_register (&timer ## tname);                    \
+       }                                                               \
+                                                                       \
+       timer ## tname.n_times++;                                       \
+       begin ## tname = OIL_STAMP ();
+
+#define TIMER_END(tname)                                                \
+    timer ## tname.total += OIL_STAMP () - begin ## tname;             \
+    }
+
+#endif /* PIXMAN_TIMERS */
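A usage sketch (the timer name is made up for illustration, and PIXMAN_TIMERS must be defined): TIMER_BEGIN opens a block that TIMER_END closes, and both must use the same tname token because it is pasted into the generated variable names:

static void
example_timed_fill (uint32_t *bits, int n)
{
    int i;

    TIMER_BEGIN (example_fill);

    for (i = 0; i < n; i++)
        bits[i] = 0xff000000;

    TIMER_END (example_fill);
}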
+
+#endif /* PIXMAN_PRIVATE_H */
diff --git a/pixman/pixman-radial-gradient.c b/pixman/pixman-radial-gradient.c
new file mode 100644 (file)
index 0000000..b6dd6b2
--- /dev/null
@@ -0,0 +1,470 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static inline pixman_fixed_32_32_t
+dot (pixman_fixed_48_16_t x1,
+     pixman_fixed_48_16_t y1,
+     pixman_fixed_48_16_t z1,
+     pixman_fixed_48_16_t x2,
+     pixman_fixed_48_16_t y2,
+     pixman_fixed_48_16_t z2)
+{
+    /*
+     * Exact computation, assuming that the input values can
+     * be represented as pixman_fixed_16_16_t
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static inline double
+fdot (double x1,
+      double y1,
+      double z1,
+      double x2,
+      double y2,
+      double z2)
+{
+    /*
+     * The error can be unbounded in some special cases.
+     * Using clever dot product algorithms (for example a compensated
+     * dot product) would improve this, but would make the code much
+     * less obvious.
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static uint32_t
+radial_compute_color (double                    a,
+                     double                    b,
+                     double                    c,
+                     double                    inva,
+                     double                    dr,
+                     double                    mindr,
+                     pixman_gradient_walker_t *walker,
+                     pixman_repeat_t           repeat)
+{
+    /*
+     * In this function error propagation can lead to bad results:
+     *  - discr can have an unbounded error (if b*b - a*c is very small),
+     *    potentially making it the opposite sign of what it should have
+     *    been (thus clearing a pixel that would have been colored, or
+     *    vice versa) or propagating the error to sqrtdiscr;
+     *    if discr has the wrong sign or b is very small, this can lead
+     *    to bad results.
+     *
+     *  - the algorithm used to compute the solutions of the quadratic
+     *    equation is not numerically stable (but it saves one division
+     *    compared to the numerically stable one);
+     *    this can be a problem if a*c is much smaller than b*b.
+     *
+     *  - the above problems are worse if a is small (as inva becomes
+     *    bigger).
+     */
+    double discr;
+
+    if (a == 0)
+    {
+       double t;
+
+       if (b == 0)
+           return 0;
+
+       t = pixman_fixed_1 / 2 * c / b;
+       if (repeat == PIXMAN_REPEAT_NONE)
+       {
+           if (0 <= t && t <= pixman_fixed_1)
+               return _pixman_gradient_walker_pixel (walker, t);
+       }
+       else
+       {
+           if (t * dr > mindr)
+               return _pixman_gradient_walker_pixel (walker, t);
+       }
+
+       return 0;
+    }
+
+    discr = fdot (b, a, 0, b, -c, 0);
+    if (discr >= 0)
+    {
+       double sqrtdiscr, t0, t1;
+
+       sqrtdiscr = sqrt (discr);
+       t0 = (b + sqrtdiscr) * inva;
+       t1 = (b - sqrtdiscr) * inva;
+
+       /*
+        * The root that must be used is the biggest one that belongs
+        * to the valid range ([0,1] for PIXMAN_REPEAT_NONE, any
+        * solution that results in a positive radius otherwise).
+        *
+        * If a > 0, t0 is the biggest solution, so if it is valid, it
+        * is the correct result.
+        *
+        * If a < 0, only one of the solutions can be valid, so the
+        * order in which they are tested is not important.
+        */
+       if (repeat == PIXMAN_REPEAT_NONE)
+       {
+           if (0 <= t0 && t0 <= pixman_fixed_1)
+               return _pixman_gradient_walker_pixel (walker, t0);
+           else if (0 <= t1 && t1 <= pixman_fixed_1)
+               return _pixman_gradient_walker_pixel (walker, t1);
+       }
+       else
+       {
+           if (t0 * dr > mindr)
+               return _pixman_gradient_walker_pixel (walker, t0);
+           else if (t1 * dr > mindr)
+               return _pixman_gradient_walker_pixel (walker, t1);
+       }
+    }
+
+    return 0;
+}
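For reference, a sketch of the numerically stable variant the comment above alludes to (not what this file uses, since it costs an extra division): compute the root in which b and the square root reinforce each other, then recover the other from the product of roots c/a:

/* Sketch only: stable solver for a·t² - 2·b·t + c = 0, whose roots are
 * (b ± √(b² - a·c)) / a.  q = b + copysign (√(b² - a·c), b) avoids
 * cancellation between b and the square root; the second root follows
 * from t0·t1 = c/a, i.e. t1 = c/q. */
static void
example_stable_roots (double a, double b, double c, double *t0, double *t1)
{
    double sqrtdiscr = sqrt (b * b - a * c);
    double q = b + copysign (sqrtdiscr, b);

    *t0 = q / a;
    *t1 = c / q;
}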
+
+static uint32_t *
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /*
+     * Implementation of radial gradients following the PDF specification.
+     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
+     * Manual (PDF 32000-1:2008 at the time of this writing).
+     *
+     * In the radial gradient problem we are given two circles (c₁,r₁) and
+     * (c₂,r₂) that define the gradient itself.
+     *
+     * Mathematically the gradient can be defined as the family of circles
+     *
+     *     ((1-t)·c₁ + t·c₂, (1-t)·r₁ + t·r₂)
+     *
+     * excluding those circles whose radius would be < 0. When a point
+     * belongs to more than one circle, the one with a bigger t is the only
+     * one that contributes to its color. When a point does not belong
+     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
+     * Further limitations on the range of values for t are imposed when
+     * the gradient is not repeated, namely t must belong to [0,1].
+     *
+     * The graphical result is the same as drawing the valid (radius > 0)
+     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
+     * is not repeated) using SOURCE operator composition.
+     *
+     * It looks like a cone pointing towards the viewer if the ending circle
+     * is smaller than the starting one, a cone pointing into the page if
+     * the starting circle is the smaller one, and like a cylinder if they
+     * have the same radius.
+     *
+     * What we actually do is, given the point whose color we are interested
+     * in, compute the t values for that point, solving for t in:
+     *
+     *     length((1-t)·c₁ + t·c₂ - p) = (1-t)·r₁ + t·r₂
+     *
+     * Let's rewrite it in a simpler way, by defining some auxiliary
+     * variables:
+     *
+     *     cd = c₂ - c₁
+     *     pd = p - c₁
+     *     dr = r₂ - r₁
+     *     length(t·cd - pd) = r₁ + t·dr
+     *
+     * which actually means
+     *
+     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
+     *
+     * or
+     *
+     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
+     *
+     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
+     *
+     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
+     *
+     * where we can actually expand the squares and solve for t:
+     *
+     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
+     *       = r₁² + 2·r₁·t·dr + t²·dr²
+     *
+     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
+     *         (pdx² + pdy² - r₁²) = 0
+     *
+     *     A = cdx² + cdy² - dr²
+     *     B = pdx·cdx + pdy·cdy + r₁·dr
+     *     C = pdx² + pdy² - r₁²
+     *     At² - 2Bt + C = 0
+     *
+     * The solutions (unless the equation degenerates because of A = 0) are:
+     *
+     *     t = (B ± ⎷(B² - A·C)) / A
+     *
+     * The solution we prefer is the bigger one, unless the radius
+     * associated with it is negative (or it falls outside the valid t
+     * range).
+     *
+     * Additional observations (useful for optimizations):
+     * A does not depend on p.
+     *
+     * A < 0 <=> one of the two circles completely contains the other one
+     *   <=> for every p, the radii associated with the two t solutions
+     *       have opposite signs.
+     */
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    radial_gradient_t *radial = (radial_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_vector_t v, unit;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+       if (!pixman_transform_point_3d (image->common.transform, &v))
+           return iter->buffer;
+
+       unit.vector[0] = image->common.transform->matrix[0][0];
+       unit.vector[1] = image->common.transform->matrix[1][0];
+       unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+       unit.vector[0] = pixman_fixed_1;
+       unit.vector[1] = 0;
+       unit.vector[2] = 0;
+    }
+
+    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
+    {
+       /*
+        * Given:
+        *
+        * t = (B ± ⎷(B² - A·C)) / A
+        *
+        * where
+        *
+        * A = cdx² + cdy² - dr²
+        * B = pdx·cdx + pdy·cdy + r₁·dr
+        * C = pdx² + pdy² - r₁²
+        * det = B² - A·C
+        *
+        * Since we have an affine transformation, we know that (pdx, pdy)
+        * increase linearly with each pixel,
+        *
+        * pdx = pdx₀ + n·ux,
+        * pdy = pdy₀ + n·uy,
+        *
+        * we can then update B, C and det incrementally across the
+        * scanline through repeated forward differencing.
+        */
+       pixman_fixed_32_32_t b, db, c, dc, ddc;
+
+       /* warning: this computation may overflow */
+       v.vector[0] -= radial->c1.x;
+       v.vector[1] -= radial->c1.y;
+
+       /*
+        * B and C are computed and updated exactly.
+        * If fdot was used instead of dot, in the worst case it would
+        * lose 11 bits of precision in each of the multiplications, and
+        * summing up would zero out all the bits that were preserved,
+        * thus making the result 0 instead of the correct one.
+        * This would mean a worst case of unbounded relative error, or
+        * an absolute error of about 2^10.
+        */
+       b = dot (v.vector[0], v.vector[1], radial->c1.radius,
+                radial->delta.x, radial->delta.y, radial->delta.radius);
+       db = dot (unit.vector[0], unit.vector[1], 0,
+                 radial->delta.x, radial->delta.y, 0);
+
+       c = dot (v.vector[0], v.vector[1],
+                -((pixman_fixed_48_16_t) radial->c1.radius),
+                v.vector[0], v.vector[1], radial->c1.radius);
+       dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
+                 2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
+                 0,
+                 unit.vector[0], unit.vector[1], 0);
+       ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
+                      unit.vector[0], unit.vector[1], 0);
+
+       while (buffer < end)
+       {
+           if (!mask || *mask++)
+           {
+               *buffer = radial_compute_color (radial->a, b, c,
+                                               radial->inva,
+                                               radial->delta.radius,
+                                               radial->mindr,
+                                               &walker,
+                                               image->common.repeat);
+           }
+
+           b += db;
+           c += dc;
+           dc += ddc;
+           ++buffer;
+       }
+    }
+    else
+    {
+       /* projective */
+       /* Warning:
+        * error propagation guarantees are much looser than in the affine case
+        */
+       while (buffer < end)
+       {
+           if (!mask || *mask++)
+           {
+               if (v.vector[2] != 0)
+               {
+                   double pdx, pdy, invv2, b, c;
+
+                   invv2 = 1. * pixman_fixed_1 / v.vector[2];
+
+                   pdx = v.vector[0] * invv2 - radial->c1.x;
+                   /*    / pixman_fixed_1 */
+
+                   pdy = v.vector[1] * invv2 - radial->c1.y;
+                   /*    / pixman_fixed_1 */
+
+                   b = fdot (pdx, pdy, radial->c1.radius,
+                             radial->delta.x, radial->delta.y,
+                             radial->delta.radius);
+                   /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+                   c = fdot (pdx, pdy, -radial->c1.radius,
+                             pdx, pdy, radial->c1.radius);
+                   /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+                   *buffer = radial_compute_color (radial->a, b, c,
+                                                   radial->inva,
+                                                   radial->delta.radius,
+                                                   radial->mindr,
+                                                   &walker,
+                                                   image->common.repeat);
+               }
+               else
+               {
+                   *buffer = 0;
+               }
+           }
+
+           ++buffer;
+
+           v.vector[0] += unit.vector[0];
+           v.vector[1] += unit.vector[1];
+           v.vector[2] += unit.vector[2];
+       }
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
+
+    pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->flags & ITER_NARROW)
+       iter->get_scanline = radial_get_scanline_narrow;
+    else
+       iter->get_scanline = radial_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_radial_gradient (pixman_point_fixed_t *        inner,
+                                     pixman_point_fixed_t *        outer,
+                                     pixman_fixed_t                inner_radius,
+                                     pixman_fixed_t                outer_radius,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    pixman_image_t *image;
+    radial_gradient_t *radial;
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+       return NULL;
+
+    radial = &image->radial;
+
+    if (!_pixman_init_gradient (&radial->common, stops, n_stops))
+    {
+       free (image);
+       return NULL;
+    }
+
+    image->type = RADIAL;
+
+    radial->c1.x = inner->x;
+    radial->c1.y = inner->y;
+    radial->c1.radius = inner_radius;
+    radial->c2.x = outer->x;
+    radial->c2.y = outer->y;
+    radial->c2.radius = outer_radius;
+
+    /* warning: these computations may overflow */
+    radial->delta.x = radial->c2.x - radial->c1.x;
+    radial->delta.y = radial->c2.y - radial->c1.y;
+    radial->delta.radius = radial->c2.radius - radial->c1.radius;
+
+    /* computed exactly, then cast to double -> every bit of the double
+       representation is correct (53 bits) */
+    radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
+                    radial->delta.x, radial->delta.y, radial->delta.radius);
+    if (radial->a != 0)
+       radial->inva = 1. * pixman_fixed_1 / radial->a;
+
+    radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
+
+    return image;
+}
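A usage sketch of this entry point (values arbitrary; pixman_color_t channels are 16 bits wide): a two-stop gradient from opaque red at the inner circle to transparent black at the outer one, both circles sharing a center:

static pixman_image_t *
example_radial (void)
{
    static const pixman_gradient_stop_t stops[] =
    {
        { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } },
        { pixman_int_to_fixed (1), { 0x0000, 0x0000, 0x0000, 0x0000 } }
    };
    pixman_point_fixed_t center = { pixman_int_to_fixed (50),
                                    pixman_int_to_fixed (50) };

    return pixman_image_create_radial_gradient (
        &center, &center,
        pixman_int_to_fixed (0), pixman_int_to_fixed (40),
        stops, 2);
}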
diff --git a/pixman/pixman-region.c b/pixman/pixman-region.c
new file mode 100644 (file)
index 0000000..47beb52
--- /dev/null
@@ -0,0 +1,2810 @@
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * 
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation.
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ * Except as contained in this notice, the name of The Open Group shall not be
+ * used in advertising or otherwise to promote the sale, use or other dealings
+ * in this Software without prior written authorization from The Open Group.
+ * 
+ * Copyright 1987, 1988, 1989 by
+ * Digital Equipment Corporation, Maynard, Massachusetts.
+ * 
+ *                    All Rights Reserved
+ * 
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose and without fee is hereby granted,
+ * provided that the above copyright notice appear in all copies and that
+ * both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of Digital not be
+ * used in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission.
+ * 
+ * DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+ * DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Copyright © 1998 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#define PIXREGION_NIL(reg) ((reg)->data && !(reg)->data->numRects)
+/* not a region */
+#define PIXREGION_NAR(reg)      ((reg)->data == pixman_broken_data)
+#define PIXREGION_NUMRECTS(reg) ((reg)->data ? (reg)->data->numRects : 1)
+#define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0)
+#define PIXREGION_RECTS(reg) \
+    ((reg)->data ? (box_type_t *)((reg)->data + 1) \
+     : &(reg)->extents)
+#define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1))
+#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i])
+#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects)
+#define PIXREGION_END(reg) PIXREGION_BOX (reg, (reg)->data->numRects - 1)
+
+#define GOOD_RECT(rect) ((rect)->x1 < (rect)->x2 && (rect)->y1 < (rect)->y2)
+#define BAD_RECT(rect) ((rect)->x1 > (rect)->x2 || (rect)->y1 > (rect)->y2)
+
+#ifdef DEBUG
+
+#define GOOD(reg)                                                      \
+    do                                                                 \
+    {                                                                  \
+       if (!PREFIX (_selfcheck (reg)))                                 \
+           _pixman_log_error (FUNC, "Malformed region " # reg);        \
+    } while (0)
+
+#else
+
+#define GOOD(reg)
+
+#endif
+
+static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 };
+static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 };
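+/* _broken_data_ must remain a distinct object from _empty_data_;
+ * the volatile below is presumably a workaround for compilers
+ * (llvm-gcc) that would otherwise merge the two identical constants,
+ * which would make PIXREGION_NAR () true for empty regions.
+ */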
+#if defined (__llvm__) && !defined (__clang__)
+static const volatile region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#else
+static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#endif
+
+static box_type_t *pixman_region_empty_box =
+    (box_type_t *)&PREFIX (_empty_box_);
+static region_data_type_t *pixman_region_empty_data =
+    (region_data_type_t *)&PREFIX (_empty_data_);
+static region_data_type_t *pixman_broken_data =
+    (region_data_type_t *)&PREFIX (_broken_data_);
+
+static pixman_bool_t
+pixman_break (region_type_t *region);
+
+/*
+ * The functions in this file implement the Region abstraction used extensively
+ * throughout the X11 sample server. A Region is simply a set of disjoint
+ * (non-overlapping) rectangles, plus an "extent" rectangle which is the
+ * smallest single rectangle that contains all the non-overlapping rectangles.
+ *
+ * A Region is implemented as a "y-x-banded" array of rectangles.  This array
+ * imposes two degrees of order.  First, all rectangles are sorted by top side
+ * y coordinate first (y1), and then by left side x coordinate (x1).
+ *
+ * Furthermore, the rectangles are grouped into "bands".  Each rectangle in a
+ * band has the same top y coordinate (y1), and each has the same bottom y
+ * coordinate (y2).  Thus all rectangles in a band differ only in their left
+ * and right side (x1 and x2).  Bands are implicit in the array of rectangles:
+ * there is no separate list of band start pointers.
+ *
+ * The y-x band representation does not minimize rectangles.  In particular,
+ * if a rectangle vertically crosses a band (the rectangle has scanlines in
+ * the y1 to y2 area spanned by the band), then the rectangle may be broken
+ * down into two or more smaller rectangles stacked one atop the other.
+ *
+ *  -----------                             -----------
+ *  |         |                             |         |             band 0
+ *  |         |  --------                   -----------  --------
+ *  |         |  |      |  in y-x banded    |         |  |      |   band 1
+ *  |         |  |      |  form is          |         |  |      |
+ *  -----------  |      |                   -----------  --------
+ *               |      |                                |      |   band 2
+ *               --------                                --------
+ *
+ * An added constraint on the rectangles is that they must cover as much
+ * horizontal area as possible: no two rectangles within a band are allowed
+ * to touch.
+ *
+ * Whenever possible, bands will be merged together to cover a greater vertical
+ * distance (and thus reduce the number of rectangles). Two bands can be merged
+ * only if the bottom of one touches the top of the other and they have
+ * rectangles in the same places (of the same width, of course).
+ *
+ * Adam de Boor wrote most of the original region code.  Joel McCormack
+ * substantially modified or rewrote most of the core arithmetic routines, and
+ * added pixman_region_validate in order to support several speed improvements
+ * to pixman_region_validate_tree.  Bob Scheifler changed the representation
+ * to be more compact when empty or a single rectangle, and did a bunch of
+ * gratuitous reformatting. Carl Worth did further gratuitous reformatting
+ * while re-merging the server and client region code into libpixregion.
+ * Soren Sandmann did even more gratuitous reformatting.
+ */
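A concrete illustration using the public region API (our example): unioning two overlapping squares yields exactly the banded decomposition described above:

static void
example_region_bands (void)
{
    pixman_region32_t a, b;

    pixman_region32_init_rect (&a, 0, 0, 10, 10);   /* [0,10) x [0,10) */
    pixman_region32_init_rect (&b, 5, 5, 10, 10);   /* [5,15) x [5,15) */
    pixman_region32_union (&a, &a, &b);

    /* The union now holds three boxes, one per band:
     *   band y [0,5):   x [0,10)
     *   band y [5,10):  x [0,15)   (the squares merge within the band)
     *   band y [10,15): x [5,15)
     */
    pixman_region32_fini (&a);
    pixman_region32_fini (&b);
}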
+
+/*  true iff two Boxes overlap */
+#define EXTENTCHECK(r1, r2)       \
+    (!( ((r1)->x2 <= (r2)->x1)  || \
+        ((r1)->x1 >= (r2)->x2)  || \
+        ((r1)->y2 <= (r2)->y1)  || \
+        ((r1)->y1 >= (r2)->y2) ) )
+
+/* true iff (x,y) is in Box */
+#define INBOX(r, x, y) \
+    ( ((r)->x2 >  x) && \
+      ((r)->x1 <= x) && \
+      ((r)->y2 >  y) && \
+      ((r)->y1 <= y) )
+
+/* true iff Box r1 contains Box r2 */
+#define SUBSUMES(r1, r2)       \
+    ( ((r1)->x1 <= (r2)->x1) && \
+      ((r1)->x2 >= (r2)->x2) && \
+      ((r1)->y1 <= (r2)->y1) && \
+      ((r1)->y2 >= (r2)->y2) )
+
+static size_t
+PIXREGION_SZOF (size_t n)
+{
+    size_t size = n * sizeof(box_type_t);
+    
+    if (n > UINT32_MAX / sizeof(box_type_t))
+       return 0;
+
+    if (sizeof(region_data_type_t) > UINT32_MAX - size)
+       return 0;
+
+    return size + sizeof(region_data_type_t);
+}
+
+static void *
+alloc_data (size_t n)
+{
+    size_t sz = PIXREGION_SZOF (n);
+
+    if (!sz)
+       return NULL;
+
+    return malloc (sz);
+}
+
+#define FREE_DATA(reg) if ((reg)->data && (reg)->data->size) free ((reg)->data)
+
+#define RECTALLOC_BAIL(region, n, bail)                                        \
+    do                                                                 \
+    {                                                                  \
+       if (!(region)->data ||                                          \
+           (((region)->data->numRects + (n)) > (region)->data->size))  \
+       {                                                               \
+           if (!pixman_rect_alloc (region, n))                         \
+               goto bail;                                              \
+       }                                                               \
+    } while (0)
+
+#define RECTALLOC(region, n)                                           \
+    do                                                                 \
+    {                                                                  \
+       if (!(region)->data ||                                          \
+           (((region)->data->numRects + (n)) > (region)->data->size))  \
+       {                                                               \
+           if (!pixman_rect_alloc (region, n)) {                       \
+               return FALSE;                                           \
+           }                                                           \
+       }                                                               \
+    } while (0)
+
+#define ADDRECT(next_rect, nx1, ny1, nx2, ny2)      \
+    do                                             \
+    {                                              \
+       next_rect->x1 = nx1;                        \
+       next_rect->y1 = ny1;                        \
+       next_rect->x2 = nx2;                        \
+       next_rect->y2 = ny2;                        \
+       next_rect++;                                \
+    }                                              \
+    while (0)
+
+#define NEWRECT(region, next_rect, nx1, ny1, nx2, ny2)                 \
+    do                                                                 \
+    {                                                                  \
+       if (!(region)->data ||                                          \
+           ((region)->data->numRects == (region)->data->size))         \
+       {                                                               \
+           if (!pixman_rect_alloc (region, 1))                         \
+               return FALSE;                                           \
+           next_rect = PIXREGION_TOP (region);                         \
+       }                                                               \
+       ADDRECT (next_rect, nx1, ny1, nx2, ny2);                        \
+       region->data->numRects++;                                       \
+       critical_if_fail (region->data->numRects <= region->data->size);                \
+    } while (0)
+
+#define DOWNSIZE(reg, numRects)                                                \
+    do                                                                 \
+    {                                                                  \
+       if (((numRects) < ((reg)->data->size >> 1)) &&                  \
+           ((reg)->data->size > 50))                                   \
+       {                                                               \
+           region_data_type_t * new_data;                              \
+           size_t data_size = PIXREGION_SZOF (numRects);               \
+                                                                       \
+           if (!data_size)                                             \
+           {                                                           \
+               new_data = NULL;                                        \
+           }                                                           \
+           else                                                        \
+           {                                                           \
+               new_data = (region_data_type_t *)                       \
+                   realloc ((reg)->data, data_size);                   \
+           }                                                           \
+                                                                       \
+           if (new_data)                                               \
+           {                                                           \
+               new_data->size = (numRects);                            \
+               (reg)->data = new_data;                                 \
+           }                                                           \
+       }                                                               \
+    } while (0)
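+
+/* Example: a region whose array grew to size 100 during an operation but
+ * finished with only 20 rectangles satisfies both conditions above
+ * (20 < 100 / 2 and 100 > 50), so DOWNSIZE reallocs the array down to 20
+ * entries.  A failed realloc is harmless here; the oversized data simply
+ * stays in place.
+ */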
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2)
+{
+    int i;
+    box_type_t *rects1;
+    box_type_t *rects2;
+
+    if (reg1->extents.x1 != reg2->extents.x1)
+       return FALSE;
+    
+    if (reg1->extents.x2 != reg2->extents.x2)
+       return FALSE;
+    
+    if (reg1->extents.y1 != reg2->extents.y1)
+       return FALSE;
+    
+    if (reg1->extents.y2 != reg2->extents.y2)
+       return FALSE;
+    
+    if (PIXREGION_NUMRECTS (reg1) != PIXREGION_NUMRECTS (reg2))
+       return FALSE;
+
+    rects1 = PIXREGION_RECTS (reg1);
+    rects2 = PIXREGION_RECTS (reg2);
+    
+    for (i = 0; i != PIXREGION_NUMRECTS (reg1); i++)
+    {
+       if (rects1[i].x1 != rects2[i].x1)
+           return FALSE;
+       
+       if (rects1[i].x2 != rects2[i].x2)
+           return FALSE;
+       
+       if (rects1[i].y1 != rects2[i].y1)
+           return FALSE;
+       
+       if (rects1[i].y2 != rects2[i].y2)
+           return FALSE;
+    }
+
+    return TRUE;
+}
+
+int
+PREFIX (_print) (region_type_t *rgn)
+{
+    int num, size;
+    int i;
+    box_type_t * rects;
+
+    num = PIXREGION_NUMRECTS (rgn);
+    size = PIXREGION_SIZE (rgn);
+    rects = PIXREGION_RECTS (rgn);
+
+    fprintf (stderr, "num: %d size: %d\n", num, size);
+    fprintf (stderr, "extents: %d %d %d %d\n",
+             rgn->extents.x1,
+             rgn->extents.y1,
+             rgn->extents.x2,
+             rgn->extents.y2);
+    
+    for (i = 0; i < num; i++)
+    {
+       fprintf (stderr, "%d %d %d %d \n",
+                rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2);
+    }
+    
+    fprintf (stderr, "\n");
+
+    return num;
+}
+
+
+PIXMAN_EXPORT void
+PREFIX (_init) (region_type_t *region)
+{
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_region_empty_data;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_rect) (region_type_t *   region,
+                     int               x,
+                    int                y,
+                    unsigned int       width,
+                    unsigned int       height)
+{
+    region->extents.x1 = x;
+    region->extents.y1 = y;
+    region->extents.x2 = x + width;
+    region->extents.y2 = y + height;
+
+    if (!GOOD_RECT (&region->extents))
+    {
+        if (BAD_RECT (&region->extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+        PREFIX (_init) (region);
+        return;
+    }
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
+{
+    if (!GOOD_RECT (extents))
+    {
+        if (BAD_RECT (extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+        PREFIX (_init) (region);
+        return;
+    }
+    region->extents = *extents;
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_fini) (region_type_t *region)
+{
+    GOOD (region);
+    FREE_DATA (region);
+}
+
+PIXMAN_EXPORT int
+PREFIX (_n_rects) (region_type_t *region)
+{
+    return PIXREGION_NUMRECTS (region);
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_rectangles) (region_type_t *region,
+                      int               *n_rects)
+{
+    if (n_rects)
+       *n_rects = PIXREGION_NUMRECTS (region);
+
+    return PIXREGION_RECTS (region);
+}
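+
+/* Illustrative caller-side sketch (assuming the pixman_region32_*
+ * instantiation of these PREFIX macros; render_box () is hypothetical):
+ *
+ *     int i, n;
+ *     pixman_box32_t *boxes = pixman_region32_rectangles (&region, &n);
+ *
+ *     for (i = 0; i < n; i++)
+ *         render_box (boxes[i].x1, boxes[i].y1, boxes[i].x2, boxes[i].y2);
+ */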
+
+static pixman_bool_t
+pixman_break (region_type_t *region)
+{
+    FREE_DATA (region);
+
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_broken_data;
+
+    return FALSE;
+}
+
+static pixman_bool_t
+pixman_rect_alloc (region_type_t * region,
+                   int             n)
+{
+    region_data_type_t *data;
+
+    if (!region->data)
+    {
+       n++;
+       region->data = alloc_data (n);
+
+       if (!region->data)
+           return pixman_break (region);
+
+       region->data->numRects = 1;
+       *PIXREGION_BOXPTR (region) = region->extents;
+    }
+    else if (!region->data->size)
+    {
+       region->data = alloc_data (n);
+
+       if (!region->data)
+           return pixman_break (region);
+
+       region->data->numRects = 0;
+    }
+    else
+    {
+       size_t data_size;
+
+       if (n == 1)
+       {
+           n = region->data->numRects;
+           if (n > 500) /* XXX pick numbers out of a hat */
+               n = 250;
+       }
+
+       n += region->data->numRects;
+       data_size = PIXREGION_SZOF (n);
+
+       if (!data_size)
+       {
+           data = NULL;
+       }
+       else
+       {
+           data = (region_data_type_t *)
+               realloc (region->data, data_size);
+       }
+       
+       if (!data)
+           return pixman_break (region);
+       
+       region->data = data;
+    }
+    
+    region->data->size = n;
+
+    return TRUE;
+}
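+
+/* Growth policy of pixman_rect_alloc in the single-box case (n == 1): the
+ * array grows from numRects to 2 * numRects while numRects <= 500, and by
+ * a flat 250 entries beyond that, amortizing realloc cost for both small
+ * and large regions.
+ */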
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_copy) (region_type_t *dst, region_type_t *src)
+{
+    GOOD (dst);
+    GOOD (src);
+
+    if (dst == src)
+       return TRUE;
+    
+    dst->extents = src->extents;
+
+    if (!src->data || !src->data->size)
+    {
+       FREE_DATA (dst);
+       dst->data = src->data;
+       return TRUE;
+    }
+    
+    if (!dst->data || (dst->data->size < src->data->numRects))
+    {
+       FREE_DATA (dst);
+
+       dst->data = alloc_data (src->data->numRects);
+
+       if (!dst->data)
+           return pixman_break (dst);
+
+       dst->data->size = src->data->numRects;
+    }
+
+    dst->data->numRects = src->data->numRects;
+
+    memmove ((char *)PIXREGION_BOXPTR (dst), (char *)PIXREGION_BOXPTR (src),
+             dst->data->numRects * sizeof(box_type_t));
+
+    return TRUE;
+}
+
+/*======================================================================
+ *         Generic Region Operator
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_coalesce --
+ *     Attempt to merge the boxes in the current band with those in the
+ *     previous one.  We are guaranteed that the current band extends to
+ *      the end of the rects array.  Used only by pixman_op.
+ *
+ * Results:
+ *     The new index for the previous band.
+ *
+ * Side Effects:
+ *     If coalescing takes place:
+ *         - rectangles in the previous band will have their y2 fields
+ *           altered.
+ *         - region->data->numRects will be decreased.
+ *
+ *-----------------------------------------------------------------------
+ */
+static inline int
+pixman_coalesce (region_type_t * region,      /* Region to coalesce             */
+                int             prev_start,  /* Index of start of previous band */
+                int             cur_start)   /* Index of start of current band  */
+{
+    box_type_t *prev_box;       /* Current box in previous band             */
+    box_type_t *cur_box;        /* Current box in current band       */
+    int numRects;               /* Number of rectangles in both bands */
+    int y2;                     /* Bottom of current band           */
+
+    /*
+     * Figure out how many rectangles are in the band.
+     */
+    numRects = cur_start - prev_start;
+    critical_if_fail (numRects == region->data->numRects - cur_start);
+
+    if (!numRects) return cur_start;
+
+    /*
+     * The bands may only be coalesced if the bottom of the previous
+     * matches the top scanline of the current.
+     */
+    prev_box = PIXREGION_BOX (region, prev_start);
+    cur_box = PIXREGION_BOX (region, cur_start);
+    if (prev_box->y2 != cur_box->y1) return cur_start;
+
+    /*
+     * Make sure the bands have boxes in the same places. This
+     * assumes that boxes have been added in such a way that they
+     * cover the most area possible. I.e. two boxes in a band must
+     * have some horizontal space between them.
+     */
+    y2 = cur_box->y2;
+
+    do
+    {
+       if ((prev_box->x1 != cur_box->x1) || (prev_box->x2 != cur_box->x2))
+           return (cur_start);
+       
+       prev_box++;
+       cur_box++;
+       numRects--;
+    }
+    while (numRects);
+
+    /*
+     * The bands may be merged, so set the bottom y of each box
+     * in the previous band to the bottom y of the current band.
+     */
+    numRects = cur_start - prev_start;
+    region->data->numRects -= numRects;
+
+    do
+    {
+       prev_box--;
+       prev_box->y2 = y2;
+       numRects--;
+    }
+    while (numRects);
+
+    return prev_start;
+}
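+
+/* Worked example: if the previous band holds { 0, 0, 10, 5 } and
+ * { 20, 0, 30, 5 } while the current band holds { 0, 5, 10, 9 } and
+ * { 20, 5, 30, 9 }, the bands touch (y2 == y1 == 5) and their x-spans
+ * match, so the loop above rewrites the previous band's y2 to 9 and the
+ * current band's boxes are discarded, halving numRects.
+ */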
+
+/* Quicky macro to avoid trivial reject procedure calls to pixman_coalesce */
+
+#define COALESCE(new_reg, prev_band, cur_band)                          \
+    do                                                                 \
+    {                                                                  \
+       if (cur_band - prev_band == new_reg->data->numRects - cur_band) \
+           prev_band = pixman_coalesce (new_reg, prev_band, cur_band); \
+       else                                                            \
+           prev_band = cur_band;                                       \
+    } while (0)
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_append_non_o --
+ *     Handle a non-overlapping band for the union and subtract operations.
+ *      Just adds the (top/bottom-clipped) rectangles into the region.
+ *      Doesn't have to check for subsumption or anything.
+ *
+ * Results:
+ *     None.
+ *
+ * Side Effects:
+ *     region->data->numRects is incremented and the rectangles overwritten
+ *     with the rectangles we're passed.
+ *
+ *-----------------------------------------------------------------------
+ */
+static inline pixman_bool_t
+pixman_region_append_non_o (region_type_t * region,
+                           box_type_t *    r,
+                           box_type_t *    r_end,
+                           int             y1,
+                           int             y2)
+{
+    box_type_t *next_rect;
+    int new_rects;
+
+    new_rects = r_end - r;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (new_rects != 0);
+
+    /* Make sure we have enough space for all rectangles to be added */
+    RECTALLOC (region, new_rects);
+    next_rect = PIXREGION_TOP (region);
+    region->data->numRects += new_rects;
+
+    do
+    {
+       critical_if_fail (r->x1 < r->x2);
+       ADDRECT (next_rect, r->x1, y1, r->x2, y2);
+       r++;
+    }
+    while (r != r_end);
+
+    return TRUE;
+}
+
+#define FIND_BAND(r, r_band_end, r_end, ry1)                        \
+    do                                                              \
+    {                                                               \
+       ry1 = r->y1;                                                 \
+       r_band_end = r + 1;                                          \
+       while ((r_band_end != r_end) && (r_band_end->y1 == ry1)) {   \
+           r_band_end++;                                            \
+       }                                                            \
+    } while (0)
+
+#define APPEND_REGIONS(new_reg, r, r_end)                              \
+    do                                                                 \
+    {                                                                  \
+       int new_rects;                                                  \
+       if ((new_rects = r_end - r)) {                                  \
+           RECTALLOC_BAIL (new_reg, new_rects, bail);                  \
+           memmove ((char *)PIXREGION_TOP (new_reg), (char *)r,        \
+                    new_rects * sizeof(box_type_t));                   \
+           new_reg->data->numRects += new_rects;                       \
+       }                                                               \
+    } while (0)
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_op --
+ *     Apply an operation to two regions. Called by pixman_region_union, pixman_region_inverse,
+ *     pixman_region_subtract, pixman_region_intersect....  Both regions MUST have at least one
+ *      rectangle, and cannot be the same object.
+ *
+ * Results:
+ *     TRUE if successful.
+ *
+ * Side Effects:
+ *     The new region is overwritten.
+ *     overlap set to TRUE if overlap_func ever returns TRUE.
+ *
+ * Notes:
+ *     The idea behind this function is to view the two regions as sets.
+ *     Together they cover a rectangle of area that this function divides
+ *     into horizontal bands where points are covered only by one region
+ *     or by both. In the first case, the band is appended directly via
+ *     pixman_region_append_non_o (when the corresponding append_non1 or
+ *     append_non2 flag is set) with the band's upper and lower extents.
+ *     For the second, the overlap_func is called to process the entire
+ *     band. It is responsible for clipping the rectangles in the band,
+ *     though this function provides the boundaries.
+ *     At the end of each band, the new region is coalesced, if possible,
+ *     to reduce the number of rectangles in the region.
+ *
+ *-----------------------------------------------------------------------
+ */
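+
+/* For reference, the callers below select the band handling as follows:
+ *
+ *     operation     overlap_func                append_non1  append_non2
+ *     intersect     pixman_region_intersect_o   FALSE        FALSE
+ *     union         pixman_region_union_o       TRUE         TRUE
+ *     subtract      pixman_region_subtract_o    TRUE         FALSE
+ *     inverse       pixman_region_subtract_o    TRUE         FALSE
+ */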
+
+typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region,
+                                          box_type_t *   r1,
+                                          box_type_t *   r1_end,
+                                          box_type_t *   r2,
+                                          box_type_t *   r2_end,
+                                          int            y1,
+                                          int            y2,
+                                          int *          overlap);
+
+static pixman_bool_t
+pixman_op (region_type_t *  new_reg,               /* Place to store result        */
+          region_type_t *  reg1,                  /* First region in operation     */
+          region_type_t *  reg2,                  /* Second region in operation    */
+          overlap_proc_ptr overlap_func,          /* Function to call for
+                                                   * overlapping bands             */
+          int              append_non1,           /* Append non-overlapping bands
+                                                   * in region 1?                  */
+          int              append_non2,           /* Append non-overlapping bands
+                                                   * in region 2?                  */
+          int *            overlap)
+{
+    box_type_t *r1;                 /* Pointer into first region     */
+    box_type_t *r2;                 /* Pointer into second region    */
+    box_type_t *r1_end;             /* End of first region           */
+    box_type_t *r2_end;             /* End of second region          */
+    int ybot;                       /* Bottom of intersection       */
+    int ytop;                       /* Top of intersection          */
+    region_data_type_t *old_data;   /* Old data for new_reg         */
+    int prev_band;                  /* Index of start of
+                                    * previous band in new_reg       */
+    int cur_band;                   /* Index of start of current
+                                    * band in new_reg               */
+    box_type_t * r1_band_end;       /* End of current band in r1     */
+    box_type_t * r2_band_end;       /* End of current band in r2     */
+    int top;                        /* Top of non-overlapping band   */
+    int bot;                        /* Bottom of non-overlapping band*/
+    int r1y1;                       /* Temps for r1->y1 and r2->y1   */
+    int r2y1;
+    int new_size;
+    int numRects;
+
+    /*
+     * Break any region computed from a broken region
+     */
+    if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+       return pixman_break (new_reg);
+
+    /*
+     * Initialization:
+     * set r1, r2, r1_end and r2_end appropriately, save the rectangles
+     * of the destination region until the end in case it's one of
+     * the two source regions, then mark the "new" region empty, allocating
+     * another array of rectangles for it to use.
+     */
+
+    r1 = PIXREGION_RECTS (reg1);
+    new_size = PIXREGION_NUMRECTS (reg1);
+    r1_end = r1 + new_size;
+
+    numRects = PIXREGION_NUMRECTS (reg2);
+    r2 = PIXREGION_RECTS (reg2);
+    r2_end = r2 + numRects;
+    
+    critical_if_fail (r1 != r1_end);
+    critical_if_fail (r2 != r2_end);
+
+    old_data = (region_data_type_t *)NULL;
+
+    if (((new_reg == reg1) && (new_size > 1)) ||
+        ((new_reg == reg2) && (numRects > 1)))
+    {
+        old_data = new_reg->data;
+        new_reg->data = pixman_region_empty_data;
+    }
+
+    /* guess at new size */
+    if (numRects > new_size)
+       new_size = numRects;
+
+    new_size <<= 1;
+
+    if (!new_reg->data)
+       new_reg->data = pixman_region_empty_data;
+    else if (new_reg->data->size)
+       new_reg->data->numRects = 0;
+
+    if (new_size > new_reg->data->size)
+    {
+        if (!pixman_rect_alloc (new_reg, new_size))
+        {
+            if (old_data)
+               free (old_data);
+            return FALSE;
+       }
+    }
+
+    /*
+     * Initialize ybot.
+     * In the upcoming loop, ybot and ytop serve different functions depending
+     * on whether the band being handled is an overlapping or non-overlapping
+     * band.
+     *  In the case of a non-overlapping band (only one of the regions
+     * has points in the band), ybot is the bottom of the most recent
+     * intersection and thus clips the top of the rectangles in that band.
+     * ytop is the top of the next intersection between the two regions and
+     * serves to clip the bottom of the rectangles in the current band.
+     * For an overlapping band (where the two regions intersect), ytop clips
+     * the top of the rectangles of both regions and ybot clips the bottoms.
+     */
+
+    ybot = MIN (r1->y1, r2->y1);
+
+    /*
+     * prev_band serves to mark the start of the previous band so rectangles
+     * can be coalesced into larger rectangles. qv. pixman_coalesce, above.
+     * In the beginning, there is no previous band, so prev_band == cur_band
+     * (cur_band is set later on, of course, but the first band will always
+     * start at index 0). prev_band and cur_band must be indices because of
+     * the possible expansion, and resultant moving, of the new region's
+     * array of rectangles.
+     */
+    prev_band = 0;
+
+    do
+    {
+        /*
+        * This algorithm proceeds one source-band (as opposed to a
+        * destination band, which is determined by where the two regions
+        * intersect) at a time. r1_band_end and r2_band_end serve to mark the
+        * rectangle after the last one in the current band for their
+        * respective regions.
+        */
+        critical_if_fail (r1 != r1_end);
+        critical_if_fail (r2 != r2_end);
+
+        FIND_BAND (r1, r1_band_end, r1_end, r1y1);
+        FIND_BAND (r2, r2_band_end, r2_end, r2y1);
+
+        /*
+        * First handle the band that doesn't intersect, if any.
+        *
+        * Note that attention is restricted to one band in the
+        * non-intersecting region at once, so if a region has n
+        * bands between the current position and the next place it overlaps
+        * the other, this entire loop will be passed through n times.
+        */
+        if (r1y1 < r2y1)
+        {
+            if (append_non1)
+            {
+                top = MAX (r1y1, ybot);
+                bot = MIN (r1->y2, r2y1);
+                if (top != bot)
+                {
+                    cur_band = new_reg->data->numRects;
+                    if (!pixman_region_append_non_o (new_reg, r1, r1_band_end, top, bot))
+                       goto bail;
+                    COALESCE (new_reg, prev_band, cur_band);
+               }
+           }
+            ytop = r2y1;
+       }
+        else if (r2y1 < r1y1)
+        {
+            if (append_non2)
+            {
+                top = MAX (r2y1, ybot);
+                bot = MIN (r2->y2, r1y1);
+               
+                if (top != bot)
+                {
+                    cur_band = new_reg->data->numRects;
+
+                    if (!pixman_region_append_non_o (new_reg, r2, r2_band_end, top, bot))
+                       goto bail;
+
+                    COALESCE (new_reg, prev_band, cur_band);
+               }
+           }
+            ytop = r1y1;
+       }
+        else
+        {
+            ytop = r1y1;
+       }
+
+        /*
+        * Now see if we've hit an intersecting band. The two bands only
+        * intersect if ybot > ytop
+        */
+        ybot = MIN (r1->y2, r2->y2);
+        if (ybot > ytop)
+        {
+            cur_band = new_reg->data->numRects;
+
+            if (!(*overlap_func)(new_reg,
+                                 r1, r1_band_end,
+                                 r2, r2_band_end,
+                                 ytop, ybot,
+                                 overlap))
+           {
+               goto bail;
+           }
+           
+            COALESCE (new_reg, prev_band, cur_band);
+       }
+
+        /*
+        * If we've finished with a band (y2 == ybot) we skip forward
+        * in the region to the next band.
+        */
+        if (r1->y2 == ybot)
+           r1 = r1_band_end;
+
+        if (r2->y2 == ybot)
+           r2 = r2_band_end;
+
+    }
+    while (r1 != r1_end && r2 != r2_end);
+
+    /*
+     * Deal with whichever region (if any) still has rectangles left.
+     *
+     * We only need to worry about banding and coalescing for the very first
+     * band left.  After that, we can just group all remaining boxes,
+     * regardless of how many bands, into one final append to the list.
+     */
+
+    if ((r1 != r1_end) && append_non1)
+    {
+        /* Append the first remaining band of region 1; it may coalesce */
+        FIND_BAND (r1, r1_band_end, r1_end, r1y1);
+       
+        cur_band = new_reg->data->numRects;
+       
+        if (!pixman_region_append_non_o (new_reg,
+                                         r1, r1_band_end,
+                                         MAX (r1y1, ybot), r1->y2))
+       {
+           goto bail;
+       }
+       
+        COALESCE (new_reg, prev_band, cur_band);
+
+        /* Just append the rest of the boxes  */
+        APPEND_REGIONS (new_reg, r1_band_end, r1_end);
+    }
+    else if ((r2 != r2_end) && append_non2)
+    {
+        /* Append the first remaining band of region 2; it may coalesce */
+        FIND_BAND (r2, r2_band_end, r2_end, r2y1);
+
+       cur_band = new_reg->data->numRects;
+
+        if (!pixman_region_append_non_o (new_reg,
+                                         r2, r2_band_end,
+                                         MAX (r2y1, ybot), r2->y2))
+       {
+           goto bail;
+       }
+
+        COALESCE (new_reg, prev_band, cur_band);
+
+        /* Append rest of boxes */
+        APPEND_REGIONS (new_reg, r2_band_end, r2_end);
+    }
+
+    if (old_data)
+       free (old_data);
+
+    if (!(numRects = new_reg->data->numRects))
+    {
+        FREE_DATA (new_reg);
+        new_reg->data = pixman_region_empty_data;
+    }
+    else if (numRects == 1)
+    {
+        new_reg->extents = *PIXREGION_BOXPTR (new_reg);
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+    }
+    else
+    {
+        DOWNSIZE (new_reg, numRects);
+    }
+
+    return TRUE;
+
+bail:
+    if (old_data)
+       free (old_data);
+
+    return pixman_break (new_reg);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_set_extents --
+ *     Reset the extents of a region to what they should be. Called by
+ *     pixman_region_subtract and pixman_region_intersect as they can't
+ *      figure it out along the way or do so easily, as pixman_region_union can.
+ *
+ * Results:
+ *     None.
+ *
+ * Side Effects:
+ *     The region's 'extents' structure is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+static void
+pixman_set_extents (region_type_t *region)
+{
+    box_type_t *box, *box_end;
+
+    if (!region->data)
+       return;
+
+    if (!region->data->size)
+    {
+        region->extents.x2 = region->extents.x1;
+        region->extents.y2 = region->extents.y1;
+        return;
+    }
+
+    box = PIXREGION_BOXPTR (region);
+    box_end = PIXREGION_END (region);
+
+    /*
+     * Since box is the first rectangle in the region, it must have the
+     * smallest y1 and since box_end is the last rectangle in the region,
+     * it must have the largest y2, because of banding. Initialize x1 and
+     * x2 from  box and box_end, resp., as good things to initialize them
+     * to...
+     */
+    region->extents.x1 = box->x1;
+    region->extents.y1 = box->y1;
+    region->extents.x2 = box_end->x2;
+    region->extents.y2 = box_end->y2;
+
+    critical_if_fail (region->extents.y1 < region->extents.y2);
+
+    while (box <= box_end)
+    {
+        if (box->x1 < region->extents.x1)
+           region->extents.x1 = box->x1;
+        if (box->x2 > region->extents.x2)
+           region->extents.x2 = box->x2;
+        box++;
+    }
+
+    critical_if_fail (region->extents.x1 < region->extents.x2);
+}
+
+/*======================================================================
+ *         Region Intersection
+ *====================================================================*/
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_intersect_o --
+ *     Handle an overlapping band for pixman_region_intersect.
+ *
+ * Results:
+ *     TRUE if successful.
+ *
+ * Side Effects:
+ *     Rectangles may be added to the region.
+ *
+ *-----------------------------------------------------------------------
+ */
+/*ARGSUSED*/
+static pixman_bool_t
+pixman_region_intersect_o (region_type_t *region,
+                           box_type_t *   r1,
+                           box_type_t *   r1_end,
+                           box_type_t *   r2,
+                           box_type_t *   r2_end,
+                           int            y1,
+                           int            y2,
+                           int *          overlap)
+{
+    int x1;
+    int x2;
+    box_type_t *        next_rect;
+
+    next_rect = PIXREGION_TOP (region);
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    do
+    {
+        x1 = MAX (r1->x1, r2->x1);
+        x2 = MIN (r1->x2, r2->x2);
+
+        /*
+        * If there's any overlap between the two rectangles, add that
+        * overlap to the new region.
+        */
+        if (x1 < x2)
+           NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+        /*
+        * Advance the pointer(s) with the leftmost right side, since the next
+        * rectangle on that list may still overlap the other region's
+        * current rectangle.
+        */
+        if (r1->x2 == x2)
+        {
+            r1++;
+       }
+        if (r2->x2 == x2)
+        {
+            r2++;
+       }
+    }
+    while ((r1 != r1_end) && (r2 != r2_end));
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_intersect) (region_type_t *     new_reg,
+                     region_type_t *        reg1,
+                     region_type_t *        reg2)
+{
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /* check for trivial reject */
+    if (PIXREGION_NIL (reg1) || PIXREGION_NIL (reg2) ||
+        !EXTENTCHECK (&reg1->extents, &reg2->extents))
+    {
+        /* Covers about 20% of all cases */
+        FREE_DATA (new_reg);
+        new_reg->extents.x2 = new_reg->extents.x1;
+        new_reg->extents.y2 = new_reg->extents.y1;
+        if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+        {
+            new_reg->data = pixman_broken_data;
+            return FALSE;
+       }
+        else
+       {
+           new_reg->data = pixman_region_empty_data;
+       }
+    }
+    else if (!reg1->data && !reg2->data)
+    {
+        /* Covers about 80% of cases that aren't trivially rejected */
+        new_reg->extents.x1 = MAX (reg1->extents.x1, reg2->extents.x1);
+        new_reg->extents.y1 = MAX (reg1->extents.y1, reg2->extents.y1);
+        new_reg->extents.x2 = MIN (reg1->extents.x2, reg2->extents.x2);
+        new_reg->extents.y2 = MIN (reg1->extents.y2, reg2->extents.y2);
+
+        FREE_DATA (new_reg);
+
+       new_reg->data = (region_data_type_t *)NULL;
+    }
+    else if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        return PREFIX (_copy) (new_reg, reg2);
+    }
+    else if (reg1 == reg2)
+    {
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else
+    {
+        /* General purpose intersection */
+        int overlap; /* result ignored */
+
+        if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE,
+                        &overlap))
+       {
+           return FALSE;
+       }
+       
+        pixman_set_extents (new_reg);
+    }
+
+    GOOD (new_reg);
+    return TRUE;
+}
+
+#define MERGERECT(r)                                                   \
+    do                                                                 \
+    {                                                                  \
+        if (r->x1 <= x2)                                               \
+       {                                                               \
+            /* Merge with current rectangle */                         \
+            if (r->x1 < x2)                                            \
+               *overlap = TRUE;                                        \
+                                                                       \
+            if (x2 < r->x2)                                            \
+               x2 = r->x2;                                             \
+       }                                                               \
+       else                                                            \
+       {                                                               \
+            /* Add current rectangle, start new one */                 \
+            NEWRECT (region, next_rect, x1, y1, x2, y2);               \
+            x1 = r->x1;                                                        \
+            x2 = r->x2;                                                        \
+       }                                                               \
+        r++;                                                           \
+    } while (0)
+
+/*======================================================================
+ *         Region Union
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_union_o --
+ *     Handle an overlapping band for the union operation. Picks the
+ *     left-most rectangle each time and merges it into the region.
+ *
+ * Results:
+ *     TRUE if successful.
+ *
+ * Side Effects:
+ *     region is overwritten.
+ *     overlap is set to TRUE if any boxes overlap.
+ *
+ *-----------------------------------------------------------------------
+ */
+static pixman_bool_t
+pixman_region_union_o (region_type_t *region,
+                      box_type_t *   r1,
+                      box_type_t *   r1_end,
+                      box_type_t *   r2,
+                      box_type_t *   r2_end,
+                      int            y1,
+                      int            y2,
+                      int *          overlap)
+{
+    box_type_t *next_rect;
+    int x1;            /* left and right side of current union */
+    int x2;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    /* Start off current rectangle */
+    if (r1->x1 < r2->x1)
+    {
+        x1 = r1->x1;
+        x2 = r1->x2;
+        r1++;
+    }
+    else
+    {
+        x1 = r2->x1;
+        x2 = r2->x2;
+        r2++;
+    }
+    while (r1 != r1_end && r2 != r2_end)
+    {
+        if (r1->x1 < r2->x1)
+           MERGERECT (r1);
+       else
+           MERGERECT (r2);
+    }
+
+    /* Finish off whoever (if any) is left */
+    if (r1 != r1_end)
+    {
+        do
+        {
+            MERGERECT (r1);
+       }
+        while (r1 != r1_end);
+    }
+    else if (r2 != r2_end)
+    {
+        do
+        {
+            MERGERECT (r2);
+       }
+        while (r2 != r2_end);
+    }
+
+    /* Add current rectangle */
+    NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX(_intersect_rect) (region_type_t *dest,
+                        region_type_t *source,
+                        int x, int y,
+                        unsigned int width,
+                        unsigned int height)
+{
+    region_type_t region;
+
+    region.data = NULL;
+    region.extents.x1 = x;
+    region.extents.y1 = y;
+    region.extents.x2 = x + width;
+    region.extents.y2 = y + height;
+
+    return PREFIX(_intersect) (dest, source, &region);
+}
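+
+/* Illustrative sketch (assuming the pixman_region32_* instantiation of
+ * these macros): clipping a damage region in place to a width x height
+ * framebuffer would be
+ *
+ *     pixman_region32_intersect_rect (&damage, &damage, 0, 0,
+ *                                     width, height);
+ *
+ * dest and source may be the same region: pixman_op keeps the old box
+ * array alive (old_data) until the result is complete.
+ */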
+
+/* Convenience function for performing union of region with a
+ * single rectangle
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union_rect) (region_type_t *dest,
+                      region_type_t *source,
+                      int            x,
+                     int            y,
+                      unsigned int   width,
+                     unsigned int   height)
+{
+    region_type_t region;
+
+    region.extents.x1 = x;
+    region.extents.y1 = y;
+    region.extents.x2 = x + width;
+    region.extents.y2 = y + height;
+
+    if (!GOOD_RECT (&region.extents))
+    {
+        if (BAD_RECT (&region.extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+       return PREFIX (_copy) (dest, source);
+    }
+
+    region.data = NULL;
+
+    return PREFIX (_union) (dest, source, &region);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union) (region_type_t *new_reg,
+                 region_type_t *reg1,
+                 region_type_t *reg2)
+{
+    int overlap; /* result ignored */
+
+    /* Return TRUE if some overlap
+     * between reg1, reg2
+     */
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /*  checks all the simple cases */
+
+    /*
+     * Region 1 and 2 are the same
+     */
+    if (reg1 == reg2)
+        return PREFIX (_copy) (new_reg, reg1);
+
+    /*
+     * Region 1 is empty
+     */
+    if (PIXREGION_NIL (reg1))
+    {
+        if (PIXREGION_NAR (reg1))
+           return pixman_break (new_reg);
+
+        if (new_reg != reg2)
+           return PREFIX (_copy) (new_reg, reg2);
+
+       return TRUE;
+    }
+
+    /*
+     * Region 2 is empty
+     */
+    if (PIXREGION_NIL (reg2))
+    {
+        if (PIXREGION_NAR (reg2))
+           return pixman_break (new_reg);
+
+       if (new_reg != reg1)
+           return PREFIX (_copy) (new_reg, reg1);
+
+       return TRUE;
+    }
+
+    /*
+     * Region 1 completely subsumes region 2
+     */
+    if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        if (new_reg != reg1)
+           return PREFIX (_copy) (new_reg, reg1);
+
+       return TRUE;
+    }
+
+    /*
+     * Region 2 completely subsumes region 1
+     */
+    if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        if (new_reg != reg2)
+           return PREFIX (_copy) (new_reg, reg2);
+
+       return TRUE;
+    }
+
+    if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE, &overlap))
+       return FALSE;
+
+    new_reg->extents.x1 = MIN (reg1->extents.x1, reg2->extents.x1);
+    new_reg->extents.y1 = MIN (reg1->extents.y1, reg2->extents.y1);
+    new_reg->extents.x2 = MAX (reg1->extents.x2, reg2->extents.x2);
+    new_reg->extents.y2 = MAX (reg1->extents.y2, reg2->extents.y2);
+    
+    GOOD (new_reg);
+
+    return TRUE;
+}
+
+/*======================================================================
+ *         Batch Rectangle Union
+ *====================================================================*/
+
+#define EXCHANGE_RECTS(a, b)    \
+    do                          \
+    {                           \
+        box_type_t t;           \
+        t = rects[a];           \
+        rects[a] = rects[b];    \
+        rects[b] = t;           \
+    } while (0)
+
+static void
+quick_sort_rects (
+    box_type_t rects[],
+    int        numRects)
+{
+    int y1;
+    int x1;
+    int i, j;
+    box_type_t *r;
+
+    /* Always called with numRects > 1 */
+
+    do
+    {
+        if (numRects == 2)
+        {
+            if (rects[0].y1 > rects[1].y1 ||
+                (rects[0].y1 == rects[1].y1 && rects[0].x1 > rects[1].x1))
+           {
+               EXCHANGE_RECTS (0, 1);
+           }
+
+            return;
+       }
+
+        /* Choose partition element, stick in location 0 */
+        EXCHANGE_RECTS (0, numRects >> 1);
+        y1 = rects[0].y1;
+        x1 = rects[0].x1;
+
+        /* Partition array */
+        i = 0;
+        j = numRects;
+
+        do
+        {
+            r = &(rects[i]);
+            do
+            {
+                r++;
+                i++;
+            }
+            while (i != numRects &&
+                   (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1)));
+
+            r = &(rects[j]);
+            do
+            {
+                r--;
+                j--;
+            }
+            while (y1 < r->y1 || (y1 == r->y1 && x1 < r->x1));
+
+            if (i < j)
+                EXCHANGE_RECTS (i, j);
+        }
+        while (i < j);
+
+        /* Move partition element back to middle */
+        EXCHANGE_RECTS (0, j);
+
+        /* Recurse */
+        if (numRects - j - 1 > 1)
+           quick_sort_rects (&rects[j + 1], numRects - j - 1);
+
+        numRects = j;
+    }
+    while (numRects > 1);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_validate --
+ *
+ *      Take a ``region'' which is a non-y-x-banded random collection of
+ *      rectangles, and compute a nice region which is the union of all the
+ *      rectangles.
+ *
+ * Results:
+ *     TRUE if successful.
+ *
+ * Side Effects:
+ *      The passed-in ``region'' may be modified.
+ *     overlap set to TRUE if any rectangles overlapped,
+ *      else FALSE;
+ *
+ * Strategy:
+ *      Step 1. Sort the rectangles into ascending order with primary key y1
+ *             and secondary key x1.
+ *
+ *      Step 2. Split the rectangles into the minimum number of proper y-x
+ *             banded regions.  This may require horizontally merging
+ *             rectangles, and vertically coalescing bands.  With any luck,
+ *             this step is an identity transformation (a la the Box widget),
+ *             or a coalescing into 1 box (a la Menus).
+ *
+ *     Step 3. Merge the separate regions down to a single region by calling
+ *             pixman_region_union.  Maximize the work each pixman_region_union call does by using
+ *             a binary merge.
+ *
+ *-----------------------------------------------------------------------
+ */
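+
+/* Step 3 sketch: with num_ri == 8 intermediate regions, each round of the
+ * merge loop below unions ri[j] with ri[j + half], leaving 4 regions,
+ * then 2, then 1; that is log2(num_ri) rounds of pixman_op rather than a
+ * linear chain of unions.
+ */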
+
+static pixman_bool_t
+validate (region_type_t * badreg,
+          int *           overlap)
+{
+    /* Descriptor for regions under construction in Step 2. */
+    typedef struct
+    {
+        region_type_t reg;
+        int prev_band;
+        int cur_band;
+    } region_info_t;
+
+    region_info_t stack_regions[64];
+
+    int numRects;                   /* Original numRects for badreg        */
+    region_info_t *ri;              /* Array of current regions                    */
+    int num_ri;                     /* Number of entries used in ri        */
+    int size_ri;                    /* Number of entries available in ri    */
+    int i;                          /* Index into rects                            */
+    int j;                          /* Index into ri                       */
+    region_info_t *rit;             /* &ri[j]                              */
+    region_type_t *reg;             /* ri[j].reg                           */
+    box_type_t *box;                /* Current box in rects                */
+    box_type_t *ri_box;             /* Last box in ri[j].reg               */
+    region_type_t *hreg;            /* ri[j_half].reg                      */
+    pixman_bool_t ret = TRUE;
+
+    *overlap = FALSE;
+    if (!badreg->data)
+    {
+        GOOD (badreg);
+        return TRUE;
+    }
+    
+    numRects = badreg->data->numRects;
+    if (!numRects)
+    {
+        if (PIXREGION_NAR (badreg))
+           return FALSE;
+        GOOD (badreg);
+        return TRUE;
+    }
+    
+    if (badreg->extents.x1 < badreg->extents.x2)
+    {
+        if ((numRects) == 1)
+        {
+            FREE_DATA (badreg);
+            badreg->data = (region_data_type_t *) NULL;
+       }
+        else
+        {
+            DOWNSIZE (badreg, numRects);
+       }
+
+        GOOD (badreg);
+
+       return TRUE;
+    }
+
+    /* Step 1: Sort the rects array into ascending (y1, x1) order */
+    quick_sort_rects (PIXREGION_BOXPTR (badreg), numRects);
+
+    /* Step 2: Scatter the sorted array into the minimum number of regions */
+
+    /* Set up the first region to be the first rectangle in badreg */
+    /* Note that step 2 code will never overflow the ri[0].reg rects array */
+    ri = stack_regions;
+    size_ri = sizeof (stack_regions) / sizeof (stack_regions[0]);
+    num_ri = 1;
+    ri[0].prev_band = 0;
+    ri[0].cur_band = 0;
+    ri[0].reg = *badreg;
+    box = PIXREGION_BOXPTR (&ri[0].reg);
+    ri[0].reg.extents = *box;
+    ri[0].reg.data->numRects = 1;
+    badreg->extents = *pixman_region_empty_box;
+    badreg->data = pixman_region_empty_data;
+
+    /* Now scatter rectangles into the minimum set of valid regions.  If the
+     * next rectangle to be added to a region would force an existing rectangle
+     * in the region to be split up in order to maintain y-x banding, just
+     * forget it.  Try the next region.  If it doesn't fit cleanly into any
+     * region, make a new one.
+     */
+
+    for (i = numRects; --i > 0;)
+    {
+        box++;
+        /* Look for a region to append box to */
+        for (j = num_ri, rit = ri; --j >= 0; rit++)
+        {
+            reg = &rit->reg;
+            ri_box = PIXREGION_END (reg);
+
+            if (box->y1 == ri_box->y1 && box->y2 == ri_box->y2)
+            {
+                /* box is in same band as ri_box.  Merge or append it */
+                if (box->x1 <= ri_box->x2)
+                {
+                    /* Merge it with ri_box */
+                    if (box->x1 < ri_box->x2)
+                       *overlap = TRUE;
+
+                    if (box->x2 > ri_box->x2)
+                       ri_box->x2 = box->x2;
+               }
+                else
+                {
+                    RECTALLOC_BAIL (reg, 1, bail);
+                    *PIXREGION_TOP (reg) = *box;
+                    reg->data->numRects++;
+               }
+               
+                goto next_rect;   /* So sue me */
+           }
+            else if (box->y1 >= ri_box->y2)
+            {
+                /* Put box into new band */
+                if (reg->extents.x2 < ri_box->x2)
+                   reg->extents.x2 = ri_box->x2;
+               
+                if (reg->extents.x1 > box->x1)
+                   reg->extents.x1 = box->x1;
+               
+                COALESCE (reg, rit->prev_band, rit->cur_band);
+                rit->cur_band = reg->data->numRects;
+                RECTALLOC_BAIL (reg, 1, bail);
+                *PIXREGION_TOP (reg) = *box;
+                reg->data->numRects++;
+
+                goto next_rect;
+           }
+            /* Well, this region was inappropriate.  Try the next one. */
+       } /* for j */
+
+        /* Uh-oh.  No regions were appropriate.  Create a new one. */
+        if (size_ri == num_ri)
+        {
+            size_t data_size;
+
+            /* Oops, allocate space for new region information */
+            size_ri <<= 1;
+
+            data_size = size_ri * sizeof(region_info_t);
+            if (data_size / size_ri != sizeof(region_info_t))
+               goto bail;
+
+            if (ri == stack_regions)
+            {
+                rit = malloc (data_size);
+                if (!rit)
+                   goto bail;
+                memcpy (rit, ri, num_ri * sizeof (region_info_t));
+           }
+            else
+            {
+                rit = (region_info_t *) realloc (ri, data_size);
+                if (!rit)
+                   goto bail;
+           }
+            ri = rit;
+            rit = &ri[num_ri];
+       }
+        num_ri++;
+        rit->prev_band = 0;
+        rit->cur_band = 0;
+        rit->reg.extents = *box;
+        rit->reg.data = (region_data_type_t *)NULL;
+
+       /* MUST force allocation */
+        if (!pixman_rect_alloc (&rit->reg, (i + num_ri) / num_ri))
+           goto bail;
+       
+    next_rect: ;
+    } /* for i */
+
+    /* Make a final pass over each region in order to COALESCE and set
+     * extents.x2 and extents.y2
+     */
+    for (j = num_ri, rit = ri; --j >= 0; rit++)
+    {
+        reg = &rit->reg;
+        ri_box = PIXREGION_END (reg);
+        reg->extents.y2 = ri_box->y2;
+
+        if (reg->extents.x2 < ri_box->x2)
+           reg->extents.x2 = ri_box->x2;
+       
+        COALESCE (reg, rit->prev_band, rit->cur_band);
+
+       if (reg->data->numRects == 1) /* keep unions happy below */
+        {
+            FREE_DATA (reg);
+            reg->data = (region_data_type_t *)NULL;
+       }
+    }
+
+    /* Step 3: Union all regions into a single region */
+    while (num_ri > 1)
+    {
+        int half = num_ri / 2;
+        for (j = num_ri & 1; j < (half + (num_ri & 1)); j++)
+        {
+            reg = &ri[j].reg;
+            hreg = &ri[j + half].reg;
+
+            if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE, overlap))
+               ret = FALSE;
+
+            if (hreg->extents.x1 < reg->extents.x1)
+               reg->extents.x1 = hreg->extents.x1;
+
+            if (hreg->extents.y1 < reg->extents.y1)
+               reg->extents.y1 = hreg->extents.y1;
+
+            if (hreg->extents.x2 > reg->extents.x2)
+               reg->extents.x2 = hreg->extents.x2;
+
+            if (hreg->extents.y2 > reg->extents.y2)
+               reg->extents.y2 = hreg->extents.y2;
+
+            FREE_DATA (hreg);
+       }
+
+        num_ri -= half;
+
+       if (!ret)
+           goto bail;
+    }
+
+    *badreg = ri[0].reg;
+
+    if (ri != stack_regions)
+       free (ri);
+
+    GOOD (badreg);
+    return ret;
+
+bail:
+    for (i = 0; i < num_ri; i++)
+       FREE_DATA (&ri[i].reg);
+
+    if (ri != stack_regions)
+       free (ri);
+
+    return pixman_break (badreg);
+}
+
+/*======================================================================
+ *                Region Subtraction
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract_o --
+ *     Overlapping band subtraction. x1 is the left-most point not yet
+ *     checked.
+ *
+ * Results:
+ *     TRUE if successful.
+ *
+ * Side Effects:
+ *     region may have rectangles added to it.
+ *
+ *-----------------------------------------------------------------------
+ */
+/*ARGSUSED*/
+static pixman_bool_t
+pixman_region_subtract_o (region_type_t * region,
+                         box_type_t *    r1,
+                         box_type_t *    r1_end,
+                         box_type_t *    r2,
+                         box_type_t *    r2_end,
+                         int             y1,
+                         int             y2,
+                         int *           overlap)
+{
+    box_type_t *        next_rect;
+    int x1;
+
+    x1 = r1->x1;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    do
+    {
+        if (r2->x2 <= x1)
+        {
+            /*
+            * Subtrahend entirely to left of minuend: go to next subtrahend.
+            */
+            r2++;
+       }
+        else if (r2->x1 <= x1)
+        {
+            /*
+            * Subtrahend precedes minuend: nuke left edge of minuend.
+            */
+            x1 = r2->x2;
+            if (x1 >= r1->x2)
+            {
+                /*
+                * Minuend completely covered: advance to next minuend and
+                * reset left fence to edge of new minuend.
+                */
+                r1++;
+                if (r1 != r1_end)
+                   x1 = r1->x1;
+           }
+            else
+            {
+                /*
+                * Subtrahend now used up since it doesn't extend beyond
+                * minuend
+                */
+                r2++;
+           }
+       }
+        else if (r2->x1 < r1->x2)
+        {
+            /*
+            * Left part of subtrahend covers part of minuend: add uncovered
+            * part of minuend to region and skip to next subtrahend.
+            */
+            critical_if_fail (x1 < r2->x1);
+            NEWRECT (region, next_rect, x1, y1, r2->x1, y2);
+
+            x1 = r2->x2;
+            if (x1 >= r1->x2)
+            {
+                /*
+                * Minuend used up: advance to new...
+                */
+                r1++;
+                if (r1 != r1_end)
+                   x1 = r1->x1;
+           }
+            else
+            {
+                /*
+                * Subtrahend used up
+                */
+                r2++;
+           }
+       }
+        else
+        {
+            /*
+            * Minuend used up: add any remaining piece before advancing.
+            */
+            if (r1->x2 > x1)
+               NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+            r1++;
+
+           if (r1 != r1_end)
+               x1 = r1->x1;
+       }
+    }
+    while ((r1 != r1_end) && (r2 != r2_end));
+
+    /*
+     * Add remaining minuend rectangles to region.
+     */
+    while (r1 != r1_end)
+    {
+        critical_if_fail (x1 < r1->x2);
+
+        NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+        r1++;
+        if (r1 != r1_end)
+           x1 = r1->x1;
+    }
+    return TRUE;
+}
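+
+/* Worked example for one band: a minuend box spanning x [0,10) with
+ * subtrahend boxes [3,5) and [8,12) emits the uncovered pieces [0,3) and
+ * [5,8) via the cases above: the first subtrahend hits the "left part
+ * covers part of minuend" case, and the second covers the minuend's
+ * right edge, so nothing past x == 8 is emitted.
+ */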
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract --
+ *     Subtract reg_s from reg_m and leave the result in reg_d.
+ *     S stands for subtrahend, M for minuend and D for difference.
+ *
+ * Results:
+ *     TRUE if successful.
+ *
+ * Side Effects:
+ *     reg_d is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_subtract) (region_type_t *reg_d,
+                    region_type_t *reg_m,
+                    region_type_t *reg_s)
+{
+    int overlap; /* result ignored */
+
+    GOOD (reg_m);
+    GOOD (reg_s);
+    GOOD (reg_d);
+    
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg_m) || PIXREGION_NIL (reg_s) ||
+        !EXTENTCHECK (&reg_m->extents, &reg_s->extents))
+    {
+        if (PIXREGION_NAR (reg_s))
+           return pixman_break (reg_d);
+       
+        return PREFIX (_copy) (reg_d, reg_m);
+    }
+    else if (reg_m == reg_s)
+    {
+        FREE_DATA (reg_d);
+        reg_d->extents.x2 = reg_d->extents.x1;
+        reg_d->extents.y2 = reg_d->extents.y1;
+        reg_d->data = pixman_region_empty_data;
+
+        return TRUE;
+    }
+
+    /* Add those rectangles in region 1 that aren't in region 2,
+       do yucky subtraction for overlaps, and
+       just throw away rectangles in region 2 that aren't in region 1 */
+    if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE, &overlap))
+       return FALSE;
+
+    /*
+     * Can't alter reg_d's extents before we call pixman_op because
+     * it might be one of the source regions and pixman_op depends
+     * on the extents of those regions being unaltered. Besides, this
+     * way there's no checking against rectangles that will be nuked
+     * due to coalescing, so we have to examine fewer rectangles.
+     */
+    pixman_set_extents (reg_d);
+    GOOD (reg_d);
+    return TRUE;
+}
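+
+/* Illustrative usage sketch (box sizes hypothetical, function name made up),
+ * assuming the public pixman_region16 API from pixman.h; guarded out of the
+ * build:
+ */
+#if 0
+static void
+example_subtract (void)
+{
+    pixman_region16_t m, s, d;
+
+    pixman_region_init_rect (&m, 0, 0, 100, 100);   /* minuend */
+    pixman_region_init_rect (&s, 25, 25, 50, 50);   /* subtrahend */
+    pixman_region_init (&d);
+
+    if (pixman_region_subtract (&d, &m, &s))
+    {
+        /* d is the 100x100 box with a 50x50 hole: four boxes in
+         * three bands (top, left+right middle, bottom) */
+    }
+
+    pixman_region_fini (&m);
+    pixman_region_fini (&s);
+    pixman_region_fini (&d);
+}
+#endif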
+
+/*======================================================================
+ *         Region Inversion
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_inverse --
+ *     Take a region and a box and return a region that is everything
+ *     in the box but not in the region. The careful reader will note
+ *     that this is the same as subtracting the region from the box...
+ *
+ * Results:
+ *     TRUE.
+ *
+ * Side Effects:
+ *     new_reg is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+pixman_bool_t
+PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
+                                 region_type_t *reg1,     /* Region to invert */
+                                 box_type_t *   inv_rect) /* Bounding box for inversion */
+{
+    region_type_t inv_reg; /* Quick and dirty region made from the
+                           * bounding box */
+    int overlap;           /* result ignored */
+
+    GOOD (reg1);
+    GOOD (new_reg);
+    
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg1) || !EXTENTCHECK (inv_rect, &reg1->extents))
+    {
+        if (PIXREGION_NAR (reg1))
+           return pixman_break (new_reg);
+       
+        new_reg->extents = *inv_rect;
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+       
+        return TRUE;
+    }
+
+    /* Add those rectangles in region 1 that aren't in region 2,
+     * do yucky subtraction for overlaps, and
+     * just throw away rectangles in region 2 that aren't in region 1
+     */
+    inv_reg.extents = *inv_rect;
+    inv_reg.data = (region_data_type_t *)NULL;
+    if (!pixman_op (new_reg, &inv_reg, reg1, pixman_region_subtract_o, TRUE, FALSE, &overlap))
+       return FALSE;
+
+    /*
+     * Can't alter new_reg's extents before we call pixman_op because
+     * it might be one of the source regions and pixman_op depends
+     * on the extents of those regions being unaltered. Besides, this
+     * way there's no checking against rectangles that will be nuked
+     * due to coalescing, so we have to examine fewer rectangles.
+     */
+    pixman_set_extents (new_reg);
+    GOOD (new_reg);
+    return TRUE;
+}
+
+/* In time O(log n), locate the first box whose y2 is greater than y.
+ * Return @end if no such box exists.
+ */
+static box_type_t *
+find_box_for_y (box_type_t *begin, box_type_t *end, int y)
+{
+    box_type_t *mid;
+
+    if (end == begin)
+       return end;
+
+    if (end - begin == 1)
+    {
+       if (begin->y2 > y)
+           return begin;
+       else
+           return end;
+    }
+
+    mid = begin + (end - begin) / 2;
+    if (mid->y2 > y)
+    {
+       /* If no box is found in [begin, mid], the function
+        * will return @mid, which is then known to be the
+        * correct answer.
+        */
+       return find_box_for_y (begin, mid, y);
+    }
+    else
+    {
+       return find_box_for_y (mid, end, y);
+    }
+}
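+
+/* Worked example (hypothetical data): for four boxes with y2 = {2, 4, 4, 7}
+ * and y = 4, the search narrows [0,4) to [2,4), then to [2,3); the lone box
+ * there has y2 == 4, which is not > 4, so its @end is returned: the box
+ * with y2 == 7, the first whose y2 exceeds y.
+ */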
+
+/*
+ *   rect_in(region, rect)
+ *   This routine takes a pointer to a region and a pointer to a box
+ *   and determines if the box is outside/inside/partly inside the region.
+ *
+ *   The idea is to travel through the list of rectangles trying to cover the
+ *   passed box with them. Anytime a piece of the rectangle isn't covered
+ *   by a band of rectangles, part_out is set TRUE. Any time a rectangle in
+ *   the region covers part of the box, part_in is set TRUE. The process ends
+ *   when either the box has been completely covered (we reached a band that
+ *   doesn't overlap the box, part_in is TRUE and part_out is false), the
+ *   box has been partially covered (part_in == part_out == TRUE -- because of
+ *   the banding, the first time this is true we know the box is only
+ *   partially in the region) or is outside the region (we reached a band
+ *   that doesn't overlap the box at all and part_in is false)
+ */
+pixman_region_overlap_t
+PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t *  region,
+                                            box_type_t *     prect)
+{
+    box_type_t *     pbox;
+    box_type_t *     pbox_end;
+    int part_in, part_out;
+    int numRects;
+    int x, y;
+
+    GOOD (region);
+
+    numRects = PIXREGION_NUMRECTS (region);
+
+    /* useful optimization */
+    if (!numRects || !EXTENTCHECK (&region->extents, prect))
+       return(PIXMAN_REGION_OUT);
+
+    if (numRects == 1)
+    {
+        /* We know that it must be PIXMAN_REGION_IN or PIXMAN_REGION_PART */
+        if (SUBSUMES (&region->extents, prect))
+           return(PIXMAN_REGION_IN);
+        else
+           return(PIXMAN_REGION_PART);
+    }
+
+    part_out = FALSE;
+    part_in = FALSE;
+
+    /* (x,y) starts at upper left of rect, moving to the right and down */
+    x = prect->x1;
+    y = prect->y1;
+
+    /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */
+    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
+        pbox != pbox_end;
+        pbox++)
+    {
+       /* getting up to speed or skipping remainder of band */
+       if (pbox->y2 <= y)
+       {
+           if ((pbox = find_box_for_y (pbox, pbox_end, y)) == pbox_end)
+               break;
+       }
+
+        if (pbox->y1 > y)
+        {
+            part_out = TRUE;     /* missed part of rectangle above */
+            if (part_in || (pbox->y1 >= prect->y2))
+               break;
+            y = pbox->y1;       /* x guaranteed to be == prect->x1 */
+       }
+
+        if (pbox->x2 <= x)
+           continue;           /* not far enough over yet */
+
+        if (pbox->x1 > x)
+        {
+            part_out = TRUE;     /* missed part of rectangle to left */
+            if (part_in)
+               break;
+       }
+
+        if (pbox->x1 < prect->x2)
+        {
+            part_in = TRUE;      /* definitely overlap */
+            if (part_out)
+               break;
+       }
+
+        if (pbox->x2 >= prect->x2)
+        {
+            y = pbox->y2;       /* finished with this band */
+            if (y >= prect->y2)
+               break;
+            x = prect->x1;      /* reset x out to left again */
+       }
+        else
+        {
+            /*
+            * Because boxes in a band are maximal width, if the first box
+            * to overlap the rectangle doesn't completely cover it in that
+            * band, the rectangle must be partially out, since some of it
+            * will be uncovered in that band. part_in will have been set true
+            * by now...
+            */
+            part_out = TRUE;
+            break;
+       }
+    }
+
+    if (part_in)
+    {
+        if (y < prect->y2)
+           return PIXMAN_REGION_PART;
+        else
+           return PIXMAN_REGION_IN;
+    }
+    else
+    {
+        return PIXMAN_REGION_OUT;
+    }
+}
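+
+/* Usage sketch (box values hypothetical), assuming an already initialized
+ * region; guarded out of the build:
+ */
+#if 0
+pixman_box16_t box = { 10, 10, 20, 20 };        /* x1, y1, x2, y2 */
+
+switch (pixman_region_contains_rectangle (&region, &box))
+{
+case PIXMAN_REGION_IN:   /* box completely inside the region */ break;
+case PIXMAN_REGION_OUT:  /* box completely outside */           break;
+case PIXMAN_REGION_PART: /* box straddles the boundary */       break;
+}
+#endif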
+
+/* PREFIX(_translate) (region, x, y)
+ * translates in place
+ */
+
+PIXMAN_EXPORT void
+PREFIX (_translate) (region_type_t *region, int x, int y)
+{
+    overflow_int_t x1, x2, y1, y2;
+    int nbox;
+    box_type_t * pbox;
+
+    GOOD (region);
+    region->extents.x1 = x1 = region->extents.x1 + x;
+    region->extents.y1 = y1 = region->extents.y1 + y;
+    region->extents.x2 = x2 = region->extents.x2 + x;
+    region->extents.y2 = y2 = region->extents.y2 + y;
+    
+    if (((x1 - PIXMAN_REGION_MIN) | (y1 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x2) | (PIXMAN_REGION_MAX - y2)) >= 0)
+    {
+        if (region->data && (nbox = region->data->numRects))
+        {
+            for (pbox = PIXREGION_BOXPTR (region); nbox--; pbox++)
+            {
+                pbox->x1 += x;
+                pbox->y1 += y;
+                pbox->x2 += x;
+                pbox->y2 += y;
+           }
+       }
+        return;
+    }
+
+    if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0)
+    {
+        region->extents.x2 = region->extents.x1;
+        region->extents.y2 = region->extents.y1;
+        FREE_DATA (region);
+        region->data = pixman_region_empty_data;
+        return;
+    }
+
+    if (x1 < PIXMAN_REGION_MIN)
+       region->extents.x1 = PIXMAN_REGION_MIN;
+    else if (x2 > PIXMAN_REGION_MAX)
+       region->extents.x2 = PIXMAN_REGION_MAX;
+
+    if (y1 < PIXMAN_REGION_MIN)
+       region->extents.y1 = PIXMAN_REGION_MIN;
+    else if (y2 > PIXMAN_REGION_MAX)
+       region->extents.y2 = PIXMAN_REGION_MAX;
+
+    if (region->data && (nbox = region->data->numRects))
+    {
+        box_type_t * pbox_out;
+
+        for (pbox_out = pbox = PIXREGION_BOXPTR (region); nbox--; pbox++)
+        {
+            pbox_out->x1 = x1 = pbox->x1 + x;
+            pbox_out->y1 = y1 = pbox->y1 + y;
+            pbox_out->x2 = x2 = pbox->x2 + x;
+            pbox_out->y2 = y2 = pbox->y2 + y;
+
+            if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) |
+                 (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0)
+            {
+                region->data->numRects--;
+                continue;
+           }
+
+            if (x1 < PIXMAN_REGION_MIN)
+               pbox_out->x1 = PIXMAN_REGION_MIN;
+            else if (x2 > PIXMAN_REGION_MAX)
+               pbox_out->x2 = PIXMAN_REGION_MAX;
+
+            if (y1 < PIXMAN_REGION_MIN)
+               pbox_out->y1 = PIXMAN_REGION_MIN;
+            else if (y2 > PIXMAN_REGION_MAX)
+               pbox_out->y2 = PIXMAN_REGION_MAX;
+
+            pbox_out++;
+       }
+
+        if (pbox_out != pbox)
+        {
+            if (region->data->numRects == 1)
+            {
+                region->extents = *PIXREGION_BOXPTR (region);
+                FREE_DATA (region);
+                region->data = (region_data_type_t *)NULL;
+           }
+            else
+           {
+               pixman_set_extents (region);
+           }
+       }
+    }
+
+    GOOD (region);
+}
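+
+/* Usage sketch (hypothetical offsets): boxes shifted past the representable
+ * coordinate range are clamped or dropped, as above.
+ */
+#if 0
+pixman_region16_t region;
+
+pixman_region_init_rect (&region, 0, 0, 10, 10);  /* box (0,0)-(10,10) */
+pixman_region_translate (&region, 5, -3);         /* box (5,-3)-(15,7) */
+#endif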
+
+PIXMAN_EXPORT void
+PREFIX (_reset) (region_type_t *region, box_type_t *box)
+{
+    GOOD (region);
+
+    critical_if_fail (GOOD_RECT (box));
+
+    region->extents = *box;
+
+    FREE_DATA (region);
+
+    region->data = NULL;
+}
+
+/* box is "return" value */
+PIXMAN_EXPORT int
+PREFIX (_contains_point) (region_type_t * region,
+                          int x, int y,
+                          box_type_t * box)
+{
+    box_type_t *pbox, *pbox_end;
+    int numRects;
+
+    GOOD (region);
+    numRects = PIXREGION_NUMRECTS (region);
+
+    if (!numRects || !INBOX (&region->extents, x, y))
+       return(FALSE);
+
+    if (numRects == 1)
+    {
+        if (box)
+           *box = region->extents;
+
+        return(TRUE);
+    }
+
+    pbox = PIXREGION_BOXPTR (region);
+    pbox_end = pbox + numRects;
+
+    pbox = find_box_for_y (pbox, pbox_end, y);
+
+    for (;pbox != pbox_end; pbox++)
+    {
+        if ((y < pbox->y1) || (x < pbox->x1))
+           break;              /* missed it */
+
+        if (x >= pbox->x2)
+           continue;           /* not there yet */
+
+        if (box)
+           *box = *pbox;
+
+        return(TRUE);
+    }
+
+    return(FALSE);
+}
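+
+/* Usage sketch (hypothetical point), assuming an already initialized region;
+ * guarded out of the build:
+ */
+#if 0
+pixman_box16_t hit;
+
+if (pixman_region_contains_point (&region, 7, 3, &hit))
+{
+    /* (7, 3) is inside; `hit' bounds the containing box, so nearby
+     * points can be tested against it without another region walk */
+}
+#endif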
+
+PIXMAN_EXPORT int
+PREFIX (_not_empty) (region_type_t * region)
+{
+    GOOD (region);
+
+    return(!PIXREGION_NIL (region));
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_extents) (region_type_t * region)
+{
+    GOOD (region);
+
+    return(&region->extents);
+}
+
+/*
+ * Verify the internal consistency of a region: non-degenerate boxes laid
+ * out in y-sorted, x-sorted bands whose bounding box matches the stored
+ * extents.
+ *
+ * Returns TRUE if the region is well formed.
+ */
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_selfcheck) (region_type_t *reg)
+{
+    int i, numRects;
+
+    if ((reg->extents.x1 > reg->extents.x2) ||
+        (reg->extents.y1 > reg->extents.y2))
+    {
+       return FALSE;
+    }
+
+    numRects = PIXREGION_NUMRECTS (reg);
+    if (!numRects)
+    {
+       return ((reg->extents.x1 == reg->extents.x2) &&
+               (reg->extents.y1 == reg->extents.y2) &&
+               (reg->data->size || (reg->data == pixman_region_empty_data)));
+    }
+    else if (numRects == 1)
+    {
+       return (!reg->data);
+    }
+    else
+    {
+        box_type_t * pbox_p, * pbox_n;
+        box_type_t box;
+
+        pbox_p = PIXREGION_RECTS (reg);
+        box = *pbox_p;
+        box.y2 = pbox_p[numRects - 1].y2;
+        pbox_n = pbox_p + 1;
+
+        for (i = numRects; --i > 0; pbox_p++, pbox_n++)
+        {
+            if ((pbox_n->x1 >= pbox_n->x2) ||
+                (pbox_n->y1 >= pbox_n->y2))
+           {
+               return FALSE;
+           }
+
+            if (pbox_n->x1 < box.x1)
+               box.x1 = pbox_n->x1;
+           
+            if (pbox_n->x2 > box.x2)
+               box.x2 = pbox_n->x2;
+           
+            if ((pbox_n->y1 < pbox_p->y1) ||
+                ((pbox_n->y1 == pbox_p->y1) &&
+                 ((pbox_n->x1 < pbox_p->x2) || (pbox_n->y2 != pbox_p->y2))))
+           {
+               return FALSE;
+           }
+       }
+
+        return ((box.x1 == reg->extents.x1) &&
+                (box.x2 == reg->extents.x2) &&
+                (box.y1 == reg->extents.y1) &&
+                (box.y2 == reg->extents.y2));
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_init_rects) (region_type_t *region,
+                      const box_type_t *boxes, int count)
+{
+    box_type_t *rects;
+    int displacement;
+    int i;
+
+    /* if it's 1, then we just want to set the extents, so call
+     * the existing method. */
+    if (count == 1)
+    {
+        PREFIX (_init_rect) (region,
+                             boxes[0].x1,
+                             boxes[0].y1,
+                             boxes[0].x2 - boxes[0].x1,
+                             boxes[0].y2 - boxes[0].y1);
+        return TRUE;
+    }
+
+    PREFIX (_init) (region);
+
+    /* if it's 0, don't call pixman_rect_alloc -- 0 rectangles is
+     * a special case, and calling pixman_rect_alloc would cause
+     * us to leak memory (because the 0-rect case must use the
+     * static pixman_region_empty_data).
+     */
+    if (count == 0)
+       return TRUE;
+
+    if (!pixman_rect_alloc (region, count))
+       return FALSE;
+
+    rects = PIXREGION_RECTS (region);
+
+    /* Copy in the rects */
+    memcpy (rects, boxes, sizeof(box_type_t) * count);
+    region->data->numRects = count;
+
+    /* Eliminate empty and malformed rectangles */
+    displacement = 0;
+
+    for (i = 0; i < count; ++i)
+    {
+        box_type_t *box = &rects[i];
+
+        if (box->x1 >= box->x2 || box->y1 >= box->y2)
+           displacement++;
+        else if (displacement)
+           rects[i - displacement] = rects[i];
+    }
+
+    region->data->numRects -= displacement;
+
+    /* If eliminating empty rectangles caused there
+     * to be only 0 or 1 rectangles, deal with that.
+     */
+    if (region->data->numRects == 0)
+    {
+        FREE_DATA (region);
+        PREFIX (_init) (region);
+
+        return TRUE;
+    }
+
+    if (region->data->numRects == 1)
+    {
+        region->extents = rects[0];
+
+        FREE_DATA (region);
+        region->data = NULL;
+
+        GOOD (region);
+
+        return TRUE;
+    }
+
+    /* Validate */
+    region->extents.x1 = region->extents.x2 = 0;
+
+    return validate (region, &i);
+}
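+
+/* Usage sketch (boxes hypothetical): unsorted or overlapping input is fine;
+ * degenerate boxes are dropped and validate () rebuilds the bands.
+ */
+#if 0
+static const pixman_box16_t boxes[] = {
+    {  0,  0, 10, 10 },
+    {  5,  5, 15, 15 },     /* overlaps the first box */
+    {  8,  8,  8, 20 },     /* zero width: eliminated above */
+};
+pixman_region16_t r;
+
+pixman_region_init_rects (&r, boxes, 3);
+#endif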
+
+#define READ(_ptr) (*(_ptr))
+
+static inline box_type_t *
+bitmap_addrect (region_type_t *reg,
+                box_type_t *r,
+                box_type_t **first_rect,
+                int rx1, int ry1,
+                int rx2, int ry2)
+{
+    if ((rx1 < rx2) && (ry1 < ry2) &&
+       (!(reg->data->numRects &&
+          ((r-1)->y1 == ry1) && ((r-1)->y2 == ry2) &&
+          ((r-1)->x1 <= rx1) && ((r-1)->x2 >= rx2))))
+    {
+       if (reg->data->numRects == reg->data->size)
+       {
+           if (!pixman_rect_alloc (reg, 1))
+               return NULL;
+           *first_rect = PIXREGION_BOXPTR(reg);
+           r = *first_rect + reg->data->numRects;
+       }
+       r->x1 = rx1;
+       r->y1 = ry1;
+       r->x2 = rx2;
+       r->y2 = ry2;
+       reg->data->numRects++;
+       if (r->x1 < reg->extents.x1)
+           reg->extents.x1 = r->x1;
+       if (r->x2 > reg->extents.x2)
+           reg->extents.x2 = r->x2;
+       r++;
+    }
+    return r;
+}
+
+/* Convert bitmap clip mask into clipping region.
+ * First, it goes through each line and makes boxes by noting the
+ * transitions from 0 to 1 and from 1 to 0.
+ * Then it coalesces the current line with the previous one if they have
+ * boxes at the same X coordinates.
+ * Stride is in number of uint32_t per line.
+ */
+PIXMAN_EXPORT void
+PREFIX (_init_from_image) (region_type_t *region,
+                           pixman_image_t *image)
+{
+    uint32_t mask0 = 0xffffffff & ~SCREEN_SHIFT_RIGHT(0xffffffff, 1);
+    box_type_t *first_rect, *rects, *prect_line_start;
+    box_type_t *old_rect, *new_rect;
+    uint32_t *pw, w, *pw_line, *pw_line_end;
+    int        irect_prev_start, irect_line_start;
+    int        h, base, rx1 = 0, crects;
+    int        ib;
+    pixman_bool_t in_box, same;
+    int width, height, stride;
+
+    PREFIX(_init) (region);
+
+    critical_if_fail (region->data);
+
+    return_if_fail (image->type == BITS);
+    return_if_fail (image->bits.format == PIXMAN_a1);
+
+    pw_line = pixman_image_get_data (image);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image) / 4;
+
+    first_rect = PIXREGION_BOXPTR(region);
+    rects = first_rect;
+
+    region->extents.x1 = width - 1;
+    region->extents.x2 = 0;
+    irect_prev_start = -1;
+    for (h = 0; h < height; h++)
+    {
+        pw = pw_line;
+        pw_line += stride;
+        irect_line_start = rects - first_rect;
+
+        /* If the screen-leftmost bit of the word is set, we're starting
+         * in a box */
+        if (READ(pw) & mask0)
+        {
+            in_box = TRUE;
+            rx1 = 0;
+        }
+        else
+        {
+            in_box = FALSE;
+        }
+
+        /* Process all words which are fully in the pixmap */
+        pw_line_end = pw + (width >> 5);
+        for (base = 0; pw < pw_line_end; base += 32)
+        {
+            w = READ(pw++);
+            if (in_box)
+            {
+                if (!~w)
+                    continue;
+            }
+            else
+            {
+                if (!w)
+                    continue;
+            }
+            for (ib = 0; ib < 32; ib++)
+            {
+                /* If the screen-leftmost bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect (region, rects, &first_rect,
+                                                rx1, h, base + ib, h + 1);
+                        if (rects == NULL)
+                            goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+
+        if (width & 31)
+        {
+            /* Process final partial word on line */
+             w = READ(pw++);
+            for (ib = 0; ib < (width & 31); ib++)
+            {
+                /* If the screen-leftmost bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect(region, rects, &first_rect,
+                                              rx1, h, base + ib, h + 1);
+                       if (rects == NULL)
+                           goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+        /* If scanline ended with last bit set, end the box */
+        if (in_box)
+        {
+            rects = bitmap_addrect(region, rects, &first_rect,
+                                  rx1, h, base + (width & 31), h + 1);
+           if (rects == NULL)
+               goto error;
+        }
+        /* if all rectangles on this line have the same x-coords as
+         * those on the previous line, then add 1 to all the previous y2s
+         * and throw away all the rectangles from this line
+         */
+        same = FALSE;
+        if (irect_prev_start != -1)
+        {
+            crects = irect_line_start - irect_prev_start;
+            if (crects != 0 &&
+                crects == ((rects - first_rect) - irect_line_start))
+            {
+                old_rect = first_rect + irect_prev_start;
+                new_rect = prect_line_start = first_rect + irect_line_start;
+                same = TRUE;
+                while (old_rect < prect_line_start)
+                {
+                    if ((old_rect->x1 != new_rect->x1) ||
+                        (old_rect->x2 != new_rect->x2))
+                    {
+                          same = FALSE;
+                          break;
+                    }
+                    old_rect++;
+                    new_rect++;
+                }
+                if (same)
+                {
+                    old_rect = first_rect + irect_prev_start;
+                    while (old_rect < prect_line_start)
+                    {
+                        old_rect->y2 += 1;
+                        old_rect++;
+                    }
+                    rects -= crects;
+                    region->data->numRects -= crects;
+                }
+            }
+        }
+        if(!same)
+            irect_prev_start = irect_line_start;
+    }
+    if (!region->data->numRects)
+    {
+        region->extents.x1 = region->extents.x2 = 0;
+    }
+    else
+    {
+        region->extents.y1 = PIXREGION_BOXPTR(region)->y1;
+        region->extents.y2 = PIXREGION_END(region)->y2;
+        if (region->data->numRects == 1)
+        {
+            free (region->data);
+            region->data = NULL;
+        }
+    }
+
+ error:
+    return;
+}
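+
+/* Usage sketch (dimensions and bit pattern hypothetical, function name
+ * made up): build a 1bpp mask and turn it into a clip region; guarded out
+ * of the build:
+ */
+#if 0
+static void
+example_region_from_bitmap (void)
+{
+    static uint32_t bits[2] = { 0x0000000f, 0x0000000f };  /* 2 rows, 32 px */
+    pixman_image_t *mask;
+    pixman_region16_t clip;
+
+    mask = pixman_image_create_bits (PIXMAN_a1, 32, 2, bits, 4);
+    pixman_region_init_from_image (&clip, mask);
+    /* the two identical rows coalesce into boxes spanning both scanlines */
+    pixman_region_fini (&clip);
+    pixman_image_unref (mask);
+}
+#endif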
diff --git a/pixman/pixman-region16.c b/pixman/pixman-region16.c
new file mode 100644 (file)
index 0000000..d88d338
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without
+ * fee, provided that the above copyright notice appear in all copies
+ * and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of
+ * Red Hat, Inc. not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission. Red Hat, Inc. makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+ * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@redhat.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#undef PIXMAN_DISABLE_DEPRECATED
+
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef pixman_box16_t         box_type_t;
+typedef pixman_region16_data_t region_data_type_t;
+typedef pixman_region16_t      region_type_t;
+typedef int32_t                 overflow_int_t;
+
+typedef struct {
+    int x, y;
+} point_type_t;
+
+#define PREFIX(x) pixman_region##x
+
+#define PIXMAN_REGION_MAX INT16_MAX
+#define PIXMAN_REGION_MIN INT16_MIN
+
+#include "pixman-region.c"
+
+/* This function exists only to make it possible to preserve the X ABI -
+ * it should go away at first opportunity.
+ *
+ * The problem is that the X ABI exports the three structs and has used
+ * them through macros. So the X server calls this function with
+ * the addresses of those structs which makes the existing code continue to
+ * work.
+ */
+PIXMAN_EXPORT void
+pixman_region_set_static_pointers (pixman_box16_t *empty_box,
+                                  pixman_region16_data_t *empty_data,
+                                  pixman_region16_data_t *broken_data)
+{
+    pixman_region_empty_box = empty_box;
+    pixman_region_empty_data = empty_data;
+    pixman_broken_data = broken_data;
+}
diff --git a/pixman/pixman-region32.c b/pixman/pixman-region32.c
new file mode 100644 (file)
index 0000000..abd6b1a
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without
+ * fee, provided that the above copyright notice appear in all copies
+ * and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of
+ * Red Hat, Inc. not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission. Red Hat, Inc. makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+ * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@redhat.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef pixman_box32_t         box_type_t;
+typedef pixman_region32_data_t region_data_type_t;
+typedef pixman_region32_t      region_type_t;
+typedef int64_t                 overflow_int_t;
+
+typedef struct {
+    int x, y;
+} point_type_t;
+
+#define PREFIX(x) pixman_region32##x
+
+#define PIXMAN_REGION_MAX INT32_MAX
+#define PIXMAN_REGION_MIN INT32_MIN
+
+#include "pixman-region.c"
diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
new file mode 100644 (file)
index 0000000..852e135
--- /dev/null
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2009 Soren Sandmann
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+void
+_pixman_solid_fill_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
+    if (iter->flags & ITER_NARROW)
+    {
+       uint32_t *b = (uint32_t *)iter->buffer;
+       uint32_t *e = b + iter->width;
+       uint32_t color = iter->image->solid.color_32;
+
+       while (b < e)
+           *(b++) = color;
+    }
+    else
+    {
+       uint64_t *b = (uint64_t *)iter->buffer;
+       uint64_t *e = b + iter->width;
+       uint64_t color = image->solid.color_64;
+
+       while (b < e)
+           *(b++) = color;
+    }
+
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+}
+
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    return
+        (color->alpha >> 8 << 24) |
+        (color->red >> 8 << 16) |
+        (color->green & 0xff00) |
+        (color->blue >> 8);
+}
+
+static uint64_t
+color_to_uint64 (const pixman_color_t *color)
+{
+    return
+        ((uint64_t)color->alpha << 48) |
+        ((uint64_t)color->red << 32) |
+        ((uint64_t)color->green << 16) |
+        ((uint64_t)color->blue);
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_solid_fill (pixman_color_t *color)
+{
+    pixman_image_t *img = _pixman_image_allocate ();
+
+    if (!img)
+       return NULL;
+
+    img->type = SOLID;
+    img->solid.color = *color;
+    img->solid.color_32 = color_to_uint32 (color);
+    img->solid.color_64 = color_to_uint64 (color);
+
+    return img;
+}
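+
+/* Usage sketch (color hypothetical): pixman_color_t channels are 16 bits
+ * wide, so 0xffff is full intensity.
+ */
+#if 0
+pixman_color_t opaque_red = { 0xffff, 0x0000, 0x0000, 0xffff };
+/* fields: red, green, blue, alpha */
+pixman_image_t *fill = pixman_image_create_solid_fill (&opaque_red);
+#endif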
+
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
new file mode 100644 (file)
index 0000000..c419511
--- /dev/null
@@ -0,0 +1,6071 @@
+/*
+ * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Rodrigo Kumpera (kumpera@gmail.com)
+ *          André Tupinambá (andrelrt@gmail.com)
+ *
+ * Based on work by Owen Taylor and Søren Sandmann
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
+
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
+
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
+}
+
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+    __m128i r, g, b, rb, t;
+
+    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+
+    rb = _mm_or_si128 (r, b);
+    t  = _mm_and_si128 (rb, mask_565_fix_rb);
+    t  = _mm_srli_epi32 (t, 5);
+    rb = _mm_or_si128 (rb, t);
+
+    t  = _mm_and_si128 (g, mask_565_fix_g);
+    t  = _mm_srli_epi32 (t, 6);
+    g  = _mm_or_si128 (g, t);
+
+    return _mm_or_si128 (rb, g);
+}
+
+static force_inline void
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
+{
+    __m128i lo, hi;
+
+    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+    lo = unpack_565_to_8888 (lo);
+    hi = unpack_565_to_8888 (hi);
+
+    unpack_128_2x128 (lo, data0, data1);
+    unpack_128_2x128 (hi, data2, data3);
+}
+
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+    return (uint16_t) (((pixel >> 8) & 0xf800) |
+                      ((pixel >> 5) & 0x07e0) |
+                      ((pixel >> 3) & 0x001f));
+}
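+
+/* Worked example: packing opaque white 0x00ffffff keeps the top 5/6/5 bits
+ * of each channel: (0xffff & 0xf800) | (0x7ffff & 0x07e0) |
+ * (0x1fffff & 0x001f) == 0xf800 | 0x07e0 | 0x001f == 0xffff.
+ */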
+
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+    return _mm_packus_epi16 (lo, hi);
+}
+
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i data;
+    __m128i r, g1, g2, b;
+
+    data = pack_2x128_128 (lo, hi);
+
+    r  = _mm_and_si128 (data, mask_565_r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
+
+    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+                            pack_565_2x128_128 (*xmm2, *xmm3));
+}
+
+static force_inline int
+is_opaque (__m128i x)
+{
+    __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (__m128i x)
+{
+    return _mm_movemask_epi8 (
+       _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
+}
+
+static force_inline int
+is_transparent (__m128i x)
+{
+    return (_mm_movemask_epi8 (
+               _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
+}
+
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+                                                    _MM_SHUFFLE (3, 3, 3, 3)),
+                               _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+    lo = _mm_adds_epu16 (lo, mask_0080);
+    hi = _mm_adds_epu16 (hi, mask_0080);
+    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
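+
+/* The adds/mulhi pair above is the exact divide-by-255 rounding trick.
+ * A scalar sketch of what each 16-bit lane computes (helper name
+ * hypothetical); guarded out of the build:
+ */
+#if 0
+static uint8_t
+mul_un8 (uint8_t a, uint8_t b)
+{
+    unsigned t = (unsigned)a * b + 0x80;
+
+    /* (t * 0x101) >> 16 == (t + (t >> 8)) >> 8 == round (a * b / 255.) */
+    return (uint8_t) ((t * 0x101) >> 16);
+}
+#endif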
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
+{
+    __m128i t1_lo, t1_hi;
+    __m128i t2_lo, t2_hi;
+
+    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
+}
+
+static force_inline void
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
+{
+    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
+}
+
+static force_inline void
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline void
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
+{
+    __m128i t1, t2;
+
+    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
+{
+    __m128i lo, hi;
+    __m128i alpha_lo, alpha_hi;
+
+    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+    lo = _mm_or_si128 (alpha_lo, mask_alpha);
+    hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
+
+    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
+{
+    __m128i s_lo, s_hi;
+    __m128i a_lo, a_hi;
+
+    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+/* load 4 pixels from a 16-byte-aligned address */
+static force_inline __m128i
+load_128_aligned (__m128i* src)
+{
+    return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from an unaligned address */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+    return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels with a non-temporal (write-combining) store to a
+ * 16-byte-aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+                          __m128i  data)
+{
+    _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels to a 16-byte-aligned address */
+static force_inline void
+save_128_aligned (__m128i* dst,
+                  __m128i  data)
+{
+    _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels to an unaligned address */
+static force_inline void
+save_128_unaligned (__m128i* dst,
+                    __m128i  data)
+{
+    _mm_storeu_si128 (dst, data);
+}
+
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+       unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+                   __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+                                           mask_0080),
+                           mask_0101);
+}
+
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+                       __m128i* alpha_dst,
+                       __m128i* dst,
+                       __m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+                      pix_multiply_1x128 (*alpha, *mask),
+                      *dst);
+}
+
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+                                          _mm_or_si128 (alpha, mask_alpha)),
+                      alpha,
+                      dst);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint8_t a;
+    __m128i xmms;
+
+    a = src >> 24;
+
+    if (a == 0xff)
+    {
+       return src;
+    }
+    else if (src)
+    {
+       xmms = unpack_32_1x128 (src);
+       return pack_1x128_32 (
+           over_1x128 (xmms, expand_alpha_1x128 (xmms),
+                       unpack_32_1x128 (dst)));
+    }
+
+    return dst;
+}
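+
+/* For reference, the scalar Porter-Duff OVER on premultiplied pixels that
+ * this implements, per channel (mul_un8 is the hypothetical exact byte
+ * multiply sketched near pix_multiply_2x128):
+ *
+ *     dst = src + mul_un8 (dst, 255 - src_alpha)
+ *
+ * The a == 0xff test and the else-if (src) fallthrough shortcut the two
+ * trivial cases: an opaque source replaces dst, and a fully zero source
+ * leaves dst unchanged.
+ */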
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    uint32_t s = *ps;
+
+    if (pm)
+    {
+       __m128i ms, mm;
+
+       mm = unpack_32_1x128 (*pm);
+       mm = expand_alpha_1x128 (mm);
+
+       ms = unpack_32_1x128 (s);
+       ms = pix_multiply_1x128 (ms, mm);
+
+       s = pack_1x128_32 (ms);
+    }
+
+    return s;
+}
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_msk_lo, xmm_msk_hi;
+    __m128i s;
+
+    if (pm)
+    {
+       xmm_msk_lo = load_128_unaligned (pm);
+
+       if (is_transparent (xmm_msk_lo))
+           return _mm_setzero_si128 ();
+    }
+
+    s = load_128_unaligned (ps);
+
+    if (pm)
+    {
+       unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
+
+       expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_msk_lo, &xmm_msk_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+
+       s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+    }
+
+    return s;
+}
+
+static force_inline void
+core_combine_over_u_sse2_mask (uint32_t *        pd,
+                              const uint32_t*    ps,
+                              const uint32_t*    pm,
+                              int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+       d = *pd;
+       s = combine1 (ps, pm);
+
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
+       ps++;
+       pm++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i mask = load_128_unaligned ((__m128i *)pm);
+
+       if (!is_zero (mask))
+       {
+           __m128i src;
+           __m128i src_hi, src_lo;
+           __m128i mask_hi, mask_lo;
+           __m128i alpha_hi, alpha_lo;
+
+           src = load_128_unaligned ((__m128i *)ps);
+
+           if (is_opaque (_mm_and_si128 (src, mask)))
+           {
+               save_128_aligned ((__m128i *)pd, src);
+           }
+           else
+           {
+               __m128i dst = load_128_aligned ((__m128i *)pd);
+               __m128i dst_hi, dst_lo;
+
+               unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+               unpack_128_2x128 (src, &src_lo, &src_hi);
+
+               expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+               pix_multiply_2x128 (&src_lo, &src_hi,
+                                   &mask_lo, &mask_hi,
+                                   &src_lo, &src_hi);
+
+               unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+               expand_alpha_2x128 (src_lo, src_hi,
+                                   &alpha_lo, &alpha_hi);
+
+               over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+                           &dst_lo, &dst_hi);
+
+               save_128_aligned (
+                   (__m128i *)pd,
+                   pack_2x128_128 (dst_lo, dst_hi));
+           }
+       }
+
+       pm += 4;
+       ps += 4;
+       pd += 4;
+       w -= 4;
+    }
+    while (w)
+    {
+       d = *pd;
+       s = combine1 (ps, pm);
+
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
+       ps++;
+       pm++;
+
+       w--;
+    }
+}
+
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t *     pd,
+                                 const uint32_t*    ps,
+                                 int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+       d = *pd;
+       s = *ps;
+
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
+       ps++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i src;
+       __m128i src_hi, src_lo, dst_hi, dst_lo;
+       __m128i alpha_hi, alpha_lo;
+
+       src = load_128_unaligned ((__m128i *)ps);
+
+       if (!is_zero (src))
+       {
+           if (is_opaque (src))
+           {
+               save_128_aligned ((__m128i *)pd, src);
+           }
+           else
+           {
+               __m128i dst = load_128_aligned ((__m128i *)pd);
+
+               unpack_128_2x128 (src, &src_lo, &src_hi);
+               unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+               expand_alpha_2x128 (src_lo, src_hi,
+                                   &alpha_lo, &alpha_hi);
+               over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+                           &dst_lo, &dst_hi);
+
+               save_128_aligned (
+                   (__m128i *)pd,
+                   pack_2x128_128 (dst_lo, dst_hi));
+           }
+       }
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+    }
+    while (w)
+    {
+       d = *pd;
+       s = *ps;
+
+       if (s)
+           *pd = core_combine_over_u_pixel_sse2 (s, d);
+       pd++;
+       ps++;
+
+       w--;
+    }
+}
+
+static force_inline void
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    if (pm)
+       core_combine_over_u_sse2_mask (pd, ps, pm, w);
+    else
+       core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    /* Align dst on a 16-byte boundary */
+    while (w &&
+           ((unsigned long)pd & 15))
+    {
+       d = *pd;
+       s = combine1 (ps, pm);
+
+       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       /* Load unaligned: the source and mask pointers are not
+        * guaranteed to be 16-byte aligned here.
+        */
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+
+       over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                   &xmm_alpha_lo, &xmm_alpha_hi,
+                   &xmm_src_lo, &xmm_src_hi);
+
+       /* rebuild the 4-pixel result and save */
+       save_128_aligned ((__m128i*)pd,
+                         pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+       w -= 4;
+       ps += 4;
+       pd += 4;
+
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       d = *pd;
+       s = combine1 (ps, pm);
+
+       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
+       ps++;
+       w--;
+       if (pm)
+           pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint32_t maska = src >> 24;
+
+    if (maska == 0)
+    {
+       return 0;
+    }
+    else if (maska != 0xff)
+    {
+       return pack_1x128_32 (
+           pix_multiply_1x128 (unpack_32_1x128 (dst),
+                               expand_alpha_1x128 (unpack_32_1x128 (src))));
+    }
+
+    return dst;
+}
+
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_in_u_pixel_sse2 (d, s);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned ((__m128i*)pd,
+                         pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_in_u_pixel_sse2 (d, s);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+}
+
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_in_u_pixel_sse2 (s, d);
+       ps++;
+       w--;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+       xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_in_u_pixel_sse2 (s, d);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+}
+
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    while (w && ((unsigned long) pd & 15))
+    {
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
+       if (pm)
+           pm++;
+       ps++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i xmm_src_lo, xmm_src_hi;
+       __m128i xmm_dst_lo, xmm_dst_hi;
+
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       if (pm)
+           pm += 4;
+
+       w -= 4;
+    }
+
+    while (w)
+    {
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (s)))));
+       ps++;
+       if (pm)
+           pm++;
+       w--;
+    }
+}
+
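+/* OUT: result = src * (1 - alpha (dst)). */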
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    while (w && ((unsigned long) pd & 15))
+    {
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (s), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (d)))));
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       __m128i xmm_src_lo, xmm_src_hi;
+       __m128i xmm_dst_lo, xmm_dst_hi;
+
+       xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       uint32_t s = combine1 (ps, pm);
+       uint32_t d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (s), negate_1x128 (
+                   expand_alpha_1x128 (unpack_32_1x128 (d)))));
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+}
+
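+/* ATOP: result = src * alpha (dst) + dst * (1 - alpha (src)). */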
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+                                uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+                     &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+}
+
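+/* ATOP_REVERSE: result = src * (1 - alpha (dst)) + dst * alpha (src). */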
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+                                        uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+       ps++;
+       w--;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+       ps++;
+       w--;
+       if (pm)
+           pm++;
+    }
+}
+
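+/* XOR: result = src * (1 - alpha (dst)) + dst * (1 - alpha (src)). */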
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+                               uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
+}
+
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+    const uint32_t* pm = mask;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+    while (w && ((unsigned long) pd & 15))
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+       xmm_dst = load_128_aligned ((__m128i*) pd);
+
+       unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+                     &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       w -= 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+}
+
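+/* ADD: result = saturate (src + dst), a per-byte saturating add
+ * (_mm_adds_epu8 handles four pixels at once in the vector loop). */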
+static void
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+    const uint32_t* pm = mask;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       ps++;
+       if (pm)
+           pm++;
+       *pd++ = _mm_cvtsi128_si32 (
+           _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i s;
+
+       s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+       save_128_aligned (
+           (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
+
+       pd += 4;
+       ps += 4;
+       if (pm)
+           pm += 4;
+       w -= 4;
+    }
+
+    while (w--)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       ps++;
+       *pd++ = _mm_cvtsi128_si32 (
+           _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+       if (pm)
+           pm++;
+    }
+}
+
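+/* SATURATE: result = dst + src * min (1, (1 - alpha (dst)) / alpha (src)).
+ * The scalar helper below only rescales the source when its alpha would
+ * overflow the destination; the vector loop uses a movemask test so that
+ * groups of four pixels where no overflow can happen take a plain
+ * saturating add. */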
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+                                    uint32_t dst)
+{
+    __m128i ms = unpack_32_1x128 (src);
+    __m128i md = unpack_32_1x128 (dst);
+    uint32_t sa = src >> 24;
+    uint32_t da = ~dst >> 24;
+
+    if (sa > da)
+    {
+       ms = pix_multiply_1x128 (
+           ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
+    }
+
+    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
+}
+
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
+{
+    uint32_t s, d;
+
+    uint32_t pack_cmp;
+    __m128i xmm_src, xmm_dst;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+       w--;
+       ps++;
+       if (pm)
+           pm++;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst = load_128_aligned ((__m128i*)pd);
+       xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+       pack_cmp = _mm_movemask_epi8 (
+           _mm_cmpgt_epi32 (
+               _mm_srli_epi32 (xmm_src, 24),
+               _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+       /* if any src alpha is greater than the corresponding ~dst alpha */
+       if (pack_cmp)
+       {
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           if (pm)
+               pm++;
+
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           if (pm)
+               pm++;
+
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           if (pm)
+               pm++;
+
+           s = combine1 (ps++, pm);
+           d = *pd;
+           *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+           if (pm)
+               pm++;
+       }
+       else
+       {
+           save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+
+           pd += 4;
+           ps += 4;
+           if (pm)
+               pm += 4;
+       }
+
+       w -= 4;
+    }
+
+    while (w--)
+    {
+       s = combine1 (ps, pm);
+       d = *pd;
+
+       *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+       ps++;
+       if (pm)
+           pm++;
+    }
+}
+
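+/* The component-alpha (_ca) combiners follow.  Here the mask carries a
+ * separate alpha per color channel, so each product with the mask is a
+ * per-channel multiply rather than a broadcast of the alpha byte.
+ *
+ * SRC_ca: result = src * mask. */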
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+       w--;
+    }
+}
+
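+/* OVER_ca: result = src * mask + dst * (1 - alpha (src) * mask). */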
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i expAlpha = expand_alpha_1x128 (s);
+    __m128i unpk_mask = unpack_32_1x128 (mask);
+    __m128i unpk_dst  = unpack_32_1x128 (dst);
+
+    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+}
+
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+
+       in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                      &xmm_alpha_lo, &xmm_alpha_hi,
+                      &xmm_mask_lo, &xmm_mask_hi,
+                      &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+}
+
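+/* OVER_REVERSE_ca: result = dst + src * mask * (1 - alpha (dst)). */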
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i d = unpack_32_1x128 (dst);
+
+    return pack_1x128_32 (
+       over_1x128 (d, expand_alpha_1x128 (d),
+                   pix_multiply_1x128 (unpack_32_1x128 (src),
+                                       unpack_32_1x128 (mask))));
+}
+
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                   &xmm_alpha_lo, &xmm_alpha_hi,
+                   &xmm_mask_lo, &xmm_mask_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+}
+
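+/* IN_ca: result = src * mask * alpha (dst). */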
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+       w--;
+    }
+}
+
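+/* IN_REVERSE_ca: result = dst * mask * alpha (src). */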
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               pix_multiply_1x128 (unpack_32_1x128 (m),
+                                  expand_alpha_1x128 (unpack_32_1x128 (s)))));
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               pix_multiply_1x128 (unpack_32_1x128 (m),
+                                  expand_alpha_1x128 (unpack_32_1x128 (s)))));
+       w--;
+    }
+}
+
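+/* OUT_ca: result = src * mask * (1 - alpha (dst)). */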
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+       negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+                     &xmm_alpha_lo, &xmm_alpha_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (m)),
+               negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+
+       w--;
+    }
+}
+
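+/* OUT_REVERSE_ca: result = dst * (1 - mask * alpha (src)). */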
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               negate_1x128 (pix_multiply_1x128 (
+                                unpack_32_1x128 (m),
+                                expand_alpha_1x128 (unpack_32_1x128 (s))))));
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi);
+
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+                     &xmm_mask_lo, &xmm_mask_hi);
+
+       pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           pix_multiply_1x128 (
+               unpack_32_1x128 (d),
+               negate_1x128 (pix_multiply_1x128 (
+                                unpack_32_1x128 (m),
+                                expand_alpha_1x128 (unpack_32_1x128 (s))))));
+       w--;
+    }
+}
+
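+/* ATOP_ca: result = src * mask * alpha (dst)
+ *                   + dst * (1 - mask * alpha (src)). */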
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = expand_alpha_1x128 (d);
+
+    s = pix_multiply_1x128 (s, m);
+    m = negate_1x128 (pix_multiply_1x128 (m, sa));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
+}
+
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+}
+
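+/* ATOP_REVERSE_ca: result = src * mask * (1 - alpha (dst))
+ *                           + dst * mask * alpha (src). */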
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i sa = expand_alpha_1x128 (s);
+
+    s = pix_multiply_1x128 (s, m);
+    m = pix_multiply_1x128 (m, sa);
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
+}
+
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+}
+
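+/* XOR_ca: result = src * mask * (1 - alpha (dst))
+ *                  + dst * (1 - mask * alpha (src)). */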
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+                                uint32_t mask,
+                                uint32_t dst)
+{
+    __m128i a = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+                                      a, expand_alpha_1x128 (s)));
+    __m128i dest      = pix_multiply_1x128 (s, a);
+    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
+                                                &alpha_dst,
+                                                &dest,
+                                                &alpha_src));
+}
+
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+       expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+       expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+                           &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+       pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi);
+
+       negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+                     &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+       negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+                     &xmm_mask_lo, &xmm_mask_hi);
+
+       pix_add_multiply_2x128 (
+           &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+           &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+           &xmm_dst_lo, &xmm_dst_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+       w--;
+    }
+}
+
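+/* ADD_ca: result = saturate (src * mask + dst). */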
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (unsigned long)pd & 15)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+                                              unpack_32_1x128 (m)),
+                          unpack_32_1x128 (d)));
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+       xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+       xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+       unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+       unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+       unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+       pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                           &xmm_mask_lo, &xmm_mask_hi,
+                           &xmm_src_lo, &xmm_src_hi);
+
+       save_128_aligned (
+           (__m128i*)pd, pack_2x128_128 (
+               _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+               _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+       ps += 4;
+       pd += 4;
+       pm += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       s = *ps++;
+       m = *pm++;
+       d = *pd;
+
+       *pd++ = pack_1x128_32 (
+           _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+                                              unpack_32_1x128 (m)),
+                          unpack_32_1x128 (d)));
+       w--;
+    }
+}
+
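+/* Helpers for the composite fast paths below: broadcast a 16-bit value,
+ * or a pair of 32-bit values, across a whole xmm register. */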
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
+{
+    return _mm_set1_epi16 (mask);
+}
+
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1)                            \
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
+{
+    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
+}
+#endif
+
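+/* The composite fast paths below follow the naming convention
+ * sse2_composite_<op>_<src>_<mask>_<dst>, where "n" is a solid color,
+ * "8888"/"x888" are 32-bpp formats, "0565" is r5g6b5, and a "ca" suffix
+ * means a component-alpha mask.
+ *
+ * OVER with a solid source: a zero source makes OVER a no-op, hence the
+ * early return. */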
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst, d;
+    int32_t w;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+    while (height--)
+    {
+       dst = dst_line;
+
+       dst_line += dst_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           d = *dst;
+           *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+                                               xmm_alpha,
+                                               unpack_32_1x128 (d)));
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           over_2x128 (&xmm_src, &xmm_src,
+                       &xmm_alpha, &xmm_alpha,
+                       &xmm_dst_lo, &xmm_dst_hi);
+
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           w -= 4;
+           dst += 4;
+       }
+
+       while (w)
+       {
+           d = *dst;
+           *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+                                               xmm_alpha,
+                                               unpack_32_1x128 (d)));
+           w--;
+       }
+    }
+}
+
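+/* OVER with a solid source onto an r5g6b5 destination: each group of
+ * eight 16-bit pixels is expanded to 8-bit-per-channel form in four
+ * registers, blended, and packed back to 565. */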
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    int32_t w;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+    while (height--)
+    {
+       dst = dst_line;
+
+       dst_line += dst_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           d = *dst;
+
+           *dst++ = pack_565_32_16 (
+               pack_1x128_32 (over_1x128 (xmm_src,
+                                          xmm_alpha,
+                                          expand565_16_1x128 (d))));
+           w--;
+       }
+
+       while (w >= 8)
+       {
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+           over_2x128 (&xmm_src, &xmm_src,
+                       &xmm_alpha, &xmm_alpha,
+                       &xmm_dst0, &xmm_dst1);
+           over_2x128 (&xmm_src, &xmm_src,
+                       &xmm_alpha, &xmm_alpha,
+                       &xmm_dst2, &xmm_dst3);
+
+           xmm_dst = pack_565_4x128_128 (
+               &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+           save_128_aligned ((__m128i*)dst, xmm_dst);
+
+           dst += 8;
+           w -= 8;
+       }
+
+       while (w--)
+       {
+           d = *dst;
+           *dst++ = pack_565_32_16 (
+               pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+                                          expand565_16_1x128 (d))));
+       }
+    }
+}
+
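+/* ADD with a solid source and a component-alpha mask; blocks of four
+ * mask pixels that are all zero are skipped via a movemask test. */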
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+                                  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src;
+    __m128i xmm_dst;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = _mm_unpacklo_epi8 (
+       create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    mmx_src   = xmm_src;
+
+    while (height--)
+    {
+       int w = width;
+       const uint32_t *pm = (uint32_t *)mask_line;
+       uint32_t *pd = (uint32_t *)dst_line;
+
+       dst_line += dst_stride;
+       mask_line += mask_stride;
+
+       while (w && (unsigned long)pd & 15)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *pd = pack_1x128_32 (
+                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+                                  mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+           pack_cmp =
+               _mm_movemask_epi8 (
+                   _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
+           if (pack_cmp != 0xffff)
+           {
+               xmm_dst = load_128_aligned ((__m128i*)pd);
+
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                   &xmm_mask_lo, &xmm_mask_hi,
+                                   &xmm_mask_lo, &xmm_mask_hi);
+               xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
+
+               save_128_aligned (
+                   (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+           }
+
+           pd += 4;
+           pm += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *pd = pack_1x128_32 (
+                   _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+                                  mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+    }
+}
+
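+/* OVER with a solid source and a component-alpha mask, using the same
+ * zero-mask skip as the ADD path above. */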
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = _mm_unpacklo_epi8 (
+       create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+       int w = width;
+       const uint32_t *pm = (uint32_t *)mask_line;
+       uint32_t *pd = (uint32_t *)dst_line;
+
+       dst_line += dst_stride;
+       mask_line += mask_stride;
+
+       while (w && (unsigned long)pd & 15)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
+                                                 &mmx_alpha,
+                                                 &mmx_mask,
+                                                 &mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+           pack_cmp =
+               _mm_movemask_epi8 (
+                   _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
+           if (pack_cmp != 0xffff)
+           {
+               xmm_dst = load_128_aligned ((__m128i*)pd);
+
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           pd += 4;
+           pm += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *pd = pack_1x128_32 (
+                   in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+    }
+}
+
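+/* OVER of an a8r8g8b8 source with a solid mask: only the alpha byte of
+ * the solid mask is used (mask >> 24), and all-zero source blocks are
+ * skipped since they leave the destination unchanged. */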
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+    xmm_mask = create_mask_16_128 (mask >> 24);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint32_t s = *src++;
+
+           if (s)
+           {
+               uint32_t d = *dst;
+
+               __m128i ms = unpack_32_1x128 (s);
+               __m128i alpha    = expand_alpha_1x128 (ms);
+               __m128i dest     = xmm_mask;
+               __m128i alpha_dst = unpack_32_1x128 (d);
+
+               *dst = pack_1x128_32 (
+                   in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+           }
+           dst++;
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           xmm_src = load_128_unaligned ((__m128i*)src);
+
+           if (!is_zero (xmm_src))
+           {
+               xmm_dst = load_128_aligned ((__m128i*)dst);
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                                   &xmm_alpha_lo, &xmm_alpha_hi);
+
+               in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                              &xmm_alpha_lo, &xmm_alpha_hi,
+                              &xmm_mask, &xmm_mask,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           dst += 4;
+           src += 4;
+           w -= 4;
+       }
+
+       while (w)
+       {
+           uint32_t s = *src++;
+
+           if (s)
+           {
+               uint32_t d = *dst;
+
+               __m128i ms = unpack_32_1x128 (s);
+               __m128i alpha = expand_alpha_1x128 (ms);
+               __m128i mask  = xmm_mask;
+               __m128i dest  = unpack_32_1x128 (d);
+
+               *dst = pack_1x128_32 (
+                   in_over_1x128 (&ms, &alpha, &mask, &dest));
+           }
+
+           dst++;
+           w--;
+       }
+    }
+}
+
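+/* SRC from x8r8g8b8 to a8r8g8b8: a straight copy that forces the alpha
+ * byte to 0xff, 16 pixels per iteration. */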
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           *dst++ = *src++ | 0xff000000;
+           w--;
+       }
+
+       while (w >= 16)
+       {
+           __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+
+           xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
+           xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+           xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+           xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+
+           save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
+           save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+           save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+           save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+
+           dst += 16;
+           src += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           *dst++ = *src++ | 0xff000000;
+           w--;
+       }
+    }
+}
+
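+/* OVER of an x8r8g8b8 source (treated as opaque) with a solid mask. */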
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_mask, xmm_alpha;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+    xmm_mask = create_mask_16_128 (mask >> 24);
+    xmm_alpha = mask_00ff;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint32_t s = (*src++) | 0xff000000;
+           uint32_t d = *dst;
+
+           __m128i src   = unpack_32_1x128 (s);
+           __m128i alpha = xmm_alpha;
+           __m128i mask  = xmm_mask;
+           __m128i dest  = unpack_32_1x128 (d);
+
+           *dst++ = pack_1x128_32 (
+               in_over_1x128 (&src, &alpha, &mask, &dest));
+
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           xmm_src = _mm_or_si128 (
+               load_128_unaligned ((__m128i*)src), mask_ff000000);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                          &xmm_alpha, &xmm_alpha,
+                          &xmm_mask, &xmm_mask,
+                          &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           dst += 4;
+           src += 4;
+           w -= 4;
+
+       }
+
+       while (w)
+       {
+           uint32_t s = (*src++) | 0xff000000;
+           uint32_t d = *dst;
+
+           __m128i src  = unpack_32_1x128 (s);
+           __m128i alpha = xmm_alpha;
+           __m128i mask  = xmm_mask;
+           __m128i dest  = unpack_32_1x128 (d);
+
+           *dst++ = pack_1x128_32 (
+               in_over_1x128 (&src, &alpha, &mask, &dest));
+
+           w--;
+       }
+    }
+
+}
+
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    dst = dst_line;
+    src = src_line;
+
+    while (height--)
+    {
+       sse2_combine_over_u (imp, op, dst, src, NULL, width);
+
+       dst += dst_stride;
+       src += src_stride;
+    }
+}
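+
+/* sse2_combine_over_u applies the premultiplied OVER operator to each
+ * component:
+ *
+ *     dst = src + dst * (255 - src_alpha) / 255
+ *
+ * so an opaque source pixel reduces to a copy and a fully transparent one
+ * leaves dst unchanged, which is what the is_opaque/is_zero shortcuts in
+ * other paths exploit.
+ */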
+
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+    __m128i ms;
+
+    ms = unpack_32_1x128 (src);
+    return pack_565_32_16 (
+       pack_1x128_32 (
+           over_1x128 (
+               ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
+}
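+
+/* The helper above widens the r5g6b5 destination to 8 bits per channel,
+ * blends in a8r8g8b8 space, and repacks.  The repacking keeps the high bits
+ * of each channel, roughly:
+ *
+ *     r5 = r8 >> 3;  g6 = g8 >> 2;  b5 = b8 >> 3;
+ *     pixel16 = (uint16_t) ((r5 << 11) | (g6 << 5) | b5);
+ */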
+
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       src = src_line;
+
+       dst_line += dst_stride;
+       src_line += src_stride;
+       w = width;
+
+       /* Align dst on a 16-byte boundary */
+       while (w && ((unsigned long)dst & 15))
+       {
+           s = *src++;
+           d = *dst;
+
+           *dst++ = composite_over_8888_0565pixel (s, d);
+           w--;
+       }
+
+           /* It's an 8-pixel loop */
+       while (w >= 8)
+       {
+           /* Load the source unaligned: only dst was aligned above,
+            * so src may sit at any address.
+            */
+           xmm_src = load_128_unaligned ((__m128i*) src);
+           xmm_dst = load_128_aligned ((__m128i*) dst);
+
+           /* Unpacking */
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
+
+           /* Preload the next 4 source pixels so the memory read
+            * overlaps with the blend of the current ones.
+            */
+           xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst0, &xmm_dst1);
+
+           /* Unpacking */
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst2, &xmm_dst3);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           dst += 8;
+           src += 8;
+       }
+
+       while (w--)
+       {
+           s = *src++;
+           d = *dst;
+
+           *dst++ = composite_over_8888_0565pixel (s, d);
+       }
+    }
+
+}
+
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m, d;
+
+    __m128i xmm_src, xmm_alpha, xmm_def;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_pixel_8_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+                                                  &mmx_alpha,
+                                                  &mmx_mask,
+                                                  &mmx_dest));
+           }
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 4)
+       {
+           m = *((uint32_t*)mask);
+
+           if (srca == 0xff && m == 0xffffffff)
+           {
+               save_128_aligned ((__m128i*)dst, xmm_def);
+           }
+           else if (m)
+           {
+               xmm_dst = load_128_aligned ((__m128i*) dst);
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           w -= 4;
+           dst += 4;
+           mask += 4;
+       }
+
+       while (w)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_pixel_8_1x128 (m);
+               mmx_dest = unpack_32_1x128 (d);
+
+               *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+                                                  &mmx_alpha,
+                                                  &mmx_mask,
+                                                  &mmx_dest));
+           }
+
+           w--;
+           dst++;
+       }
+    }
+
+}
+
+static pixman_bool_t
+pixman_fill_sse2 (uint32_t *bits,
+                  int       stride,
+                  int       bpp,
+                  int       x,
+                  int       y,
+                  int       width,
+                  int       height,
+                  uint32_t  data)
+{
+    uint32_t byte_width;
+    uint8_t         *byte_line;
+
+    __m128i xmm_def;
+
+    if (bpp == 8)
+    {
+       uint8_t b;
+       uint16_t w;
+
+       stride = stride * (int) sizeof (uint32_t) / 1;
+       byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+       byte_width = width;
+       stride *= 1;
+
+       b = data & 0xff;
+       w = (b << 8) | b;
+       data = (w << 16) | w;
+    }
+    else if (bpp == 16)
+    {
+       stride = stride * (int) sizeof (uint32_t) / 2;
+       byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+       byte_width = 2 * width;
+       stride *= 2;
+
+        data = (data & 0xffff) * 0x00010001;
+    }
+    else if (bpp == 32)
+    {
+       stride = stride * (int) sizeof (uint32_t) / 4;
+       byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+       byte_width = 4 * width;
+       stride *= 4;
+    }
+    else
+    {
+       return FALSE;
+    }
+
+    xmm_def = create_mask_2x32_128 (data, data);
+
+    while (height--)
+    {
+       int w;
+       uint8_t *d = byte_line;
+       byte_line += stride;
+       w = byte_width;
+
+       while (w >= 1 && ((unsigned long)d & 1))
+       {
+           *(uint8_t *)d = data;
+           w -= 1;
+           d += 1;
+       }
+
+       while (w >= 2 && ((unsigned long)d & 3))
+       {
+           *(uint16_t *)d = data;
+           w -= 2;
+           d += 2;
+       }
+
+       while (w >= 4 && ((unsigned long)d & 15))
+       {
+           *(uint32_t *)d = data;
+
+           w -= 4;
+           d += 4;
+       }
+
+       while (w >= 128)
+       {
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+           save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 64),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 80),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 96),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+           d += 128;
+           w -= 128;
+       }
+
+       if (w >= 64)
+       {
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+           save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+           save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+
+           d += 64;
+           w -= 64;
+       }
+
+       if (w >= 32)
+       {
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+           save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+
+           d += 32;
+           w -= 32;
+       }
+
+       if (w >= 16)
+       {
+           save_128_aligned ((__m128i*)(d),     xmm_def);
+
+           d += 16;
+           w -= 16;
+       }
+
+       while (w >= 4)
+       {
+           *(uint32_t *)d = data;
+
+           w -= 4;
+           d += 4;
+       }
+
+       if (w >= 2)
+       {
+           *(uint16_t *)d = data;
+           w -= 2;
+           d += 2;
+       }
+
+       if (w >= 1)
+       {
+           *(uint8_t *)d = data;
+           w -= 1;
+           d += 1;
+       }
+    }
+
+    return TRUE;
+}
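+
+/* pixman_fill_sse2 first replicates the fill value to 32 bits so a single
+ * broadcast covers every supported bpp.  Worked examples:
+ *
+ *     bpp ==  8:  0x000000ab  ->  w = 0xabab           ->  data = 0xabababab
+ *     bpp == 16:  0x00001234  ->  0x1234 * 0x00010001   =  0x12341234
+ *     bpp == 32:  data is used as-is
+ *
+ * create_mask_2x32_128 then broadcasts that value into xmm_def, and the body
+ * stores up to 128 bytes per iteration through aligned 128-bit writes.
+ */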
+
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+
+    __m128i xmm_src, xmm_def;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+       pixman_fill_sse2 (dest_image->bits.bits, dest_image->bits.rowstride,
+                         PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                         dest_x, dest_y, width, height, 0);
+       return;
+    }
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               *dst = pack_1x128_32 (
+                   pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
+           }
+           else
+           {
+               *dst = 0;
+           }
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 4)
+       {
+           m = *((uint32_t*)mask);
+
+           if (srca == 0xff && m == 0xffffffff)
+           {
+               save_128_aligned ((__m128i*)dst, xmm_def);
+           }
+           else if (m)
+           {
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                   &xmm_mask_lo, &xmm_mask_hi,
+                                   &xmm_mask_lo, &xmm_mask_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+           }
+           else
+           {
+               save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+           }
+
+           w -= 4;
+           dst += 4;
+           mask += 4;
+       }
+
+       while (w)
+       {
+           uint8_t m = *mask++;
+
+           if (m)
+           {
+               *dst = pack_1x128_32 (
+                   pix_multiply_1x128 (
+                       xmm_src, expand_pixel_8_1x128 (m)));
+           }
+           else
+           {
+               *dst = 0;
+           }
+
+           w--;
+           dst++;
+       }
+    }
+
+}
+
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+               mmx_dest = expand565_16_1x128 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 8)
+       {
+           xmm_dst = load_128_aligned ((__m128i*) dst);
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+           m = *((uint32_t*)mask);
+           mask += 4;
+
+           if (m)
+           {
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst0, &xmm_dst1);
+           }
+
+           m = *((uint32_t*)mask);
+           mask += 4;
+
+           if (m)
+           {
+               xmm_mask = unpack_32_1x128 (m);
+               xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+               /* Unpacking */
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+                                       &xmm_mask_lo, &xmm_mask_hi);
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst2, &xmm_dst3);
+           }
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           dst += 8;
+       }
+
+       while (w)
+       {
+           m = *mask++;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+               mmx_dest = expand565_16_1x128 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+       }
+    }
+
+}
+
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i ms;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           s = *src++;
+           d = *dst;
+
+           ms = unpack_32_1x128 (s);
+
+           *dst++ = pack_565_32_16 (
+               pack_1x128_32 (
+                   over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+           w--;
+       }
+
+       while (w >= 8)
+       {
+           /* First round */
+           xmm_src = load_128_unaligned ((__m128i*)src);
+           xmm_dst = load_128_aligned  ((__m128i*)dst);
+
+           opaque = is_opaque (xmm_src);
+           zero = is_zero (xmm_src);
+
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+           /* preload next round */
+           xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+           if (opaque)
+           {
+               invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+                                    &xmm_dst0, &xmm_dst1);
+           }
+           else if (!zero)
+           {
+               over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                       &xmm_dst0, &xmm_dst1);
+           }
+
+           /* Second round */
+           opaque = is_opaque (xmm_src);
+           zero = is_zero (xmm_src);
+
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+           if (opaque)
+           {
+               invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+                                    &xmm_dst2, &xmm_dst3);
+           }
+           else if (!zero)
+           {
+               over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                       &xmm_dst2, &xmm_dst3);
+           }
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           src += 8;
+           dst += 8;
+       }
+
+       while (w)
+       {
+           s = *src++;
+           d = *dst;
+
+           ms = unpack_32_1x128 (s);
+
+           *dst++ = pack_565_32_16 (
+               pack_1x128_32 (
+                   over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+           w--;
+       }
+    }
+
+}
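+
+/* "pixbuf" sources hold non-premultiplied pixels with swapped color order,
+ * so the path above uses the _rev_non_pre helpers: invert_colors_2x128 swaps
+ * the red and blue channels, and over_rev_non_pre_2x128 additionally
+ * multiplies each pixel by its own alpha before the OVER blend.  When a
+ * 4-pixel group is fully opaque the multiply is skipped (channel swap only),
+ * and when it is fully transparent the destination is left untouched.
+ */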
+
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           s = *src++;
+           d = *dst;
+
+           *dst++ = pack_1x128_32 (
+               over_rev_non_pre_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+           w--;
+       }
+
+       while (w >= 4)
+       {
+           xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+           opaque = is_opaque (xmm_src_hi);
+           zero = is_zero (xmm_src_hi);
+
+           unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+           if (opaque)
+           {
+               invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+                                    &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+           else if (!zero)
+           {
+               xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
+
+               unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+               over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+                                       &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned (
+                   (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+
+           w -= 4;
+           dst += 4;
+           src += 4;
+       }
+
+       while (w)
+       {
+           s = *src++;
+           d = *dst;
+
+           *dst++ = pack_1x128_32 (
+               over_rev_non_pre_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+           w--;
+       }
+    }
+
+}
+
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int w;
+    uint32_t pack_cmp;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+       w = width;
+       mask = mask_line;
+       dst = dst_line;
+       mask_line += mask_stride;
+       dst_line += dst_stride;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           m = *(uint32_t *) mask;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = expand565_16_1x128 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+           mask++;
+       }
+
+       while (w >= 8)
+       {
+           /* First round */
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           pack_cmp = _mm_movemask_epi8 (
+               _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           unpack_565_128_4x128 (xmm_dst,
+                                 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+           /* preload next round */
+           xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+           if (pack_cmp != 0xffff)
+           {
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst0, &xmm_dst1);
+           }
+
+           /* Second round */
+           pack_cmp = _mm_movemask_epi8 (
+               _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+           if (pack_cmp != 0xffff)
+           {
+               in_over_2x128 (&xmm_src, &xmm_src,
+                              &xmm_alpha, &xmm_alpha,
+                              &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst2, &xmm_dst3);
+           }
+
+           save_128_aligned (
+               (__m128i*)dst, pack_565_4x128_128 (
+                   &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+           w -= 8;
+           dst += 8;
+           mask += 8;
+       }
+
+       while (w)
+       {
+           m = *(uint32_t *) mask;
+
+           if (m)
+           {
+               d = *dst;
+               mmx_mask = unpack_32_1x128 (m);
+               mmx_dest = expand565_16_1x128 (d);
+
+               *dst = pack_565_32_16 (
+                   pack_1x128_32 (
+                       in_over_1x128 (
+                           &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+           }
+
+           w--;
+           dst++;
+           mask++;
+       }
+    }
+
+}
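+
+/* A note on the pack_cmp test above: _mm_cmpeq_epi32 against zero yields
+ * all-ones in every 32-bit lane whose mask pixel was zero, and
+ * _mm_movemask_epi8 collapses those lanes into 16 sign bits.  So
+ * pack_cmp == 0xffff means "all four mask pixels in this half are zero",
+ * and the in_over_2x128 call can be skipped for that half of the block.
+ */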
+
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint32_t d, m;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   pix_multiply_1x128 (xmm_alpha,
+                                      unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+
+       while (w >= 16)
+       {
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+                               &xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_mask_lo, &xmm_mask_hi);
+
+           pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_dst_lo, &xmm_dst_hi,
+                               &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           mask += 16;
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   pix_multiply_1x128 (
+                       xmm_alpha, unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+    }
+
+}
+
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+                      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    uint32_t d;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    src = src >> 24;
+
+    if (src == 0xff)
+       return;
+
+    if (src == 0x00)
+    {
+       pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                    8, dest_x, dest_y, width, height, src);
+
+       return;
+    }
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       w = width;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   xmm_alpha,
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+
+       while (w >= 16)
+       {
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+                               &xmm_dst_lo, &xmm_dst_hi,
+                               &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   xmm_alpha,
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+    }
+
+}
+
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+       w = width;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           s = (uint32_t) *src++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (
+                   unpack_32_1x128 (s), unpack_32_1x128 (d)));
+           w--;
+       }
+
+       while (w >= 16)
+       {
+           xmm_src = load_128_unaligned ((__m128i*)src);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+                               &xmm_dst_lo, &xmm_dst_hi,
+                               &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           src += 16;
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           s = (uint32_t) *src++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
+           w--;
+       }
+    }
+
+}
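+
+/* The three IN paths above multiply components together and renormalize by
+ * 255 (solid alpha x mask x dest, solid alpha x dest, and src x dest,
+ * respectively) in the 16-bit-per-channel layout that unpack_32_1x128
+ * produces.  The division by 255 uses the usual rounding trick (a sketch of
+ * what pix_multiply amounts to for a single channel):
+ *
+ *     uint32_t t = a * b + 0x80;
+ *     uint8_t  r = (uint8_t) ((t + (t >> 8)) >> 8);    rounds a*b/255
+ */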
+
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint32_t m, d;
+
+    __m128i xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               _mm_adds_epu16 (
+                   pix_multiply_1x128 (
+                       xmm_alpha, unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
+           w--;
+       }
+
+       while (w >= 16)
+       {
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+           pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+                               &xmm_mask_lo, &xmm_mask_hi,
+                               &xmm_mask_lo, &xmm_mask_hi);
+
+           xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+           xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+           mask += 16;
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           m = (uint32_t) *mask++;
+           d = (uint32_t) *dst;
+
+           *dst++ = (uint8_t) pack_1x128_32 (
+               _mm_adds_epu16 (
+                   pix_multiply_1x128 (
+                       xmm_alpha, unpack_32_1x128 (m)),
+                   unpack_32_1x128 (d)));
+
+           w--;
+       }
+    }
+
+}
+
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    src >>= 24;
+
+    if (src == 0x00)
+       return;
+
+    if (src == 0xff)
+    {
+       pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                    8, dest_x, dest_y, width, height, 0xff);
+
+       return;
+    }
+
+    src = (src << 24) | (src << 16) | (src << 8) | src;
+    xmm_src = _mm_set_epi32 (src, src, src, src);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       w = width;
+
+       while (w && ((unsigned long)dst & 15))
+       {
+           *dst = (uint8_t)_mm_cvtsi128_si32 (
+               _mm_adds_epu8 (
+                   xmm_src,
+                   _mm_cvtsi32_si128 (*dst)));
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 16)
+       {
+           save_128_aligned (
+               (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
+
+           dst += 16;
+           w -= 16;
+       }
+
+       while (w)
+       {
+           *dst = (uint8_t)_mm_cvtsi128_si32 (
+               _mm_adds_epu8 (
+                   xmm_src,
+                   _mm_cvtsi32_si128 (*dst)));
+
+           w--;
+           dst++;
+       }
+    }
+
+}
+
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       src = src_line;
+
+       dst_line += dst_stride;
+       src_line += src_stride;
+       w = width;
+
+       /* Small head */
+       while (w && (unsigned long)dst & 3)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
+
+       sse2_combine_add_u (imp, op,
+                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+       /* Small tail */
+       dst += w & 0xfffc;
+       src += w & 0xfffc;
+
+       w &= 3;
+
+       while (w)
+       {
+           t = (*dst) + (*src++);
+           *dst++ = t | (0 - (t >> 8));
+           w--;
+       }
+    }
+
+}
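+
+/* The scalar head and tail above saturate with a branch-free trick: t is
+ * the 9-bit sum, t >> 8 is its carry, and (0 - (t >> 8)) is either 0 or
+ * all ones, so t | (0 - (t >> 8)) truncates to 0xff exactly when the add
+ * overflowed.  Example: 200 + 100 = 0x012c; carry 1; 0x012c | ~0 stores
+ * 0xff.
+ */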
+
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       src = src_line;
+       src_line += src_stride;
+
+       sse2_combine_add_u (imp, op, dst, src, NULL, width);
+    }
+
+}
+
+static pixman_bool_t
+pixman_blt_sse2 (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dest_x,
+                 int       dest_y,
+                 int       width,
+                 int       height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+
+    if (src_bpp != dst_bpp)
+       return FALSE;
+
+    if (src_bpp == 16)
+    {
+       src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+       dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+       src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+       dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+       byte_width = 2 * width;
+       src_stride *= 2;
+       dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+       src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+       dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+       src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+       dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+       byte_width = 4 * width;
+       src_stride *= 4;
+       dst_stride *= 4;
+    }
+    else
+    {
+       return FALSE;
+    }
+
+    while (height--)
+    {
+       int w;
+       uint8_t *s = src_bytes;
+       uint8_t *d = dst_bytes;
+       src_bytes += src_stride;
+       dst_bytes += dst_stride;
+       w = byte_width;
+
+       while (w >= 2 && ((unsigned long)d & 3))
+       {
+           *(uint16_t *)d = *(uint16_t *)s;
+           w -= 2;
+           s += 2;
+           d += 2;
+       }
+
+       while (w >= 4 && ((unsigned long)d & 15))
+       {
+           *(uint32_t *)d = *(uint32_t *)s;
+
+           w -= 4;
+           s += 4;
+           d += 4;
+       }
+
+       while (w >= 64)
+       {
+           __m128i xmm0, xmm1, xmm2, xmm3;
+
+           xmm0 = load_128_unaligned ((__m128i*)(s));
+           xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+           xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+           xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+           save_128_aligned ((__m128i*)(d),    xmm0);
+           save_128_aligned ((__m128i*)(d + 16), xmm1);
+           save_128_aligned ((__m128i*)(d + 32), xmm2);
+           save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+           s += 64;
+           d += 64;
+           w -= 64;
+       }
+
+       while (w >= 16)
+       {
+           save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
+
+           w -= 16;
+           d += 16;
+           s += 16;
+       }
+
+       while (w >= 4)
+       {
+           *(uint32_t *)d = *(uint32_t *)s;
+
+           w -= 4;
+           s += 4;
+           d += 4;
+       }
+
+       if (w >= 2)
+       {
+           *(uint16_t *)d = *(uint16_t *)s;
+           w -= 2;
+           s += 2;
+           d += 2;
+       }
+    }
+
+    return TRUE;
+}
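+
+/* pixman_blt_sse2 is a straight rectangular copy.  Strides arrive in
+ * uint32_t units, so each branch rescales them to element units first
+ * (hence the "* sizeof (uint32_t) / 2" for 16 bpp), computes the byte
+ * address of the first pixel, and then converts the stride to bytes.  The
+ * copy itself reuses the aligned-head / 64-byte body / tail shape of
+ * pixman_fill_sse2.
+ */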
+
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+                          pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    pixman_blt_sse2 (src_image->bits.bits,
+                     dest_image->bits.bits,
+                     src_image->bits.rowstride,
+                     dest_image->bits.rowstride,
+                     PIXMAN_FORMAT_BPP (src_image->bits.format),
+                     PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                     src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+    __m128i ms;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (unsigned long)dst & 15)
+        {
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+            ms = unpack_32_1x128 (s);
+
+            if (m != 0xff)
+            {
+               __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+               __m128i md = unpack_32_1x128 (d);
+
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
+            }
+
+            *dst++ = pack_1x128_32 (ms);
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            m = *(uint32_t*) mask;
+            xmm_src = _mm_or_si128 (
+               load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+            if (m == 0xffffffff)
+            {
+                save_128_aligned ((__m128i*)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i*)dst);
+
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (
+                   xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                              &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+                              &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+                   __m128i ma, md, ms;
+
+                    d = *dst;
+
+                   ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+                   md = unpack_32_1x128 (d);
+                   ms = unpack_32_1x128 (s);
+
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
+                }
+
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
+    }
+
+}
+
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (unsigned long)dst & 15)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            m = *(uint32_t *) mask;
+
+           if (m)
+           {
+               xmm_src = load_128_unaligned ((__m128i*)src);
+
+               if (m == 0xffffffff && is_opaque (xmm_src))
+               {
+                   save_128_aligned ((__m128i *)dst, xmm_src);
+               }
+               else
+               {
+                   xmm_dst = load_128_aligned ((__m128i *)dst);
+
+                   xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                   unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                   unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                   unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                   expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+                   expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                   in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                                  &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                   save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+               }
+           }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+    }
+
+}
+
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    __m128i xmm_src;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_dsta_hi, xmm_dsta_lo;
+    int dst_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+       dst = dst_line;
+
+       dst_line += dst_stride;
+       w = width;
+
+       while (w && (unsigned long)dst & 15)
+       {
+           __m128i vd;
+
+           vd = unpack_32_1x128 (*dst);
+
+           *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+                                             xmm_src));
+           w--;
+           dst++;
+       }
+
+       while (w >= 4)
+       {
+           __m128i tmp_lo, tmp_hi;
+
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+           tmp_lo = xmm_src;
+           tmp_hi = xmm_src;
+
+           over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+                       &xmm_dsta_lo, &xmm_dsta_hi,
+                       &tmp_lo, &tmp_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+           w -= 4;
+           dst += 4;
+       }
+
+       while (w)
+       {
+           __m128i vd;
+
+           vd = unpack_32_1x128 (*dst);
+
+           *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+                                             xmm_src));
+           w--;
+           dst++;
+       }
+
+    }
+
+}
+
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint32_t    *mask, *mask_line;
+    uint32_t    m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (unsigned long)dst & 15)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+           xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+           if (!is_transparent (xmm_mask))
+           {
+               xmm_src = load_128_unaligned ((__m128i*)src);
+
+               if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+               {
+                   save_128_aligned ((__m128i *)dst, xmm_src);
+               }
+               else
+               {
+                   xmm_dst = load_128_aligned ((__m128i *)dst);
+
+                   unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                   unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                   unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                   expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+                   expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                   in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                                  &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                   save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+               }
+           }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+           uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+           sa = s >> 24;
+
+           if (m)
+           {
+               if (sa == 0xff && m == 0xff)
+               {
+                   *dst = s;
+               }
+               else
+               {
+                   __m128i ms, md, ma, msa;
+
+                   ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+                   ms = unpack_32_1x128 (s);
+                   md = unpack_32_1x128 (d);
+
+                   msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+                   *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+               }
+           }
+
+           dst++;
+            w--;
+        }
+    }
+}
+
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  max_vx,
+                                             pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (fully_transparent_src)
+       return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((unsigned long)pd & 15))
+    {
+       d = *pd;
+       s = combine1 (ps + (vx >> 16), pm);
+       vx += unit_x;
+
+       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       if (pm)
+           pm++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       __m128i tmp;
+       uint32_t tmp1, tmp2, tmp3, tmp4;
+
+       tmp1 = ps[vx >> 16];
+       vx += unit_x;
+       tmp2 = ps[vx >> 16];
+       vx += unit_x;
+       tmp3 = ps[vx >> 16];
+       vx += unit_x;
+       tmp4 = ps[vx >> 16];
+       vx += unit_x;
+
+       tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+       xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+       if (is_opaque (xmm_src_hi))
+       {
+           save_128_aligned ((__m128i*)pd, xmm_src_hi);
+       }
+       else if (!is_zero (xmm_src_hi))
+       {
+           xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+           unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+           expand_alpha_2x128 (
+               xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+           over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                       &xmm_alpha_lo, &xmm_alpha_hi,
+                       &xmm_dst_lo, &xmm_dst_hi);
+
+           /* rebuild the 4 pixel data and save */
+           save_128_aligned ((__m128i*)pd,
+                             pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       }
+
+       w -= 4;
+       pd += 4;
+       if (pm)
+           pm += 4;
+    }
+
+    while (w)
+    {
+       d = *pd;
+       s = combine1 (ps + (vx >> 16), pm);
+       vx += unit_x;
+
+       *pd++ = core_combine_over_u_pixel_sse2 (s, d);
+       if (pm)
+           pm++;
+
+       w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, PAD)
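+
+/* The three instantiations above differ only in how samples falling
+ * outside the source grid are treated: COVER assumes the scaled source
+ * completely covers the destination, NONE treats out-of-bounds samples
+ * as fully transparent, and PAD clamps them to the nearest edge pixel
+ * of the source. */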
+
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+                                              uint32_t *       dst,
+                                              const uint32_t * src,
+                                              int32_t          w,
+                                              pixman_fixed_t   vx,
+                                              pixman_fixed_t   unit_x,
+                                              pixman_fixed_t   max_vx,
+                                              pixman_bool_t    zero_src)
+{
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (zero_src || (*mask >> 24) == 0)
+       return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && (unsigned long)dst & 15)
+    {
+       uint32_t s = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+
+       if (s)
+       {
+           uint32_t d = *dst;
+
+           __m128i ms = unpack_32_1x128 (s);
+           __m128i alpha     = expand_alpha_1x128 (ms);
+           __m128i dest      = xmm_mask;
+           __m128i alpha_dst = unpack_32_1x128 (d);
+
+           *dst = pack_1x128_32 (
+               in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+       }
+       dst++;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       uint32_t tmp1, tmp2, tmp3, tmp4;
+
+       tmp1 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp2 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp3 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+       tmp4 = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+
+       xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+       if (!is_zero (xmm_src))
+       {
+           xmm_dst = load_128_aligned ((__m128i*)dst);
+
+           unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+           unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+           expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+                               &xmm_alpha_lo, &xmm_alpha_hi);
+
+           in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+                          &xmm_alpha_lo, &xmm_alpha_hi,
+                          &xmm_mask, &xmm_mask,
+                          &xmm_dst_lo, &xmm_dst_hi);
+
+           save_128_aligned (
+               (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+       }
+
+       dst += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       uint32_t s = src[pixman_fixed_to_int (vx)];
+       vx += unit_x;
+
+       if (s)
+       {
+           uint32_t d = *dst;
+
+           __m128i ms = unpack_32_1x128 (s);
+           __m128i alpha = expand_alpha_1x128 (ms);
+           __m128i mask  = xmm_mask;
+           __m128i dest  = unpack_32_1x128 (d);
+
+           *dst = pack_1x128_32 (
+               in_over_1x128 (&ms, &alpha, &mask, &dest));
+       }
+
+       dst++;
+       w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+
+#define BILINEAR_DECLARE_VARIABLES                                             \
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);     \
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);     \
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);           \
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,      \
+                                         unit_x, unit_x, unit_x, unit_x);      \
+    const __m128i xmm_zero = _mm_setzero_si128 ();                             \
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                    \
+do {                                                                           \
+    __m128i xmm_wh, xmm_lo, xmm_hi, a;                                         \
+    /* fetch 2x2 pixel block into sse2 register */                             \
+    uint32_t tl = src_top [pixman_fixed_to_int (vx)];                          \
+    uint32_t tr = src_top [pixman_fixed_to_int (vx) + 1];                      \
+    uint32_t bl = src_bottom [pixman_fixed_to_int (vx)];                       \
+    uint32_t br = src_bottom [pixman_fixed_to_int (vx) + 1];                   \
+    a = _mm_set_epi32 (tr, tl, br, bl);                                                \
+    vx += unit_x;                                                              \
+    /* vertical interpolation */                                               \
+    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),       \
+                                       xmm_wt),                                \
+                      _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),        \
+                                       xmm_wb));                               \
+    /* calculate horizontal weights */                                         \
+    xmm_wh = _mm_add_epi16 (xmm_addc,                                          \
+                           _mm_xor_si128 (xmm_xorc,                            \
+                                          _mm_srli_epi16 (xmm_x, 8)));         \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     \
+    /* horizontal interpolation */                                             \
+    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                      \
+    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                      \
+    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                    \
+                      _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                    \
+    /* shift and pack the result */                                            \
+    a = _mm_srli_epi32 (a, 16);                                                        \
+    a = _mm_packs_epi32 (a, a);                                                        \
+    a = _mm_packus_epi16 (a, a);                                               \
+    pix = _mm_cvtsi128_si32 (a);                                               \
+} while (0)
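+
+/*
+ * Scalar sketch of what BILINEAR_INTERPOLATE_ONE_PIXEL computes per
+ * channel, assuming wt + wb == 256 as arranged by the bilinear main
+ * loop (the horizontal weights below also sum to 256, so the final
+ * >> 16 renormalizes back to 8 bits):
+ *
+ *     f      = (vx >> 8) & 0xff;            top 8 fraction bits
+ *     vert_l = tl * wt + bl * wb;           left column, vertical pass
+ *     vert_r = tr * wt + br * wb;           right column, vertical pass
+ *     pix    = (vert_l * (256 - f) + vert_r * f) >> 16;
+ *
+ * The xor/add constants build exactly these horizontal weights: the
+ * lanes holding the left column compute (f ^ 0xff) + 1 == 256 - f,
+ * while the lanes holding the right column keep f unchanged.
+ */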
+
+#define BILINEAR_SKIP_ONE_PIXEL()                                              \
+do {                                                                           \
+    vx += unit_x;                                                              \
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     \
+} while(0)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+                                            const uint32_t * mask,
+                                            const uint32_t * src_top,
+                                            const uint32_t * src_bottom,
+                                            int32_t          w,
+                                            int              wt,
+                                            int              wb,
+                                            pixman_fixed_t   vx,
+                                            pixman_fixed_t   unit_x,
+                                            pixman_fixed_t   max_vx,
+                                            pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while ((w -= 4) >= 0)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+       *dst++ = pix1;
+       *dst++ = pix2;
+       *dst++ = pix3;
+       *dst++ = pix4;
+    }
+
+    if (w & 2)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       *dst++ = pix1;
+       *dst++ = pix2;
+    }
+
+    if (w & 1)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       *dst = pix1;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+                              scaled_bilinear_scanline_sse2_8888_8888_SRC,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
+                                             const uint32_t * mask,
+                                             const uint32_t * src_top,
+                                             const uint32_t * src_bottom,
+                                             int32_t          w,
+                                             int              wt,
+                                             int              wb,
+                                             pixman_fixed_t   vx,
+                                             pixman_fixed_t   unit_x,
+                                             pixman_fixed_t   max_vx,
+                                             pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+
+    while (w && ((unsigned long)dst & 15))
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+       if (pix1)
+       {
+           pix2 = *dst;
+           *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+       }
+
+       w--;
+       dst++;
+    }
+
+    while (w >= 4)
+    {
+       __m128i xmm_src;
+       __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+       __m128i xmm_alpha_hi, xmm_alpha_lo;
+
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+       xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+       if (!is_zero (xmm_src))
+       {
+           if (is_opaque (xmm_src))
+           {
+               save_128_aligned ((__m128i *)dst, xmm_src);
+           }
+           else
+           {
+               __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+               over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+                           &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+       }
+
+       w -= 4;
+       dst += 4;
+    }
+
+    while (w)
+    {
+       BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+       if (pix1)
+       {
+           pix2 = *dst;
+           *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+       }
+
+       w--;
+       dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8888_OVER,
+                              uint32_t, uint32_t, uint32_t,
+                              NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
+                                               const uint8_t  * mask,
+                                               const uint32_t * src_top,
+                                               const uint32_t * src_bottom,
+                                               int32_t          w,
+                                               int              wt,
+                                               int              wb,
+                                               pixman_fixed_t   vx,
+                                               pixman_fixed_t   unit_x,
+                                               pixman_fixed_t   max_vx,
+                                               pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t m;
+
+    while (w && ((unsigned long)dst & 15))
+    {
+       uint32_t sa;
+
+       m = (uint32_t) *mask++;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           sa = pix1 >> 24;
+
+           if (sa == 0xff && m == 0xff)
+           {
+               *dst = pix1;
+           }
+           else
+           {
+               __m128i ms, md, ma, msa;
+
+               pix2 = *dst;
+               ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+               ms = unpack_32_1x128 (pix1);
+               md = unpack_32_1x128 (pix2);
+
+               msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+               *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w--;
+       dst++;
+    }
+
+    while (w >= 4)
+    {
+       __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+       __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+       __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+       m = *(uint32_t*)mask;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+
+           xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+
+           if (m == 0xffffffff && is_opaque (xmm_src))
+           {
+               save_128_aligned ((__m128i *)dst, xmm_src);
+           }
+           else
+           {
+               xmm_dst = load_128_aligned ((__m128i *)dst);
+
+               xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+               unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+               unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+               expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+               expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+               in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+                              &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+               save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w -= 4;
+       dst += 4;
+       mask += 4;
+    }
+
+    while (w)
+    {
+       uint32_t sa;
+
+       m = (uint32_t) *mask++;
+
+       if (m)
+       {
+           BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+           sa = pix1 >> 24;
+
+           if (sa == 0xff && m == 0xff)
+           {
+               *dst = pix1;
+           }
+           else
+           {
+               __m128i ms, md, ma, msa;
+
+               pix2 = *dst;
+               ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+               ms = unpack_32_1x128 (pix1);
+               md = unpack_32_1x128 (pix2);
+
+               msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+               *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+           }
+       }
+       else
+       {
+           BILINEAR_SKIP_ONE_PIXEL ();
+       }
+
+       w--;
+       dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+                              scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+                              uint32_t, uint8_t, uint32_t,
+                              NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+    /* PIXMAN_OP_OVER */
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_IN */
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dest_x,
+          int                      dest_y,
+          int                      width,
+          int                      height)
+{
+    if (!pixman_blt_sse2 (
+            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+            src_x, src_y, dest_x, dest_y, width, height))
+    {
+       return _pixman_implementation_blt (
+           imp->delegate,
+           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+           src_x, src_y, dest_x, dest_y, width, height);
+    }
+
+    return TRUE;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t xor)
+{
+    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
+    {
+       return _pixman_implementation_fill (
+           imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+    }
+
+    return TRUE;
+}
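+
+/* Both wrappers above follow pixman's delegation pattern: the SSE2
+ * primitive returns FALSE for cases it does not handle (for instance
+ * an unsupported bpp), and the operation is then handed down the
+ * implementation chain via the delegate. */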
+
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    __m128i ff000000 = mask_ff000000;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+       *dst++ = (*src++) | 0xff000000;
+       w--;
+    }
+
+    while (w >= 4)
+    {
+       save_128_aligned (
+           (__m128i *)dst, _mm_or_si128 (
+               load_128_unaligned ((__m128i *)src), ff000000));
+
+       dst += 4;
+       src += 4;
+       w -= 4;
+    }
+
+    while (w)
+    {
+       *dst++ = (*src++) | 0xff000000;
+       w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint16_t *src = (uint16_t *)iter->bits;
+    __m128i ff000000 = mask_ff000000;
+
+    iter->bits += iter->stride;
+
+    while (w && ((unsigned long)dst) & 0x0f)
+    {
+       uint16_t s = *src++;
+
+       *dst++ = CONVERT_0565_TO_8888 (s);
+       w--;
+    }
+
+    while (w >= 8)
+    {
+       __m128i lo, hi, s;
+
+       s = _mm_loadu_si128 ((__m128i *)src);
+
+       lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
+       hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
+
+       save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
+       save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
+
+       dst += 8;
+       src += 8;
+       w -= 8;
+    }
+
+    while (w)
+    {
+       uint16_t s = *src++;
+
+       *dst++ = CONVERT_0565_TO_8888 (s);
+       w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    iter->bits += iter->stride;
+
+    while (w && (((unsigned long)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 16)
+    {
+       xmm0 = _mm_loadu_si128((__m128i *)src);
+
+       xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
+       xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
+       xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+       xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+       xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+       xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+       _mm_store_si128(((__m128i *)(dst +  0)), xmm3);
+       _mm_store_si128(((__m128i *)(dst +  4)), xmm4);
+       _mm_store_si128(((__m128i *)(dst +  8)), xmm5);
+       _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+       dst += 16;
+       src += 16;
+       w -= 16;
+    }
+
+    while (w)
+    {
+       *dst++ = *(src++) << 24;
+       w--;
+    }
+
+    return iter->buffer;
+}
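+
+/*
+ * The double interleave with zero above is a wide "alpha << 24":
+ * _mm_unpacklo_epi8 (0, x) moves each byte into the high half of a
+ * 16-bit lane (a << 8), and _mm_unpacklo_epi16 (0, ...) then moves
+ * that into the high half of a 32-bit lane (a << 24), expanding
+ * sixteen a8 pixels per 128-bit load.
+ */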
+
+typedef struct
+{
+    pixman_format_code_t       format;
+    pixman_iter_get_scanline_t get_scanline;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_x8r8g8b8,         sse2_fetch_x8r8g8b8 },
+    { PIXMAN_r5g6b5,           sse2_fetch_r5g6b5 },
+    { PIXMAN_a8,               sse2_fetch_a8 },
+    { PIXMAN_null }
+};
+
+static void
+sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    int height = iter->height;
+
+#define FLAGS                                                          \
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+
+    if ((iter->flags & ITER_NARROW)                            &&
+       (image->common.flags & FLAGS) == FLAGS                  &&
+       x >= 0 && y >= 0                                        &&
+       x + width <= image->bits.width                          &&
+       y + height <= image->bits.height)
+    {
+       const fetcher_info_t *f;
+
+       for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+       {
+           if (image->common.extended_format_code == f->format)
+           {
+               uint8_t *b = (uint8_t *)image->bits.bits;
+               int s = image->bits.rowstride * 4;
+
+               iter->bits = b + s * iter->y + x * PIXMAN_FORMAT_BPP (f->format) / 8;
+               iter->stride = s;
+
+               iter->get_scanline = f->get_scanline;
+               return;
+           }
+       }
+    }
+
+    imp->delegate->src_iter_init (imp->delegate, iter);
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
+
+    /* SSE2 constants */
+    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
+    mask_0080 = create_mask_16_128 (0x0080);
+    mask_00ff = create_mask_16_128 (0x00ff);
+    mask_0101 = create_mask_16_128 (0x0101);
+    mask_ffff = create_mask_16_128 (0xffff);
+    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+
+    /* Set up function pointers */
+    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+    imp->blt = sse2_blt;
+    imp->fill = sse2_fill;
+
+    imp->src_iter_init = sse2_src_iter_init;
+
+    return imp;
+}
diff --git a/pixman/pixman-timer.c b/pixman/pixman-timer.c
new file mode 100644 (file)
index 0000000..f5ae18e
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#ifdef PIXMAN_TIMERS
+
+static pixman_timer_t *timers;
+
+static void
+dump_timers (void)
+{
+    pixman_timer_t *timer;
+
+    for (timer = timers; timer != NULL; timer = timer->next)
+    {
+       printf ("%s:   total: %llu     n: %llu      avg: %f\n",
+               timer->name,
+               timer->total,
+               timer->n_times,
+               timer->total / (double)timer->n_times);
+    }
+}
+
+void
+pixman_timer_register (pixman_timer_t *timer)
+{
+    static int initialized;
+
+    if (!initialized)
+    {
+       atexit (dump_timers);
+       initialized = 1;
+    }
+
+    timer->next = timers;
+    timers = timer;
+}
+
+#endif
diff --git a/pixman/pixman-trap.c b/pixman/pixman-trap.c
new file mode 100644 (file)
index 0000000..c99f03e
--- /dev/null
@@ -0,0 +1,668 @@
+/*
+ * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+
+/*
+ * Compute the smallest value greater than or equal to y which is on a
+ * grid row.
+ */
+
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_ceil_y (pixman_fixed_t y, int n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+
+    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+       Y_FRAC_FIRST (n);
+    
+    if (f > Y_FRAC_LAST (n))
+    {
+       if (pixman_fixed_to_int (i) == 0x7fff)
+       {
+           f = 0xffff; /* saturate */
+       }
+       else
+       {
+           f = Y_FRAC_FIRST (n);
+           i += pixman_fixed_1;
+       }
+    }
+    return (i | f);
+}
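+
+/*
+ * Worked example, assuming the usual grid definitions from
+ * pixman-private.h: for an a8 mask (n == 8) there are four sample rows
+ * per pixel, at fractional offsets 0x2000, 0x6000, 0xa000 and 0xe000.
+ * A y of pixman_int_to_fixed (3) + 0x3000 is then rounded up to
+ * 3 + 0x6000, and a fraction above 0xe000 wraps to offset 0x2000 on
+ * the next line.
+ */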
+
+/*
+ * Compute the largest value strictly less than y which is on a
+ * grid row.
+ */
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_floor_y (pixman_fixed_t y,
+                       int            n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+
+    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+       Y_FRAC_FIRST (n);
+
+    if (f < Y_FRAC_FIRST (n))
+    {
+       if (pixman_fixed_to_int (i) == 0x8000)
+       {
+           f = 0; /* saturate */
+       }
+       else
+       {
+           f = Y_FRAC_LAST (n);
+           i -= pixman_fixed_1;
+       }
+    }
+    return (i | f);
+}
+
+/*
+ * Step an edge by any amount (including negative values)
+ */
+PIXMAN_EXPORT void
+pixman_edge_step (pixman_edge_t *e,
+                  int            n)
+{
+    pixman_fixed_48_16_t ne;
+
+    e->x += n * e->stepx;
+
+    ne = e->e + n * (pixman_fixed_48_16_t) e->dx;
+
+    if (n >= 0)
+    {
+       if (ne > 0)
+       {
+           int nx = (ne + e->dy - 1) / e->dy;
+           e->e = ne - nx * (pixman_fixed_48_16_t) e->dy;
+           e->x += nx * e->signdx;
+       }
+    }
+    else
+    {
+       if (ne <= -e->dy)
+       {
+           int nx = (-ne) / e->dy;
+           e->e = ne + nx * (pixman_fixed_48_16_t) e->dy;
+           e->x -= nx * e->signdx;
+       }
+    }
+}
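+
+/*
+ * For n >= 0 this is the closed form of running the per-scanline DDA
+ * update n times; the single-step version would be:
+ *
+ *     e->x += e->stepx;        whole-pixel part of the slope
+ *     e->e += e->dx;           accumulate the fractional remainder
+ *     if (e->e > 0)            crossed a pixel boundary
+ *     {
+ *         e->e -= e->dy;
+ *         e->x += e->signdx;
+ *     }
+ */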
+
+/*
+ * A private routine to initialize the multi-step
+ * elements of an edge structure
+ */
+static void
+_pixman_edge_multi_init (pixman_edge_t * e,
+                         int             n,
+                         pixman_fixed_t *stepx_p,
+                         pixman_fixed_t *dx_p)
+{
+    pixman_fixed_t stepx;
+    pixman_fixed_48_16_t ne;
+
+    ne = n * (pixman_fixed_48_16_t) e->dx;
+    stepx = n * e->stepx;
+
+    if (ne > 0)
+    {
+       int nx = ne / e->dy;
+       ne -= nx * e->dy;
+       stepx += nx * e->signdx;
+    }
+
+    *dx_p = ne;
+    *stepx_p = stepx;
+}
+
+/*
+ * Initialize one edge structure given the line endpoints and a
+ * starting y value
+ */
+PIXMAN_EXPORT void
+pixman_edge_init (pixman_edge_t *e,
+                  int            n,
+                  pixman_fixed_t y_start,
+                  pixman_fixed_t x_top,
+                  pixman_fixed_t y_top,
+                  pixman_fixed_t x_bot,
+                  pixman_fixed_t y_bot)
+{
+    pixman_fixed_t dx, dy;
+
+    e->x = x_top;
+    e->e = 0;
+    dx = x_bot - x_top;
+    dy = y_bot - y_top;
+    e->dy = dy;
+    e->dx = 0;
+
+    if (dy)
+    {
+       if (dx >= 0)
+       {
+           e->signdx = 1;
+           e->stepx = dx / dy;
+           e->dx = dx % dy;
+           e->e = -dy;
+       }
+       else
+       {
+           e->signdx = -1;
+           e->stepx = -(-dx / dy);
+           e->dx = -dx % dy;
+           e->e = 0;
+       }
+
+       _pixman_edge_multi_init (e, STEP_Y_SMALL (n),
+                                &e->stepx_small, &e->dx_small);
+
+       _pixman_edge_multi_init (e, STEP_Y_BIG (n),
+                                &e->stepx_big, &e->dx_big);
+    }
+    pixman_edge_step (e, y_start - y_top);
+}
+
+/*
+ * Initialize one edge structure given a line, starting y value
+ * and a pixel offset for the line
+ */
+PIXMAN_EXPORT void
+pixman_line_fixed_edge_init (pixman_edge_t *            e,
+                             int                        n,
+                             pixman_fixed_t             y,
+                             const pixman_line_fixed_t *line,
+                             int                        x_off,
+                             int                        y_off)
+{
+    pixman_fixed_t x_off_fixed = pixman_int_to_fixed (x_off);
+    pixman_fixed_t y_off_fixed = pixman_int_to_fixed (y_off);
+    const pixman_point_fixed_t *top, *bot;
+
+    if (line->p1.y <= line->p2.y)
+    {
+       top = &line->p1;
+       bot = &line->p2;
+    }
+    else
+    {
+       top = &line->p2;
+       bot = &line->p1;
+    }
+    
+    pixman_edge_init (e, n, y,
+                      top->x + x_off_fixed,
+                      top->y + y_off_fixed,
+                      bot->x + x_off_fixed,
+                      bot->y + y_off_fixed);
+}
+
+PIXMAN_EXPORT void
+pixman_add_traps (pixman_image_t * image,
+                  int16_t          x_off,
+                  int16_t          y_off,
+                  int              ntrap,
+                  pixman_trap_t *  traps)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t x_off_fixed;
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    _pixman_image_validate (image);
+    
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    x_off_fixed = pixman_int_to_fixed (x_off);
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    while (ntrap--)
+    {
+       t = traps->top.y + y_off_fixed;
+       if (t < 0)
+           t = 0;
+       t = pixman_sample_ceil_y (t, bpp);
+
+       b = traps->bot.y + y_off_fixed;
+       if (pixman_fixed_to_int (b) >= height)
+           b = pixman_int_to_fixed (height) - 1;
+       b = pixman_sample_floor_y (b, bpp);
+
+       if (b >= t)
+       {
+           /* initialize edge walkers */
+           pixman_edge_init (&l, bpp, t,
+                             traps->top.l + x_off_fixed,
+                             traps->top.y + y_off_fixed,
+                             traps->bot.l + x_off_fixed,
+                             traps->bot.y + y_off_fixed);
+
+           pixman_edge_init (&r, bpp, t,
+                             traps->top.r + x_off_fixed,
+                             traps->top.y + y_off_fixed,
+                             traps->bot.r + x_off_fixed,
+                             traps->bot.y + y_off_fixed);
+
+           pixman_rasterize_edges (image, &l, &r, t, b);
+       }
+
+       traps++;
+    }
+}
+
+#if 0
+static void
+dump_image (pixman_image_t *image,
+            const char *    title)
+{
+    int i, j;
+
+    if (image->type != BITS)
+       printf ("%s is not a regular image\n", title);
+
+    if (image->bits.format != PIXMAN_a8)
+       printf ("%s is not an alpha mask\n", title);
+
+    printf ("\n\n\n%s: \n", title);
+
+    for (i = 0; i < image->bits.height; ++i)
+    {
+       uint8_t *line =
+           (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]);
+
+       for (j = 0; j < image->bits.width; ++j)
+           printf ("%c", line[j] ? '#' : ' ');
+
+       printf ("\n");
+    }
+}
+#endif
+
+PIXMAN_EXPORT void
+pixman_add_trapezoids (pixman_image_t *          image,
+                       int16_t                   x_off,
+                       int                       y_off,
+                       int                       ntraps,
+                       const pixman_trapezoid_t *traps)
+{
+    int i;
+
+#if 0
+    dump_image (image, "before");
+#endif
+
+    for (i = 0; i < ntraps; ++i)
+    {
+       const pixman_trapezoid_t *trap = &(traps[i]);
+
+       if (!pixman_trapezoid_valid (trap))
+           continue;
+
+       pixman_rasterize_trapezoid (image, trap, x_off, y_off);
+    }
+
+#if 0
+    dump_image (image, "after");
+#endif
+}
+
+PIXMAN_EXPORT void
+pixman_rasterize_trapezoid (pixman_image_t *          image,
+                            const pixman_trapezoid_t *trap,
+                            int                       x_off,
+                            int                       y_off)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    return_if_fail (image->type == BITS);
+
+    _pixman_image_validate (image);
+    
+    if (!pixman_trapezoid_valid (trap))
+       return;
+
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    t = trap->top + y_off_fixed;
+    if (t < 0)
+       t = 0;
+    t = pixman_sample_ceil_y (t, bpp);
+
+    b = trap->bottom + y_off_fixed;
+    if (pixman_fixed_to_int (b) >= height)
+       b = pixman_int_to_fixed (height) - 1;
+    b = pixman_sample_floor_y (b, bpp);
+    
+    if (b >= t)
+    {
+       /* initialize edge walkers */
+       pixman_line_fixed_edge_init (&l, bpp, t, &trap->left, x_off, y_off);
+       pixman_line_fixed_edge_init (&r, bpp, t, &trap->right, x_off, y_off);
+
+       pixman_rasterize_edges (image, &l, &r, t, b);
+    }
+}
+
+/*
+ * pixman_composite_trapezoids()
+ *
+ * All the trapezoids are conceptually rendered to an infinitely big image.
+ * The (0, 0) coordinates of this image are then aligned with the (x, y)
+ * coordinates of the source image, and then both images are aligned with
+ * the (x, y) coordinates of the destination. Then, in principle, compositing
+ * of these three images takes place across the entire destination.
+ *
+ * FIXME: However, there is currently a bug, where we restrict this compositing
+ * to the bounding box of the trapezoids. This is incorrect for operators such
+ * as SRC and IN where blank source pixels do have an effect on the destination.
+ */
+PIXMAN_EXPORT void
+pixman_composite_trapezoids (pixman_op_t               op,
+                            pixman_image_t *           src,
+                            pixman_image_t *           dst,
+                            pixman_format_code_t       mask_format,
+                            int                        x_src,
+                            int                        y_src,
+                            int                        x_dst,
+                            int                        y_dst,
+                            int                        n_traps,
+                            const pixman_trapezoid_t * traps)
+{
+    int i;
+
+    if (n_traps <= 0)
+       return;
+
+    _pixman_image_validate (src);
+    _pixman_image_validate (dst);
+
+    if (op == PIXMAN_OP_ADD &&
+       (src->common.flags & FAST_PATH_IS_OPAQUE)               &&
+       (mask_format == dst->common.extended_format_code)       &&
+       !(dst->common.have_clip_region))
+    {
+       for (i = 0; i < n_traps; ++i)
+       {
+           const pixman_trapezoid_t *trap = &(traps[i]);
+           
+           if (!pixman_trapezoid_valid (trap))
+               continue;
+           
+           pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst);
+       }
+    }
+    else
+    {
+       pixman_image_t *tmp;
+       pixman_box32_t box;
+       
+       box.x1 = INT32_MAX;
+       box.y1 = INT32_MAX;
+       box.x2 = INT32_MIN;
+       box.y2 = INT32_MIN;
+       
+       for (i = 0; i < n_traps; ++i)
+       {
+           const pixman_trapezoid_t *trap = &(traps[i]);
+           int y1, y2;
+           
+           if (!pixman_trapezoid_valid (trap))
+               continue;
+           
+           y1 = pixman_fixed_to_int (trap->top);
+           if (y1 < box.y1)
+               box.y1 = y1;
+           
+           y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom));
+           if (y2 > box.y2)
+               box.y2 = y2;
+           
+#define EXTEND_MIN(x)                                                  \
+           if (pixman_fixed_to_int ((x)) < box.x1)                     \
+               box.x1 = pixman_fixed_to_int ((x));
+#define EXTEND_MAX(x)                                                  \
+           if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box.x2) \
+               box.x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x)));
+           
+#define EXTEND(x)                                                      \
+           EXTEND_MIN(x);                                              \
+           EXTEND_MAX(x);
+           
+           EXTEND(trap->left.p1.x);
+           EXTEND(trap->left.p2.x);
+           EXTEND(trap->right.p1.x);
+           EXTEND(trap->right.p2.x);
+       }
+       
+       if (box.x1 >= box.x2 || box.y1 >= box.y2)
+           return;
+       
+       tmp = pixman_image_create_bits (
+           mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1);
+       
+       for (i = 0; i < n_traps; ++i)
+       {
+           const pixman_trapezoid_t *trap = &(traps[i]);
+           
+           if (!pixman_trapezoid_valid (trap))
+               continue;
+           
+           pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1);
+       }
+       
+       pixman_image_composite (op, src, tmp, dst,
+                               x_src + box.x1, y_src + box.y1,
+                               0, 0,
+                               x_dst + box.x1, y_dst + box.y1,
+                               box.x2 - box.x1, box.y2 - box.y1);
+       
+       pixman_image_unref (tmp);
+    }
+}
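A hedged usage sketch of pixman_composite_trapezoids () (illustration only;
the sizes, coordinates and the OVER operator are arbitrary choices):
composite a single solid-white trapezoid onto an x8r8g8b8 destination,
rasterizing through an a8 mask format.

    pixman_color_t      white = { 0xffff, 0xffff, 0xffff, 0xffff };
    pixman_image_t     *src = pixman_image_create_solid_fill (&white);
    pixman_image_t     *dst =
        pixman_image_create_bits (PIXMAN_x8r8g8b8, 100, 100, NULL, 0);
    pixman_trapezoid_t  trap;

    trap.top        = pixman_int_to_fixed (10);
    trap.bottom     = pixman_int_to_fixed (50);
    trap.left.p1.x  = pixman_int_to_fixed (10);
    trap.left.p1.y  = pixman_int_to_fixed (10);
    trap.left.p2.x  = pixman_int_to_fixed (20);
    trap.left.p2.y  = pixman_int_to_fixed (50);
    trap.right.p1.x = pixman_int_to_fixed (80);
    trap.right.p1.y = pixman_int_to_fixed (10);
    trap.right.p2.x = pixman_int_to_fixed (70);
    trap.right.p2.y = pixman_int_to_fixed (50);

    pixman_composite_trapezoids (PIXMAN_OP_OVER, src, dst, PIXMAN_a8,
                                 0, 0, 0, 0, 1, &trap);

    pixman_image_unref (src);
    pixman_image_unref (dst);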
+
+static int
+greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b)
+{
+    if (a->y == b->y)
+       return a->x > b->x;
+    return a->y > b->y;
+}
+
+/*
+ * Note that the definition of this function is a bit odd because
+ * pixman uses the X coordinate convention (y increases downwards),
+ * which flips the sign of the usual cross-product orientation test.
+ */
+static int
+clockwise (const pixman_point_fixed_t *ref,
+          const pixman_point_fixed_t *a,
+          const pixman_point_fixed_t *b)
+{
+    pixman_point_fixed_t       ad, bd;
+
+    ad.x = a->x - ref->x;
+    ad.y = a->y - ref->y;
+    bd.x = b->x - ref->x;
+    bd.y = b->y - ref->y;
+
+    return ((pixman_fixed_32_32_t) bd.y * ad.x -
+           (pixman_fixed_32_32_t) ad.y * bd.x) < 0;
+}
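A small sanity check of the winding test (a sketch, assuming <assert.h>;
the point values are arbitrary).  With y growing downwards, a point
down-and-right of `ref' followed by one down-and-left is the "already
correct" ordering that triangle_to_trapezoids () below relies on:

    pixman_point_fixed_t ref = { 0, 0 };
    pixman_point_fixed_t a = { pixman_int_to_fixed (1), pixman_int_to_fixed (1) };
    pixman_point_fixed_t b = { pixman_int_to_fixed (-1), pixman_int_to_fixed (1) };

    assert (clockwise (&ref, &a, &b) == 0); /* b already left of a: no swap */
    assert (clockwise (&ref, &b, &a) == 1); /* reversed order: swap needed  */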
+
+static void
+triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps)
+{
+    const pixman_point_fixed_t *top, *left, *right, *tmp;
+
+    top = &tri->p1;
+    left = &tri->p2;
+    right = &tri->p3;
+
+    if (greater_y (top, left))
+    {
+       tmp = left;
+       left = top;
+       top = tmp;
+    }
+
+    if (greater_y (top, right))
+    {
+       tmp = right;
+       right = top;
+       top = tmp;
+    }
+
+    if (clockwise (top, right, left))
+    {
+       tmp = right;
+       right = left;
+       left = tmp;
+    }
+    
+    /*
+     * Two cases:
+     *
+     *            +        +
+     *           / \      / \
+     *          /   \    /   \
+     *         /     +  +     \
+     *        /    --    --    \
+     *       /   --        --   \
+     *      / ---            --- \
+     *     +--                  --+
+     */
+
+    traps->top = top->y;
+    traps->left.p1 = *top;
+    traps->left.p2 = *left;
+    traps->right.p1 = *top;
+    traps->right.p2 = *right;
+
+    if (right->y < left->y)
+       traps->bottom = right->y;
+    else
+       traps->bottom = left->y;
+
+    traps++;
+
+    *traps = *(traps - 1);
+    
+    if (right->y < left->y)
+    {
+       traps->top = right->y;
+       traps->bottom = left->y;
+       traps->right.p1 = *right;
+       traps->right.p2 = *left;
+    }
+    else
+    {
+       traps->top = left->y;
+       traps->bottom = right->y;
+       traps->left.p1 = *left;
+       traps->left.p2 = *right;
+    }
+}
+
+static pixman_trapezoid_t *
+convert_triangles (int n_tris, const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *traps;
+    int i;
+
+    if (n_tris <= 0)
+       return NULL;
+    
+    traps = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t));
+    if (!traps)
+       return NULL;
+
+    for (i = 0; i < n_tris; ++i)
+       triangle_to_trapezoids (&(tris[i]), traps + 2 * i);
+
+    return traps;
+}
+
+PIXMAN_EXPORT void
+pixman_composite_triangles (pixman_op_t                        op,
+                           pixman_image_t *            src,
+                           pixman_image_t *            dst,
+                           pixman_format_code_t        mask_format,
+                           int                         x_src,
+                           int                         y_src,
+                           int                         x_dst,
+                           int                         y_dst,
+                           int                         n_tris,
+                           const pixman_triangle_t *   tris)
+{
+    pixman_trapezoid_t *traps;
+
+    if ((traps = convert_triangles (n_tris, tris)))
+    {
+       pixman_composite_trapezoids (op, src, dst, mask_format,
+                                    x_src, y_src, x_dst, y_dst,
+                                    n_tris * 2, traps);
+       
+       free (traps);
+    }
+}
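A matching sketch for the triangle entry point (illustration only; `src'
and `dst' as in the trapezoid example above):

    pixman_triangle_t tri;

    tri.p1.x = pixman_int_to_fixed (50);  tri.p1.y = pixman_int_to_fixed (10);
    tri.p2.x = pixman_int_to_fixed (10);  tri.p2.y = pixman_int_to_fixed (90);
    tri.p3.x = pixman_int_to_fixed (90);  tri.p3.y = pixman_int_to_fixed (90);

    pixman_composite_triangles (PIXMAN_OP_OVER, src, dst, PIXMAN_a8,
                                0, 0, 0, 0, 1, &tri);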
+
+PIXMAN_EXPORT void
+pixman_add_triangles (pixman_image_t          *image,
+                     int32_t                  x_off,
+                     int32_t                  y_off,
+                     int                      n_tris,
+                     const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *traps;
+
+    if ((traps = convert_triangles (n_tris, tris)))
+    {
+       pixman_add_trapezoids (image, x_off, y_off,
+                              n_tris * 2, traps);
+
+       free (traps);
+    }
+}
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
new file mode 100644 (file)
index 0000000..d2af51a
--- /dev/null
@@ -0,0 +1,356 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 1999 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "pixman-private.h"
+
+#define N_CACHED_FAST_PATHS 8
+
+typedef struct
+{
+    struct
+    {
+       pixman_implementation_t *       imp;
+       pixman_fast_path_t              fast_path;
+    } cache [N_CACHED_FAST_PATHS];
+} cache_t;
+
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+
+pixman_bool_t
+_pixman_lookup_composite_function (pixman_implementation_t     *toplevel,
+                                  pixman_op_t                  op,
+                                  pixman_format_code_t         src_format,
+                                  uint32_t                     src_flags,
+                                  pixman_format_code_t         mask_format,
+                                  uint32_t                     mask_flags,
+                                  pixman_format_code_t         dest_format,
+                                  uint32_t                     dest_flags,
+                                  pixman_implementation_t    **out_imp,
+                                  pixman_composite_func_t     *out_func)
+{
+    pixman_implementation_t *imp;
+    cache_t *cache;
+    int i;
+
+    /* Check cache for fast paths */
+    cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache);
+
+    for (i = 0; i < N_CACHED_FAST_PATHS; ++i)
+    {
+       const pixman_fast_path_t *info = &(cache->cache[i].fast_path);
+
+       /* Note that we check for equality here, not whether
+        * the cached fast path matches. This is to prevent
+        * us from selecting an overly general fast path
+        * when a more specific one would work.
+        */
+       if (info->op == op                      &&
+           info->src_format == src_format      &&
+           info->mask_format == mask_format    &&
+           info->dest_format == dest_format    &&
+           info->src_flags == src_flags        &&
+           info->mask_flags == mask_flags      &&
+           info->dest_flags == dest_flags      &&
+           info->func)
+       {
+           *out_imp = cache->cache[i].imp;
+           *out_func = cache->cache[i].fast_path.func;
+
+           goto update_cache;
+       }
+    }
+
+    for (imp = toplevel; imp != NULL; imp = imp->delegate)
+    {
+       const pixman_fast_path_t *info = imp->fast_paths;
+
+       while (info->op != PIXMAN_OP_NONE)
+       {
+           if ((info->op == op || info->op == PIXMAN_OP_any)           &&
+               /* Formats */
+               ((info->src_format == src_format) ||
+                (info->src_format == PIXMAN_any))                      &&
+               ((info->mask_format == mask_format) ||
+                (info->mask_format == PIXMAN_any))                     &&
+               ((info->dest_format == dest_format) ||
+                (info->dest_format == PIXMAN_any))                     &&
+               /* Flags */
+               (info->src_flags & src_flags) == info->src_flags        &&
+               (info->mask_flags & mask_flags) == info->mask_flags     &&
+               (info->dest_flags & dest_flags) == info->dest_flags)
+           {
+               *out_imp = imp;
+               *out_func = info->func;
+
+               /* Set i to the last spot in the cache so that the
+                * move-to-front code below will work
+                */
+               i = N_CACHED_FAST_PATHS - 1;
+
+               goto update_cache;
+           }
+
+           ++info;
+       }
+    }
+    return FALSE;
+
+update_cache:
+    if (i)
+    {
+       while (i--)
+           cache->cache[i + 1] = cache->cache[i];
+
+       cache->cache[0].imp = *out_imp;
+       cache->cache[0].fast_path.op = op;
+       cache->cache[0].fast_path.src_format = src_format;
+       cache->cache[0].fast_path.src_flags = src_flags;
+       cache->cache[0].fast_path.mask_format = mask_format;
+       cache->cache[0].fast_path.mask_flags = mask_flags;
+       cache->cache[0].fast_path.dest_format = dest_format;
+       cache->cache[0].fast_path.dest_flags = dest_flags;
+       cache->cache[0].fast_path.func = *out_func;
+    }
+
+    return TRUE;
+}
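The update_cache path above is a classic move-to-front scheme: the entry
just looked up is written to slot 0 and the entries before it slide down
one place, so recently used fast paths stay cheap to find.  A minimal,
self-contained sketch of the same policy over plain ints (illustration
only):

    #define N_SLOTS 8

    static int slots[N_SLOTS];

    static void
    touch (int value)
    {
        int i;

        for (i = 0; i < N_SLOTS; ++i)
        {
            if (slots[i] == value)
                break;
        }

        if (i == N_SLOTS)       /* miss: evict the last entry */
            i = N_SLOTS - 1;

        while (i--)             /* slide earlier entries down one slot */
            slots[i + 1] = slots[i];

        slots[0] = value;
    }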
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b)
+{
+    return a >= SIZE_MAX / b;
+}
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b)
+{
+    return a >= INT32_MAX / b;
+}
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b)
+{
+    return a > INT32_MAX - b;
+}
+
+void *
+pixman_malloc_ab (unsigned int a,
+                  unsigned int b)
+{
+    if (a >= INT32_MAX / b)
+       return NULL;
+
+    return malloc (a * b);
+}
+
+void *
+pixman_malloc_abc (unsigned int a,
+                   unsigned int b,
+                   unsigned int c)
+{
+    if (a >= INT32_MAX / b)
+       return NULL;
+    else if (a * b >= INT32_MAX / c)
+       return NULL;
+    else
+       return malloc (a * b * c);
+}
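Usage sketch for the overflow-checked allocators (illustration only;
`width' and `height' are assumed caller-supplied).  Note that the overflow
helpers above divide by their second argument, so callers must pass
strictly positive sizes:

    uint32_t *pixels = pixman_malloc_abc (height, width, sizeof (uint32_t));

    if (!pixels)
        return FALSE;   /* size computation overflowed, or out of memory */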
+
+/*
+ * This function expands images from ARGB8 format to ARGB16.  To preserve
+ * precision, it needs to know the original source format.  For example, if the
+ * source was PIXMAN_x1r5g5b5 and the red component contained bits 12345, then
+ * the expanded value is 12345123.  To correctly expand this to 16 bits, it
+ * should be 1234512345123451 and not 1234512312345123.
+ */
+void
+pixman_expand (uint64_t *           dst,
+               const uint32_t *     src,
+               pixman_format_code_t format,
+               int                  width)
+{
+    /*
+     * Determine the sizes of each component and the masks and shifts
+     * required to extract them from the source pixel.
+     */
+    const int a_size = PIXMAN_FORMAT_A (format),
+              r_size = PIXMAN_FORMAT_R (format),
+              g_size = PIXMAN_FORMAT_G (format),
+              b_size = PIXMAN_FORMAT_B (format);
+    const int a_shift = 32 - a_size,
+              r_shift = 24 - r_size,
+              g_shift = 16 - g_size,
+              b_shift =  8 - b_size;
+    const uint8_t a_mask = ~(~0 << a_size),
+                  r_mask = ~(~0 << r_size),
+                  g_mask = ~(~0 << g_size),
+                  b_mask = ~(~0 << b_size);
+    int i;
+
+    /* Start at the end so that we can do the expansion in place
+     * when src == dst
+     */
+    for (i = width - 1; i >= 0; i--)
+    {
+       const uint32_t pixel = src[i];
+       const uint8_t a = (pixel >> a_shift) & a_mask,
+                     r = (pixel >> r_shift) & r_mask,
+                     g = (pixel >> g_shift) & g_mask,
+                     b = (pixel >> b_shift) & b_mask;
+       const uint64_t
+           a16 = a_size ? unorm_to_unorm (a, a_size, 16) : 0xffff,
+           r16 = unorm_to_unorm (r, r_size, 16),
+           g16 = unorm_to_unorm (g, g_size, 16),
+           b16 = unorm_to_unorm (b, b_size, 16);
+
+       dst[i] = a16 << 48 | r16 << 32 | g16 << 16 | b16;
+    }
+}
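The bit replication described above can be sketched as a standalone helper
(illustration only; the real implementation pixman_expand () relies on is
unorm_to_unorm () in pixman-private.h):

    static uint32_t
    replicate_bits (uint32_t val, int from_width, int to_width)
    {
        uint32_t result = 0;
        int bits = 0;

        if (from_width == 0)
            return 0;

        while (bits < to_width)     /* repeat the source bit pattern */
        {
            result = (result << from_width) | val;
            bits += from_width;
        }

        /* trim any excess low-order bits */
        return result >> (bits - to_width);
    }

For example, replicate_bits (0x15 /* 10101 */, 5, 16) yields the 16-bit
pattern 1010110101101011, matching the "1234512345123451" expansion in the
comment above.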
+
+/*
+ * Contracting is easier than expanding.  We just need to truncate the
+ * components.
+ */
+void
+pixman_contract (uint32_t *      dst,
+                 const uint64_t *src,
+                 int             width)
+{
+    int i;
+
+    /* Start at the beginning so that we can do the contraction in
+     * place when src == dst
+     */
+    for (i = 0; i < width; i++)
+    {
+       const uint8_t a = src[i] >> 56,
+                     r = src[i] >> 40,
+                     g = src[i] >> 24,
+                     b = src[i] >> 8;
+
+       dst[i] = a << 24 | r << 16 | g << 8 | b;
+    }
+}
+
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return iter->buffer;
+}
+
+#define N_TMP_BOXES (16)
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+                                    pixman_region32_t *src)
+{
+    int n_boxes, i;
+    pixman_box32_t *boxes32;
+    pixman_box16_t *boxes16;
+    pixman_bool_t retval;
+
+    boxes32 = pixman_region32_rectangles (src, &n_boxes);
+
+    boxes16 = pixman_malloc_ab (n_boxes, sizeof (pixman_box16_t));
+
+    if (!boxes16)
+       return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+       boxes16[i].x1 = boxes32[i].x1;
+       boxes16[i].y1 = boxes32[i].y1;
+       boxes16[i].x2 = boxes32[i].x2;
+       boxes16[i].y2 = boxes32[i].y2;
+    }
+
+    pixman_region_fini (dst);
+    retval = pixman_region_init_rects (dst, boxes16, n_boxes);
+    free (boxes16);
+    return retval;
+}
+
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+                                    pixman_region16_t *src)
+{
+    int n_boxes, i;
+    pixman_box16_t *boxes16;
+    pixman_box32_t *boxes32;
+    pixman_box32_t tmp_boxes[N_TMP_BOXES];
+    pixman_bool_t retval;
+
+    boxes16 = pixman_region_rectangles (src, &n_boxes);
+
+    if (n_boxes > N_TMP_BOXES)
+       boxes32 = pixman_malloc_ab (n_boxes, sizeof (pixman_box32_t));
+    else
+       boxes32 = tmp_boxes;
+
+    if (!boxes32)
+       return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+       boxes32[i].x1 = boxes16[i].x1;
+       boxes32[i].y1 = boxes16[i].y1;
+       boxes32[i].x2 = boxes16[i].x2;
+       boxes32[i].y2 = boxes16[i].y2;
+    }
+
+    pixman_region32_fini (dst);
+    retval = pixman_region32_init_rects (dst, boxes32, n_boxes);
+
+    if (boxes32 != tmp_boxes)
+       free (boxes32);
+
+    return retval;
+}
+
+#ifdef DEBUG
+
+void
+_pixman_log_error (const char *function, const char *message)
+{
+    static int n_messages = 0;
+
+    if (n_messages < 10)
+    {
+       fprintf (stderr,
+                "*** BUG ***\n"
+                "In %s: %s\n"
+                "Set a breakpoint on '_pixman_log_error' to debug\n\n",
+                 function, message);
+
+       n_messages++;
+    }
+}
+
+#endif
diff --git a/pixman/pixman-version.h.in b/pixman/pixman-version.h.in
new file mode 100644 (file)
index 0000000..256b2e6
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Carl D. Worth <cworth@cworth.org>
+ */
+
+#ifndef PIXMAN_VERSION_H__
+#define PIXMAN_VERSION_H__
+
+#ifndef PIXMAN_H__
+#  error pixman-version.h should only be included by pixman.h
+#endif
+
+#define PIXMAN_VERSION_MAJOR @PIXMAN_VERSION_MAJOR@
+#define PIXMAN_VERSION_MINOR @PIXMAN_VERSION_MINOR@
+#define PIXMAN_VERSION_MICRO @PIXMAN_VERSION_MICRO@
+
+#define PIXMAN_VERSION_STRING "@PIXMAN_VERSION_MAJOR@.@PIXMAN_VERSION_MINOR@.@PIXMAN_VERSION_MICRO@"
+
+#define PIXMAN_VERSION_ENCODE(major, minor, micro) (   \
+         ((major) * 10000)                             \
+       + ((minor) *   100)                             \
+       + ((micro) *     1))
+
+#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(  \
+       PIXMAN_VERSION_MAJOR,                   \
+       PIXMAN_VERSION_MINOR,                   \
+       PIXMAN_VERSION_MICRO)
+
+#endif /* PIXMAN_VERSION_H__ */
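A typical compile-time use of the encoded version (a sketch; the version
numbers here are arbitrary illustration values):

    #if PIXMAN_VERSION >= PIXMAN_VERSION_ENCODE (0, 22, 0)
    /* safe to rely on APIs introduced in pixman 0.22 */
    #endif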
diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
new file mode 100644 (file)
index 0000000..6868704
--- /dev/null
@@ -0,0 +1,1647 @@
+/*
+ * Copyright © 2007 Luca Barbato
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Luca Barbato not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Luca Barbato makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Luca Barbato (lu_zero@gentoo.org)
+ *
+ * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
+ */
+
+#include <config.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include <altivec.h>
+
+#define AVV(x...) {x}
+
+static force_inline vector unsigned int
+splat_alpha (vector unsigned int pix)
+{
+    return vec_perm (pix, pix,
+                    (vector unsigned char)AVV (
+                        0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
+                        0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+}
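The permutation above broadcasts byte 0 of each 32-bit pixel (the alpha
byte, for big-endian a8r8g8b8) into all four byte lanes.  A scalar
equivalent for a single pixel (illustration only):

    static uint32_t
    splat_alpha_scalar (uint32_t pix)   /* big-endian a8r8g8b8 */
    {
        uint32_t a = pix >> 24;

        return (a << 24) | (a << 16) | (a << 8) | a;
    }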
+
+static force_inline vector unsigned int
+pix_multiply (vector unsigned int p, vector unsigned int a)
+{
+    vector unsigned short hi, lo, mod;
+
+    /* unpack to short */
+    hi = (vector unsigned short)
+       vec_mergeh ((vector unsigned char)AVV (0),
+                   (vector unsigned char)p);
+
+    mod = (vector unsigned short)
+       vec_mergeh ((vector unsigned char)AVV (0),
+                   (vector unsigned char)a);
+
+    hi = vec_mladd (hi, mod, (vector unsigned short)
+                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
+                         0x0080, 0x0080, 0x0080, 0x0080));
+
+    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+    hi = vec_sr (hi, vec_splat_u16 (8));
+
+    /* unpack to short */
+    lo = (vector unsigned short)
+       vec_mergel ((vector unsigned char)AVV (0),
+                   (vector unsigned char)p);
+    mod = (vector unsigned short)
+       vec_mergel ((vector unsigned char)AVV (0),
+                   (vector unsigned char)a);
+
+    lo = vec_mladd (lo, mod, (vector unsigned short)
+                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
+                         0x0080, 0x0080, 0x0080, 0x0080));
+
+    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+    lo = vec_sr (lo, vec_splat_u16 (8));
+
+    return (vector unsigned int)vec_packsu (hi, lo);
+}
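pix_multiply () computes the exact byte-wise product p * a / 255 with
rounding, using the usual "add 0x80, fold the high byte back in, shift"
trick.  A scalar counterpart for a single channel (a sketch; the
UN8x4_MUL_UN8 macros used in the loop tails below expand to the same
arithmetic):

    static uint8_t
    mul_un8 (uint8_t p, uint8_t a)
    {
        uint16_t t = (uint16_t) p * a + 0x80;

        return (t + (t >> 8)) >> 8;
    }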
+
+static force_inline vector unsigned int
+pix_add (vector unsigned int a, vector unsigned int b)
+{
+    return (vector unsigned int)vec_adds ((vector unsigned char)a,
+                                          (vector unsigned char)b);
+}
+
+static force_inline vector unsigned int
+pix_add_mul (vector unsigned int x,
+             vector unsigned int a,
+             vector unsigned int y,
+             vector unsigned int b)
+{
+    vector unsigned int t1, t2;
+
+    t1 = pix_multiply (x, a);
+    t2 = pix_multiply (y, b);
+
+    return pix_add (t1, t2);
+}
+
+static force_inline vector unsigned int
+negate (vector unsigned int src)
+{
+    return vec_nor (src, src);
+}
+
+/* dest*~srca + src */
+static force_inline vector unsigned int
+over (vector unsigned int src,
+      vector unsigned int srca,
+      vector unsigned int dest)
+{
+    vector unsigned char tmp = (vector unsigned char)
+       pix_multiply (dest, negate (srca));
+
+    tmp = vec_adds ((vector unsigned char)src, tmp);
+    return (vector unsigned int)tmp;
+}
+
+/* in == pix_multiply, so in_over () composites (src IN mask) OVER dest */
+#define in_over(src, srca, mask, dest)                                 \
+    over (pix_multiply (src, mask),                                    \
+          pix_multiply (srca, mask), dest)
+
+
+#define COMPUTE_SHIFT_MASK(source)                                     \
+    source ## _mask = vec_lvsl (0, source);
+
+#define COMPUTE_SHIFT_MASKS(dest, source)                              \
+    dest ## _mask = vec_lvsl (0, dest);                                        \
+    source ## _mask = vec_lvsl (0, source);                            \
+    store_mask = vec_lvsr (0, dest);
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask)                                \
+    mask ## _mask = vec_lvsl (0, mask);                                        \
+    dest ## _mask = vec_lvsl (0, dest);                                        \
+    source ## _mask = vec_lvsl (0, source);                            \
+    store_mask = vec_lvsr (0, dest);
+
+/* Note: the LOAD/STORE macros below expect the caller to declare the
+ * temporary variables (tmp1-tmp4, edges, the per-operand *_mask vectors
+ * and store_mask).  tmp3 and tmp4 must remain untouched between a load
+ * and the matching STORE_VECTOR!
+ */
+
+#define LOAD_VECTORS(dest, source)                       \
+    tmp1 = (typeof(tmp1))vec_ld (0, source);             \
+    tmp2 = (typeof(tmp2))vec_ld (15, source);            \
+    tmp3 = (typeof(tmp3))vec_ld (0, dest);               \
+    v ## source = (typeof(v ## source))                          \
+       vec_perm (tmp1, tmp2, source ## _mask);           \
+    tmp4 = (typeof(tmp4))vec_ld (15, dest);              \
+    v ## dest = (typeof(v ## dest))                      \
+       vec_perm (tmp3, tmp4, dest ## _mask);
+
+#define LOAD_VECTORSC(dest, source, mask)                \
+    tmp1 = (typeof(tmp1))vec_ld (0, source);             \
+    tmp2 = (typeof(tmp2))vec_ld (15, source);            \
+    tmp3 = (typeof(tmp3))vec_ld (0, dest);               \
+    v ## source = (typeof(v ## source))                          \
+       vec_perm (tmp1, tmp2, source ## _mask);           \
+    tmp4 = (typeof(tmp4))vec_ld (15, dest);              \
+    tmp1 = (typeof(tmp1))vec_ld (0, mask);               \
+    v ## dest = (typeof(v ## dest))                      \
+       vec_perm (tmp3, tmp4, dest ## _mask);             \
+    tmp2 = (typeof(tmp2))vec_ld (15, mask);              \
+    v ## mask = (typeof(v ## mask))                      \
+       vec_perm (tmp1, tmp2, mask ## _mask);
+
+#define LOAD_VECTORSM(dest, source, mask)                              \
+    LOAD_VECTORSC (dest, source, mask)                                 \
+    v ## source = pix_multiply (v ## source,                           \
+                                splat_alpha (v ## mask));
+
+#define STORE_VECTOR(dest)                                             \
+    edges = vec_perm (tmp4, tmp3, dest ## _mask);                      \
+    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
+    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
+    vec_st ((vector unsigned int) tmp3, 15, dest);                     \
+    vec_st ((vector unsigned int) tmp1, 0, dest);
+
+static void
+vmx_combine_over_u_no_mask (uint32_t *      dest,
+                            const uint32_t *src,
+                            int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+       LOAD_VECTORS (dest, src);
+
+       vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t ia = ALPHA_8 (~s);
+
+       UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_u_mask (uint32_t *      dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t ia;
+
+       UN8x4_MUL_UN8 (s, m);
+
+       ia = ALPHA_8 (~s);
+
+       UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    if (mask)
+       vmx_combine_over_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_over_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
+                                    const uint32_t *src,
+                                    int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+       LOAD_VECTORS (dest, src);
+
+       vdest = over (vdest, splat_alpha (vdest), vsrc);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t ia = ALPHA_8 (~dest[i]);
+
+       UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_reverse_u_mask (uint32_t *      dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = over (vdest, splat_alpha (vdest), vsrc);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t ia = ALPHA_8 (~dest[i]);
+
+       UN8x4_MUL_UN8 (s, m);
+
+       UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    if (mask)
+       vmx_combine_over_reverse_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_over_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_in_u_no_mask (uint32_t *      dest,
+                          const uint32_t *src,
+                          int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t a = ALPHA_8 (dest[i]);
+
+       UN8x4_MUL_UN8 (s, a);
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_u_mask (uint32_t *      dest,
+                       const uint32_t *src,
+                       const uint32_t *mask,
+                       int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t a = ALPHA_8 (dest[i]);
+
+       UN8x4_MUL_UN8 (s, m);
+       UN8x4_MUL_UN8 (s, a);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    if (mask)
+       vmx_combine_in_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_in_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
+                                  const uint32_t *src,
+                                  int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t d = dest[i];
+       uint32_t a = ALPHA_8 (src[i]);
+
+       UN8x4_MUL_UN8 (d, a);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_in_reverse_u_mask (uint32_t *      dest,
+                               const uint32_t *src,
+                               const uint32_t *mask,
+                               int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t d = dest[i];
+       uint32_t a = src[i];
+
+       UN8x4_MUL_UN8 (a, m);
+       a = ALPHA_8 (a);
+       UN8x4_MUL_UN8 (d, a);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    if (mask)
+       vmx_combine_in_reverse_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_in_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_out_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t a = ALPHA_8 (~dest[i]);
+
+       UN8x4_MUL_UN8 (s, a);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t a = ALPHA_8 (~dest[i]);
+
+       UN8x4_MUL_UN8 (s, m);
+       UN8x4_MUL_UN8 (s, a);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    if (mask)
+       vmx_combine_out_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_out_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
+                                   const uint32_t *src,
+                                   int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t d = dest[i];
+       uint32_t a = ALPHA_8 (~src[i]);
+
+       UN8x4_MUL_UN8 (d, a);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_reverse_u_mask (uint32_t *      dest,
+                                const uint32_t *src,
+                                const uint32_t *mask,
+                                int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t d = dest[i];
+       uint32_t a = src[i];
+
+       UN8x4_MUL_UN8 (a, m);
+       a = ALPHA_8 (~a);
+       UN8x4_MUL_UN8 (d, a);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    if (mask)
+       vmx_combine_out_reverse_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_out_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_atop_u_no_mask (uint32_t *      dest,
+                            const uint32_t *src,
+                            int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+                            vdest, splat_alpha (negate (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t dest_a = ALPHA_8 (d);
+       uint32_t src_ia = ALPHA_8 (~s);
+
+       UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_u_mask (uint32_t *      dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+                            vdest, splat_alpha (negate (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t dest_a = ALPHA_8 (d);
+       uint32_t src_ia;
+
+       UN8x4_MUL_UN8 (s, m);
+
+       src_ia = ALPHA_8 (~s);
+
+       UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    if (mask)
+       vmx_combine_atop_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_atop_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
+                                    const uint32_t *src,
+                                    int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+                            vsrc, splat_alpha (negate (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t src_a = ALPHA_8 (s);
+       uint32_t dest_ia = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+                            vsrc, splat_alpha (negate (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t src_a;
+       uint32_t dest_ia = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8 (s, m);
+
+       src_a = ALPHA_8 (s);
+
+       UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    if (mask)
+       vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_atop_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_xor_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+                            vdest, splat_alpha (negate (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t src_ia = ALPHA_8 (~s);
+       uint32_t dest_ia = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_xor_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+                            vdest, splat_alpha (negate (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t src_ia;
+       uint32_t dest_ia = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8 (s, m);
+
+       src_ia = ALPHA_8 (~s);
+
+       UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    if (mask)
+       vmx_combine_xor_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_xor_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_add_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORS (dest, src);
+
+       vdest = pix_add (vsrc, vdest);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+
+       UN8x4_ADD_UN8x4 (d, s);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, src_mask, mask_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSM (dest, src, mask);
+
+       vdest = pix_add (vsrc, vdest);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t m = ALPHA_8 (mask[i]);
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+
+       UN8x4_MUL_UN8 (s, m);
+       UN8x4_ADD_UN8x4 (d, s);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    if (mask)
+       vmx_combine_add_u_mask (dest, src, mask, width);
+    else
+       vmx_combine_add_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_multiply (vsrc, vmask);
+
+       STORE_VECTOR (dest);
+
+       mask += 4;
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+
+       UN8x4_MUL_UN8x4 (s, a);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
+
+       STORE_VECTOR (dest);
+
+       mask += 4;
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t sa = ALPHA_8 (s);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8 (a, sa);
+       UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+       dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
+
+       STORE_VECTOR (dest);
+
+       mask += 4;
+       src += 4;
+       dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t ida = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t da = ALPHA_8 (dest[i]);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8 (s, da);
+
+       dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t d = dest[i];
+       uint32_t sa = ALPHA_8 (src[i]);
+
+       UN8x4_MUL_UN8 (a, sa);
+       UN8x4_MUL_UN8x4 (d, a);
+
+       dest[i] = d;
+    }
+}
+
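+/* Component-alpha OUT: dest = src * mask * (1 - dest.a) */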
+static void
+vmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_multiply (
+           pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t da = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8 (s, da);
+
+       dest[i] = s;
+    }
+}
+
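+/* Component-alpha OUT_REVERSE: dest = dest * (1 - mask * src.a) */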
+static void
+vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_multiply (
+           vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t sa = ALPHA_8 (s);
+
+       UN8x4_MUL_UN8 (a, sa);
+       UN8x4_MUL_UN8x4 (d, ~a);
+
+       dest[i] = d;
+    }
+}
+
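+/* Component-alpha ATOP:
+ * dest = src * mask * dest.a + dest * (1 - mask * src.a)
+ */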
+static void
+vmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask, vsrca;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vsrca = splat_alpha (vsrc);
+
+       vsrc = pix_multiply (vsrc, vmask);
+       vmask = pix_multiply (vmask, vsrca);
+
+       vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+                            negate (vmask), vdest);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t sa = ALPHA_8 (s);
+       uint32_t da = ALPHA_8 (d);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8 (a, sa);
+       UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+       dest[i] = d;
+    }
+}
+
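+/* Component-alpha ATOP_REVERSE:
+ * dest = src * mask * (1 - dest.a) + dest * (mask * src.a)
+ */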
+static void
+vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_add_mul (vdest,
+                            pix_multiply (vmask, splat_alpha (vsrc)),
+                            pix_multiply (vsrc, vmask),
+                            negate (splat_alpha (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t sa = ALPHA_8 (s);
+       uint32_t da = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8 (a, sa);
+       UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+       dest[i] = d;
+    }
+}
+
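+/* Component-alpha XOR:
+ * dest = src * mask * (1 - dest.a) + dest * (1 - mask * src.a)
+ */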
+static void
+vmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_add_mul (vdest,
+                            negate (pix_multiply (vmask, splat_alpha (vsrc))),
+                            pix_multiply (vsrc, vmask),
+                            negate (splat_alpha (vdest)));
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+       uint32_t sa = ALPHA_8 (s);
+       uint32_t da = ALPHA_8 (~d);
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_MUL_UN8 (a, sa);
+       UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+       dest[i] = d;
+    }
+}
+
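+/* Component-alpha ADD: dest = src * mask + dest (with saturation) */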
+static void
+vmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
+       dest_mask, mask_mask, src_mask, store_mask;
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    for (i = width / 4; i > 0; i--)
+    {
+       LOAD_VECTORSC (dest, src, mask);
+
+       vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
+
+       STORE_VECTOR (dest);
+
+       src += 4;
+       dest += 4;
+       mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+       uint32_t a = mask[i];
+       uint32_t s = src[i];
+       uint32_t d = dest[i];
+
+       UN8x4_MUL_UN8x4 (s, a);
+       UN8x4_ADD_UN8x4 (s, d);
+
+       dest[i] = s;
+    }
+}
+
+static const pixman_fast_path_t vmx_fast_paths[] =
+{
+    {   PIXMAN_OP_NONE },
+};
+
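+/* Build the VMX implementation on top of `fallback'.  Only the function
+ * pointers assigned below are overridden; any other operation is expected
+ * to be delegated to the fallback implementation via pixman's usual
+ * implementation chaining.
+ */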
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+
+    /* Set up function pointers */
+
+    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
+
+    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+
+    return imp;
+}
diff --git a/pixman/pixman.c b/pixman/pixman.c
new file mode 100644 (file)
index 0000000..8fb5356
--- /dev/null
@@ -0,0 +1,1140 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+static pixman_implementation_t *global_implementation;
+
+#ifdef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+static void __attribute__((constructor))
+pixman_constructor (void)
+{
+    global_implementation = _pixman_choose_implementation ();
+}
+#endif
+
+static force_inline pixman_implementation_t *
+get_implementation (void)
+{
+#ifndef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+    if (!global_implementation)
+       global_implementation = _pixman_choose_implementation ();
+#endif
+    return global_implementation;
+}
+
+typedef struct operator_info_t operator_info_t;
+
+struct operator_info_t
+{
+    uint8_t    opaque_info[4];
+};
+
+#define PACK(neither, src, dest, both)                 \
+    {{     (uint8_t)PIXMAN_OP_ ## neither,             \
+           (uint8_t)PIXMAN_OP_ ## src,                 \
+           (uint8_t)PIXMAN_OP_ ## dest,                \
+           (uint8_t)PIXMAN_OP_ ## both         }}
+
+static const operator_info_t operator_table[] =
+{
+    /*    Neither Opaque         Src Opaque             Dst Opaque             Both Opaque */
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (OVER,                  SRC,                   OVER,                  SRC),
+    PACK (OVER_REVERSE,          OVER_REVERSE,          DST,                   DST),
+    PACK (IN,                    IN,                    SRC,                   SRC),
+    PACK (IN_REVERSE,            DST,                   IN_REVERSE,            DST),
+    PACK (OUT,                   OUT,                   CLEAR,                 CLEAR),
+    PACK (OUT_REVERSE,           CLEAR,                 OUT_REVERSE,           CLEAR),
+    PACK (ATOP,                  IN,                    OVER,                  SRC),
+    PACK (ATOP_REVERSE,          OVER_REVERSE,          IN_REVERSE,            DST),
+    PACK (XOR,                   OUT,                   OUT_REVERSE,           CLEAR),
+    PACK (ADD,                   ADD,                   ADD,                   ADD),
+    PACK (SATURATE,              OVER_REVERSE,          DST,                   DST),
+
+    {{ 0 /* 0x0e */ }},
+    {{ 0 /* 0x0f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER),
+    PACK (DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE),
+    PACK (DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN),
+    PACK (DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE),
+    PACK (DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT),
+    PACK (DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE),
+    PACK (DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP),
+    PACK (DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE),
+    PACK (DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR),
+
+    {{ 0 /* 0x1c */ }},
+    {{ 0 /* 0x1d */ }},
+    {{ 0 /* 0x1e */ }},
+    {{ 0 /* 0x1f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER),
+    PACK (CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE),
+    PACK (CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN),
+    PACK (CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE),
+    PACK (CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT),
+    PACK (CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE),
+    PACK (CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP),
+    PACK (CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE),
+    PACK (CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR),
+
+    {{ 0 /* 0x2c */ }},
+    {{ 0 /* 0x2d */ }},
+    {{ 0 /* 0x2e */ }},
+    {{ 0 /* 0x2f */ }},
+
+    PACK (MULTIPLY,              MULTIPLY,              MULTIPLY,              MULTIPLY),
+    PACK (SCREEN,                SCREEN,                SCREEN,                SCREEN),
+    PACK (OVERLAY,               OVERLAY,               OVERLAY,               OVERLAY),
+    PACK (DARKEN,                DARKEN,                DARKEN,                DARKEN),
+    PACK (LIGHTEN,               LIGHTEN,               LIGHTEN,               LIGHTEN),
+    PACK (COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE),
+    PACK (COLOR_BURN,            COLOR_BURN,            COLOR_BURN,            COLOR_BURN),
+    PACK (HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT),
+    PACK (SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT),
+    PACK (DIFFERENCE,            DIFFERENCE,            DIFFERENCE,            DIFFERENCE),
+    PACK (EXCLUSION,             EXCLUSION,             EXCLUSION,             EXCLUSION),
+    PACK (HSL_HUE,               HSL_HUE,               HSL_HUE,               HSL_HUE),
+    PACK (HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION),
+    PACK (HSL_COLOR,             HSL_COLOR,             HSL_COLOR,             HSL_COLOR),
+    PACK (HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY),
+};
+
+/*
+ * Optimize the current operator based on the opacity of the source and/or
+ * destination.  The returned operator is mathematically equivalent to the
+ * original one.
+ */
+static pixman_op_t
+optimize_operator (pixman_op_t     op,
+                  uint32_t        src_flags,
+                  uint32_t        mask_flags,
+                  uint32_t        dst_flags)
+{
+    pixman_bool_t is_source_opaque, is_dest_opaque;
+
+#define OPAQUE_SHIFT 13
+
+    COMPILE_TIME_ASSERT (FAST_PATH_IS_OPAQUE == (1 << OPAQUE_SHIFT));
+
+    is_dest_opaque = (dst_flags & FAST_PATH_IS_OPAQUE);
+    is_source_opaque = ((src_flags & mask_flags) & FAST_PATH_IS_OPAQUE);
+
+    is_dest_opaque >>= OPAQUE_SHIFT - 1;
+    is_source_opaque >>= OPAQUE_SHIFT;
+
+    return operator_table[op].opaque_info[is_dest_opaque | is_source_opaque];
+}
+
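+/* Worked example: with an opaque source (and mask) but a translucent
+ * destination, is_source_opaque is 1 and is_dest_opaque is 0, so the
+ * lookup reads opaque_info[1].  For PIXMAN_OP_OVER that table row is
+ * PACK (OVER, SRC, OVER, SRC), hence
+ *
+ *     optimize_operator (PIXMAN_OP_OVER,
+ *                        FAST_PATH_IS_OPAQUE, FAST_PATH_IS_OPAQUE, 0)
+ *
+ * returns PIXMAN_OP_SRC: compositing an opaque source with OVER
+ * degenerates to a plain copy.
+ */
+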
+/*
+ * Computing composite region
+ */
+static inline pixman_bool_t
+clip_general_image (pixman_region32_t * region,
+                    pixman_region32_t * clip,
+                    int                 dx,
+                    int                 dy)
+{
+    if (pixman_region32_n_rects (region) == 1 &&
+        pixman_region32_n_rects (clip) == 1)
+    {
+       pixman_box32_t *  rbox = pixman_region32_rectangles (region, NULL);
+       pixman_box32_t *  cbox = pixman_region32_rectangles (clip, NULL);
+       int v;
+
+       if (rbox->x1 < (v = cbox->x1 + dx))
+           rbox->x1 = v;
+       if (rbox->x2 > (v = cbox->x2 + dx))
+           rbox->x2 = v;
+       if (rbox->y1 < (v = cbox->y1 + dy))
+           rbox->y1 = v;
+       if (rbox->y2 > (v = cbox->y2 + dy))
+           rbox->y2 = v;
+       if (rbox->x1 >= rbox->x2 || rbox->y1 >= rbox->y2)
+       {
+           pixman_region32_init (region);
+           return FALSE;
+       }
+    }
+    else if (!pixman_region32_not_empty (clip))
+    {
+       return FALSE;
+    }
+    else
+    {
+       if (dx || dy)
+           pixman_region32_translate (region, -dx, -dy);
+
+       if (!pixman_region32_intersect (region, region, clip))
+           return FALSE;
+
+       if (dx || dy)
+           pixman_region32_translate (region, dx, dy);
+    }
+
+    return pixman_region32_not_empty (region);
+}
+
+static inline pixman_bool_t
+clip_source_image (pixman_region32_t * region,
+                   pixman_image_t *    image,
+                   int                 dx,
+                   int                 dy)
+{
+    /* Source clips are ignored, unless they are explicitly turned on
+     * and the clip in question was set by an X client. (Because if
+     * the clip was not set by a client, then it is a hierarchy
+     * clip and those should always be ignored for sources).
+     */
+    if (!image->common.clip_sources || !image->common.client_clip)
+       return TRUE;
+
+    return clip_general_image (region,
+                               &image->common.clip_region,
+                               dx, dy);
+}
+
+/*
+ * Returns FALSE if the final region is empty.  This is indistinguishable
+ * from an allocation failure, but rendering ignores those anyway.
+ */
+static pixman_bool_t
+pixman_compute_composite_region32 (pixman_region32_t * region,
+                                   pixman_image_t *    src_image,
+                                   pixman_image_t *    mask_image,
+                                   pixman_image_t *    dest_image,
+                                   int32_t             src_x,
+                                   int32_t             src_y,
+                                   int32_t             mask_x,
+                                   int32_t             mask_y,
+                                   int32_t             dest_x,
+                                   int32_t             dest_y,
+                                   int32_t             width,
+                                   int32_t             height)
+{
+    region->extents.x1 = dest_x;
+    region->extents.x2 = dest_x + width;
+    region->extents.y1 = dest_y;
+    region->extents.y2 = dest_y + height;
+
+    region->extents.x1 = MAX (region->extents.x1, 0);
+    region->extents.y1 = MAX (region->extents.y1, 0);
+    region->extents.x2 = MIN (region->extents.x2, dest_image->bits.width);
+    region->extents.y2 = MIN (region->extents.y2, dest_image->bits.height);
+
+    region->data = 0;
+
+    /* Check for empty operation */
+    if (region->extents.x1 >= region->extents.x2 ||
+        region->extents.y1 >= region->extents.y2)
+    {
+       region->extents.x1 = 0;
+       region->extents.x2 = 0;
+       region->extents.y1 = 0;
+       region->extents.y2 = 0;
+       return FALSE;
+    }
+
+    if (dest_image->common.have_clip_region)
+    {
+       if (!clip_general_image (region, &dest_image->common.clip_region, 0, 0))
+           return FALSE;
+    }
+
+    if (dest_image->common.alpha_map)
+    {
+       if (!pixman_region32_intersect_rect (region, region,
+                                            dest_image->common.alpha_origin_x,
+                                            dest_image->common.alpha_origin_y,
+                                            dest_image->common.alpha_map->width,
+                                            dest_image->common.alpha_map->height))
+       {
+           return FALSE;
+       }
+       if (!pixman_region32_not_empty (region))
+           return FALSE;
+       if (dest_image->common.alpha_map->common.have_clip_region)
+       {
+           if (!clip_general_image (region, &dest_image->common.alpha_map->common.clip_region,
+                                    -dest_image->common.alpha_origin_x,
+                                    -dest_image->common.alpha_origin_y))
+           {
+               return FALSE;
+           }
+       }
+    }
+
+    /* clip against src */
+    if (src_image->common.have_clip_region)
+    {
+       if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y))
+           return FALSE;
+    }
+    if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region)
+    {
+       if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map,
+                               dest_x - (src_x - src_image->common.alpha_origin_x),
+                               dest_y - (src_y - src_image->common.alpha_origin_y)))
+       {
+           return FALSE;
+       }
+    }
+    /* clip against mask */
+    if (mask_image && mask_image->common.have_clip_region)
+    {
+       if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y - mask_y))
+           return FALSE;
+
+       if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region)
+       {
+           if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map,
+                                   dest_x - (mask_x - mask_image->common.alpha_origin_x),
+                                   dest_y - (mask_y - mask_image->common.alpha_origin_y)))
+           {
+               return FALSE;
+           }
+       }
+    }
+
+    return TRUE;
+}
+
+typedef struct
+{
+    pixman_fixed_48_16_t       x1;
+    pixman_fixed_48_16_t       y1;
+    pixman_fixed_48_16_t       x2;
+    pixman_fixed_48_16_t       y2;
+} box_48_16_t;
+
+static pixman_bool_t
+compute_transformed_extents (pixman_transform_t *transform,
+                            const pixman_box32_t *extents,
+                            box_48_16_t *transformed)
+{
+    pixman_fixed_48_16_t tx1, ty1, tx2, ty2;
+    pixman_fixed_t x1, y1, x2, y2;
+    int i;
+
+    x1 = pixman_int_to_fixed (extents->x1) + pixman_fixed_1 / 2;
+    y1 = pixman_int_to_fixed (extents->y1) + pixman_fixed_1 / 2;
+    x2 = pixman_int_to_fixed (extents->x2) - pixman_fixed_1 / 2;
+    y2 = pixman_int_to_fixed (extents->y2) - pixman_fixed_1 / 2;
+
+    if (!transform)
+    {
+       transformed->x1 = x1;
+       transformed->y1 = y1;
+       transformed->x2 = x2;
+       transformed->y2 = y2;
+
+       return TRUE;
+    }
+
+    tx1 = ty1 = INT64_MAX;
+    tx2 = ty2 = INT64_MIN;
+
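+    /* Transform all four corners of the box ((i & 0x01) selects x1 vs x2,
+     * (i & 0x02) selects y1 vs y2) and accumulate the bounding box of the
+     * results in (tx1, ty1) - (tx2, ty2).
+     */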
+    for (i = 0; i < 4; ++i)
+    {
+       pixman_fixed_48_16_t tx, ty;
+       pixman_vector_t v;
+
+       v.vector[0] = (i & 0x01)? x1 : x2;
+       v.vector[1] = (i & 0x02)? y1 : y2;
+       v.vector[2] = pixman_fixed_1;
+
+       if (!pixman_transform_point (transform, &v))
+           return FALSE;
+
+       tx = (pixman_fixed_48_16_t)v.vector[0];
+       ty = (pixman_fixed_48_16_t)v.vector[1];
+
+       if (tx < tx1)
+           tx1 = tx;
+       if (ty < ty1)
+           ty1 = ty;
+       if (tx > tx2)
+           tx2 = tx;
+       if (ty > ty2)
+           ty2 = ty;
+    }
+
+    transformed->x1 = tx1;
+    transformed->y1 = ty1;
+    transformed->x2 = tx2;
+    transformed->y2 = ty2;
+
+    return TRUE;
+}
+
+#define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX))
+#define ABS(f)      (((f) < 0)?  (-(f)) : (f))
+#define IS_16_16(f) (((f) >= pixman_min_fixed_48_16) && ((f) <= pixman_max_fixed_48_16))
+
+static pixman_bool_t
+analyze_extent (pixman_image_t       *image,
+               const pixman_box32_t *extents,
+               uint32_t             *flags)
+{
+    pixman_transform_t *transform;
+    pixman_fixed_t x_off, y_off;
+    pixman_fixed_t width, height;
+    pixman_fixed_t *params;
+    box_48_16_t transformed;
+    pixman_box32_t exp_extents;
+
+    if (!image)
+       return TRUE;
+
+    /* Some compositing functions walk one step
+     * outside the destination rectangle, so we
+     * check here that the expanded-by-one source
+     * extents in destination space fit in 16 bits.
+     */
+    if (!IS_16BIT (extents->x1 - 1)            ||
+       !IS_16BIT (extents->y1 - 1)             ||
+       !IS_16BIT (extents->x2 + 1)             ||
+       !IS_16BIT (extents->y2 + 1))
+    {
+       return FALSE;
+    }
+
+    transform = image->common.transform;
+    if (image->common.type == BITS)
+    {
+       /* During repeat mode calculations we might convert the
+        * width/height of an image to fixed 16.16, so we need
+        * them to be smaller than 16 bits.
+        */
+       if (image->bits.width >= 0x7fff || image->bits.height >= 0x7fff)
+           return FALSE;
+
+       if ((image->common.flags & FAST_PATH_ID_TRANSFORM) == FAST_PATH_ID_TRANSFORM &&
+           extents->x1 >= 0 &&
+           extents->y1 >= 0 &&
+           extents->x2 <= image->bits.width &&
+           extents->y2 <= image->bits.height)
+       {
+           *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+           return TRUE;
+       }
+
+       switch (image->common.filter)
+       {
+       case PIXMAN_FILTER_CONVOLUTION:
+           params = image->common.filter_params;
+           x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1);
+           y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1);
+           width = params[0];
+           height = params[1];
+           break;
+
+       case PIXMAN_FILTER_GOOD:
+       case PIXMAN_FILTER_BEST:
+       case PIXMAN_FILTER_BILINEAR:
+           x_off = - pixman_fixed_1 / 2;
+           y_off = - pixman_fixed_1 / 2;
+           width = pixman_fixed_1;
+           height = pixman_fixed_1;
+           break;
+
+       case PIXMAN_FILTER_FAST:
+       case PIXMAN_FILTER_NEAREST:
+           x_off = - pixman_fixed_e;
+           y_off = - pixman_fixed_e;
+           width = 0;
+           height = 0;
+           break;
+
+       default:
+           return FALSE;
+       }
+    }
+    else
+    {
+       x_off = 0;
+       y_off = 0;
+       width = 0;
+       height = 0;
+    }
+
+    if (!compute_transformed_extents (transform, extents, &transformed))
+       return FALSE;
+
+    /* Expand the source area by a tiny bit to account for different rounding
+     * that may happen during sampling.  Note that (8 * pixman_fixed_e) is very
+     * far from 0.5, so this won't make the computed area overly pessimistic.
+     */
+    transformed.x1 -= 8 * pixman_fixed_e;
+    transformed.y1 -= 8 * pixman_fixed_e;
+    transformed.x2 += 8 * pixman_fixed_e;
+    transformed.y2 += 8 * pixman_fixed_e;
+
+    if (image->common.type == BITS)
+    {
+       if (pixman_fixed_to_int (transformed.x1) >= 0                   &&
+           pixman_fixed_to_int (transformed.y1) >= 0                   &&
+           pixman_fixed_to_int (transformed.x2) < image->bits.width    &&
+           pixman_fixed_to_int (transformed.y2) < image->bits.height)
+       {
+           *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+       }
+
+       if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2) >= 0                &&
+           pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2) >= 0                &&
+           pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2) < image->bits.width &&
+           pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2) < image->bits.height)
+       {
+           *flags |= FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR;
+       }
+    }
+
+    /* Check we don't overflow when the destination extents are expanded by one.
+     * This ensures that compositing functions can simply walk the source space
+     * using 16.16 variables without worrying about overflow.
+     */
+    exp_extents = *extents;
+    exp_extents.x1 -= 1;
+    exp_extents.y1 -= 1;
+    exp_extents.x2 += 1;
+    exp_extents.y2 += 1;
+
+    if (!compute_transformed_extents (transform, &exp_extents, &transformed))
+       return FALSE;
+
+    if (!IS_16_16 (transformed.x1 + x_off - 8 * pixman_fixed_e)        ||
+       !IS_16_16 (transformed.y1 + y_off - 8 * pixman_fixed_e) ||
+       !IS_16_16 (transformed.x2 + x_off + 8 * pixman_fixed_e + width) ||
+       !IS_16_16 (transformed.y2 + y_off + 8 * pixman_fixed_e + height))
+    {
+       return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Work around GCC bug causing crashes in Mozilla with SSE2
+ *
+ * When using -msse, gcc generates movdqa instructions assuming that
+ * the stack is 16 byte aligned. Unfortunately some applications, such
+ * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
+ * causes the movdqa instructions to fail.
+ *
+ * The __force_align_arg_pointer__ makes gcc generate a prologue that
+ * realigns the stack pointer to 16 bytes.
+ *
+ * On x86-64 this is not necessary because the standard ABI already
+ * calls for a 16 byte aligned stack.
+ *
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
+ */
+#if defined (USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+PIXMAN_EXPORT void
+pixman_image_composite32 (pixman_op_t      op,
+                          pixman_image_t * src,
+                          pixman_image_t * mask,
+                          pixman_image_t * dest,
+                          int32_t          src_x,
+                          int32_t          src_y,
+                          int32_t          mask_x,
+                          int32_t          mask_y,
+                          int32_t          dest_x,
+                          int32_t          dest_y,
+                          int32_t          width,
+                          int32_t          height)
+{
+    pixman_format_code_t src_format, mask_format, dest_format;
+    uint32_t src_flags, mask_flags, dest_flags;
+    pixman_region32_t region;
+    pixman_box32_t extents;
+    pixman_implementation_t *imp;
+    pixman_composite_func_t func;
+
+    _pixman_image_validate (src);
+    if (mask)
+       _pixman_image_validate (mask);
+    _pixman_image_validate (dest);
+
+    src_format = src->common.extended_format_code;
+    src_flags = src->common.flags;
+
+    if (mask)
+    {
+       mask_format = mask->common.extended_format_code;
+       mask_flags = mask->common.flags;
+    }
+    else
+    {
+       mask_format = PIXMAN_null;
+       mask_flags = FAST_PATH_IS_OPAQUE;
+    }
+
+    dest_format = dest->common.extended_format_code;
+    dest_flags = dest->common.flags;
+
+    /* Check for pixbufs */
+    if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
+       (src->type == BITS && src->bits.bits == mask->bits.bits)           &&
+       (src->common.repeat == mask->common.repeat)                        &&
+       (src_x == mask_x && src_y == mask_y))
+    {
+       if (src_format == PIXMAN_x8b8g8r8)
+           src_format = mask_format = PIXMAN_pixbuf;
+       else if (src_format == PIXMAN_x8r8g8b8)
+           src_format = mask_format = PIXMAN_rpixbuf;
+    }
+
+    pixman_region32_init (&region);
+
+    if (!pixman_compute_composite_region32 (
+           &region, src, mask, dest,
+           src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height))
+    {
+       goto out;
+    }
+
+    extents = *pixman_region32_extents (&region);
+
+    extents.x1 -= dest_x - src_x;
+    extents.y1 -= dest_y - src_y;
+    extents.x2 -= dest_x - src_x;
+    extents.y2 -= dest_y - src_y;
+
+    if (!analyze_extent (src, &extents, &src_flags))
+       goto out;
+
+    extents.x1 -= src_x - mask_x;
+    extents.y1 -= src_y - mask_y;
+    extents.x2 -= src_x - mask_x;
+    extents.y2 -= src_y - mask_y;
+
+    if (!analyze_extent (mask, &extents, &mask_flags))
+       goto out;
+
+    /* If the clip is within the source samples, and the samples are
+     * opaque, then the source is effectively opaque.
+     */
+#define NEAREST_OPAQUE (FAST_PATH_SAMPLES_OPAQUE |                     \
+                        FAST_PATH_NEAREST_FILTER |                     \
+                        FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+#define BILINEAR_OPAQUE        (FAST_PATH_SAMPLES_OPAQUE |                     \
+                        FAST_PATH_BILINEAR_FILTER |                    \
+                        FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)
+
+    if ((src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+       (src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    {
+       src_flags |= FAST_PATH_IS_OPAQUE;
+    }
+
+    if ((mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+       (mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    {
+       mask_flags |= FAST_PATH_IS_OPAQUE;
+    }
+
+    /*
+     * Check whether we can replace our operator with a simpler one when
+     * the src or dest is opaque.  The new operator is mathematically
+     * equivalent to the original.
+     */
+    op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+
+    if (_pixman_lookup_composite_function (
+           get_implementation (), op,
+           src_format, src_flags, mask_format, mask_flags, dest_format, dest_flags,
+           &imp, &func))
+    {
+       pixman_composite_info_t info;
+       const pixman_box32_t *pbox;
+       int n;
+
+       info.op = op;
+       info.src_image = src;
+       info.mask_image = mask;
+       info.dest_image = dest;
+       info.src_flags = src_flags;
+       info.mask_flags = mask_flags;
+       info.dest_flags = dest_flags;
+
+       pbox = pixman_region32_rectangles (&region, &n);
+
+       while (n--)
+       {
+           info.src_x = pbox->x1 + src_x - dest_x;
+           info.src_y = pbox->y1 + src_y - dest_y;
+           info.mask_x = pbox->x1 + mask_x - dest_x;
+           info.mask_y = pbox->y1 + mask_y - dest_y;
+           info.dest_x = pbox->x1;
+           info.dest_y = pbox->y1;
+           info.width = pbox->x2 - pbox->x1;
+           info.height = pbox->y2 - pbox->y1;
+
+           func (imp, &info);
+
+           pbox++;
+       }
+    }
+
+out:
+    pixman_region32_fini (&region);
+}
+
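+/* Usage sketch (editor's example, not part of this file): composite an
+ * opaque red solid over a 100x100 rectangle of an existing destination
+ * image `dest', using only entry points declared in pixman.h:
+ *
+ *     pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
+ *     pixman_image_t *solid = pixman_image_create_solid_fill (&red);
+ *
+ *     pixman_image_composite32 (PIXMAN_OP_OVER, solid, NULL, dest,
+ *                               0, 0, 0, 0, 10, 10, 100, 100);
+ *     pixman_image_unref (solid);
+ */
+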
+PIXMAN_EXPORT void
+pixman_image_composite (pixman_op_t      op,
+                        pixman_image_t * src,
+                        pixman_image_t * mask,
+                        pixman_image_t * dest,
+                        int16_t          src_x,
+                        int16_t          src_y,
+                        int16_t          mask_x,
+                        int16_t          mask_y,
+                        int16_t          dest_x,
+                        int16_t          dest_y,
+                        uint16_t         width,
+                        uint16_t         height)
+{
+    pixman_image_composite32 (op, src, mask, dest, src_x, src_y, 
+                              mask_x, mask_y, dest_x, dest_y, width, height);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_blt (uint32_t *src_bits,
+            uint32_t *dst_bits,
+            int       src_stride,
+            int       dst_stride,
+            int       src_bpp,
+            int       dst_bpp,
+            int       src_x,
+            int       src_y,
+            int       dest_x,
+            int       dest_y,
+            int       width,
+            int       height)
+{
+    return _pixman_implementation_blt (get_implementation(),
+                                      src_bits, dst_bits, src_stride, dst_stride,
+                                       src_bpp, dst_bpp,
+                                       src_x, src_y,
+                                       dest_x, dest_y,
+                                       width, height);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_fill (uint32_t *bits,
+             int       stride,
+             int       bpp,
+             int       x,
+             int       y,
+             int       width,
+             int       height,
+             uint32_t xor)
+{
+    return _pixman_implementation_fill (
+       get_implementation(), bits, stride, bpp, x, y, width, height, xor);
+}
+
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    return
+        (color->alpha >> 8 << 24) |
+        (color->red >> 8 << 16) |
+        (color->green & 0xff00) |
+        (color->blue >> 8);
+}
+
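+/* Worked example: opaque red (red = 0xffff, green = 0, blue = 0,
+ * alpha = 0xffff) packs to 0xffff0000 for PIXMAN_a8r8g8b8; for
+ * PIXMAN_r5g6b5 the same color is further converted to 0xf800.
+ */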
+static pixman_bool_t
+color_to_pixel (pixman_color_t *     color,
+                uint32_t *           pixel,
+                pixman_format_code_t format)
+{
+    uint32_t c = color_to_uint32 (color);
+
+    if (!(format == PIXMAN_a8r8g8b8     ||
+          format == PIXMAN_x8r8g8b8     ||
+          format == PIXMAN_a8b8g8r8     ||
+          format == PIXMAN_x8b8g8r8     ||
+          format == PIXMAN_b8g8r8a8     ||
+          format == PIXMAN_b8g8r8x8     ||
+          format == PIXMAN_r8g8b8a8     ||
+          format == PIXMAN_r8g8b8x8     ||
+          format == PIXMAN_r5g6b5       ||
+          format == PIXMAN_b5g6r5       ||
+          format == PIXMAN_a8           ||
+          format == PIXMAN_a1))
+    {
+       return FALSE;
+    }
+
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_ABGR)
+    {
+       c = ((c & 0xff000000) >>  0) |
+           ((c & 0x00ff0000) >> 16) |
+           ((c & 0x0000ff00) >>  0) |
+           ((c & 0x000000ff) << 16);
+    }
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_BGRA)
+    {
+       c = ((c & 0xff000000) >> 24) |
+           ((c & 0x00ff0000) >>  8) |
+           ((c & 0x0000ff00) <<  8) |
+           ((c & 0x000000ff) << 24);
+    }
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA)
+       c = ((c & 0xff000000) >> 24) | (c << 8);
+
+    if (format == PIXMAN_a1)
+       c = c >> 31;
+    else if (format == PIXMAN_a8)
+       c = c >> 24;
+    else if (format == PIXMAN_r5g6b5 ||
+             format == PIXMAN_b5g6r5)
+       c = CONVERT_8888_TO_0565 (c);
+
+    *pixel = c;
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_rectangles (pixman_op_t                 op,
+                              pixman_image_t *            dest,
+                              pixman_color_t *            color,
+                              int                         n_rects,
+                              const pixman_rectangle16_t *rects)
+{
+    pixman_box32_t stack_boxes[6];
+    pixman_box32_t *boxes;
+    pixman_bool_t result;
+    int i;
+
+    if (n_rects > 6)
+    {
+        boxes = pixman_malloc_ab (sizeof (pixman_box32_t), n_rects);
+        if (boxes == NULL)
+            return FALSE;
+    }
+    else
+    {
+        boxes = stack_boxes;
+    }
+
+    for (i = 0; i < n_rects; ++i)
+    {
+        boxes[i].x1 = rects[i].x;
+        boxes[i].y1 = rects[i].y;
+        boxes[i].x2 = boxes[i].x1 + rects[i].width;
+        boxes[i].y2 = boxes[i].y1 + rects[i].height;
+    }
+
+    result = pixman_image_fill_boxes (op, dest, color, n_rects, boxes);
+
+    if (boxes != stack_boxes)
+        free (boxes);
+
+    return result;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_boxes (pixman_op_t           op,
+                         pixman_image_t *      dest,
+                         pixman_color_t *      color,
+                         int                   n_boxes,
+                         const pixman_box32_t *boxes)
+{
+    pixman_image_t *solid;
+    pixman_color_t c;
+    int i;
+
+    _pixman_image_validate (dest);
+
+    if (color->alpha == 0xffff)
+    {
+        if (op == PIXMAN_OP_OVER)
+            op = PIXMAN_OP_SRC;
+    }
+
+    if (op == PIXMAN_OP_CLEAR)
+    {
+        c.red = 0;
+        c.green = 0;
+        c.blue = 0;
+        c.alpha = 0;
+
+        color = &c;
+
+        op = PIXMAN_OP_SRC;
+    }
+
+    if (op == PIXMAN_OP_SRC)
+    {
+        uint32_t pixel;
+
+        if (color_to_pixel (color, &pixel, dest->bits.format))
+        {
+            pixman_region32_t fill_region;
+            int n_rects, j;
+            pixman_box32_t *rects;
+
+            if (!pixman_region32_init_rects (&fill_region, boxes, n_boxes))
+                return FALSE;
+
+            if (dest->common.have_clip_region)
+            {
+                if (!pixman_region32_intersect (&fill_region,
+                                                &fill_region,
+                                                &dest->common.clip_region))
+                    return FALSE;
+            }
+
+            rects = pixman_region32_rectangles (&fill_region, &n_rects);
+            for (j = 0; j < n_rects; ++j)
+            {
+                const pixman_box32_t *rect = &(rects[j]);
+                pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format),
+                             rect->x1, rect->y1, rect->x2 - rect->x1, rect->y2 - rect->y1,
+                             pixel);
+            }
+
+            pixman_region32_fini (&fill_region);
+            return TRUE;
+        }
+    }
+
+    solid = pixman_image_create_solid_fill (color);
+    if (!solid)
+        return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+        const pixman_box32_t *box = &(boxes[i]);
+
+        pixman_image_composite32 (op, solid, NULL, dest,
+                                  0, 0, 0, 0,
+                                  box->x1, box->y1,
+                                  box->x2 - box->x1, box->y2 - box->y1);
+    }
+
+    pixman_image_unref (solid);
+
+    return TRUE;
+}
+
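+/* Usage sketch (editor's example): clear a single 100x100 region of a
+ * bits image `dest':
+ *
+ *     pixman_box32_t box = { 10, 10, 110, 110 };
+ *     pixman_color_t clear = { 0, 0, 0, 0 };
+ *
+ *     pixman_image_fill_boxes (PIXMAN_OP_CLEAR, dest, &clear, 1, &box);
+ */
+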
+/**
+ * pixman_version:
+ *
+ * Returns the version of the pixman library encoded in a single
+ * integer as per %PIXMAN_VERSION_ENCODE. The encoding ensures that
+ * later versions compare greater than earlier versions.
+ *
+ * A run-time comparison to check that pixman's version is greater than
+ * or equal to version X.Y.Z could be performed as follows:
+ *
+ * <informalexample><programlisting>
+ * if (pixman_version() >= PIXMAN_VERSION_ENCODE(X,Y,Z)) {...}
+ * </programlisting></informalexample>
+ *
+ * See also pixman_version_string() as well as the compile-time
+ * equivalents %PIXMAN_VERSION and %PIXMAN_VERSION_STRING.
+ *
+ * Return value: the encoded version.
+ **/
+PIXMAN_EXPORT int
+pixman_version (void)
+{
+    return PIXMAN_VERSION;
+}
+
+/**
+ * pixman_version_string:
+ *
+ * Returns the version of the pixman library as a human-readable string
+ * of the form "X.Y.Z".
+ *
+ * See also pixman_version() as well as the compile-time equivalents
+ * %PIXMAN_VERSION_STRING and %PIXMAN_VERSION.
+ *
+ * Return value: a string containing the version.
+ **/
+PIXMAN_EXPORT const char*
+pixman_version_string (void)
+{
+    return PIXMAN_VERSION_STRING;
+}
+
+/**
+ * pixman_format_supported_source:
+ * @format: A pixman_format_code_t format
+ *
+ * Return value: whether the provided format code is a supported
+ * format for a pixman surface used as a source in
+ * rendering.
+ *
+ * Currently, all pixman_format_code_t values are supported.
+ **/
+PIXMAN_EXPORT pixman_bool_t
+pixman_format_supported_source (pixman_format_code_t format)
+{
+    switch (format)
+    {
+    /* 32 bpp formats */
+    case PIXMAN_a2b10g10r10:
+    case PIXMAN_x2b10g10r10:
+    case PIXMAN_a2r10g10b10:
+    case PIXMAN_x2r10g10b10:
+    case PIXMAN_a8r8g8b8:
+    case PIXMAN_x8r8g8b8:
+    case PIXMAN_a8b8g8r8:
+    case PIXMAN_x8b8g8r8:
+    case PIXMAN_b8g8r8a8:
+    case PIXMAN_b8g8r8x8:
+    case PIXMAN_r8g8b8a8:
+    case PIXMAN_r8g8b8x8:
+    case PIXMAN_r8g8b8:
+    case PIXMAN_b8g8r8:
+    case PIXMAN_r5g6b5:
+    case PIXMAN_b5g6r5:
+    case PIXMAN_x14r6g6b6:
+    /* 16 bpp formats */
+    case PIXMAN_a1r5g5b5:
+    case PIXMAN_x1r5g5b5:
+    case PIXMAN_a1b5g5r5:
+    case PIXMAN_x1b5g5r5:
+    case PIXMAN_a4r4g4b4:
+    case PIXMAN_x4r4g4b4:
+    case PIXMAN_a4b4g4r4:
+    case PIXMAN_x4b4g4r4:
+    /* 8bpp formats */
+    case PIXMAN_a8:
+    case PIXMAN_r3g3b2:
+    case PIXMAN_b2g3r3:
+    case PIXMAN_a2r2g2b2:
+    case PIXMAN_a2b2g2r2:
+    case PIXMAN_c8:
+    case PIXMAN_g8:
+    case PIXMAN_x4a4:
+    /* Collides with PIXMAN_c8
+       case PIXMAN_x4c4:
+     */
+    /* Collides with PIXMAN_g8
+       case PIXMAN_x4g4:
+     */
+    /* 4bpp formats */
+    case PIXMAN_a4:
+    case PIXMAN_r1g2b1:
+    case PIXMAN_b1g2r1:
+    case PIXMAN_a1r1g1b1:
+    case PIXMAN_a1b1g1r1:
+    case PIXMAN_c4:
+    case PIXMAN_g4:
+    /* 1bpp formats */
+    case PIXMAN_a1:
+    case PIXMAN_g1:
+    /* YUV formats */
+    case PIXMAN_yuy2:
+    case PIXMAN_yv12:
+       return TRUE;
+
+    default:
+       return FALSE;
+    }
+}
+
+/**
+ * pixman_format_supported_destination:
+ * @format: A pixman_format_code_t format
+ *
+ * Return value: whether the provided format code is a supported
+ * format for a pixman surface used as a destination in
+ * rendering.
+ *
+ * Currently, all pixman_format_code_t values are supported
+ * except for the YUV formats.
+ **/
+PIXMAN_EXPORT pixman_bool_t
+pixman_format_supported_destination (pixman_format_code_t format)
+{
+    /* YUV formats cannot be written to at the moment */
+    if (format == PIXMAN_yuy2 || format == PIXMAN_yv12)
+       return FALSE;
+
+    return pixman_format_supported_source (format);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_compute_composite_region (pixman_region16_t * region,
+                                 pixman_image_t *    src_image,
+                                 pixman_image_t *    mask_image,
+                                 pixman_image_t *    dest_image,
+                                 int16_t             src_x,
+                                 int16_t             src_y,
+                                 int16_t             mask_x,
+                                 int16_t             mask_y,
+                                 int16_t             dest_x,
+                                 int16_t             dest_y,
+                                 uint16_t            width,
+                                 uint16_t            height)
+{
+    pixman_region32_t r32;
+    pixman_bool_t retval;
+
+    pixman_region32_init (&r32);
+
+    retval = pixman_compute_composite_region32 (
+       &r32, src_image, mask_image, dest_image,
+       src_x, src_y, mask_x, mask_y, dest_x, dest_y,
+       width, height);
+
+    if (retval)
+    {
+       if (!pixman_region16_copy_from_region32 (region, &r32))
+           retval = FALSE;
+    }
+
+    pixman_region32_fini (&r32);
+    return retval;
+}
diff --git a/pixman/pixman.h b/pixman/pixman.h
new file mode 100644 (file)
index 0000000..c57092a
--- /dev/null
@@ -0,0 +1,990 @@
+/***********************************************************
+
+Copyright 1987, 1998  The Open Group
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation.
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of The Open Group shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from The Open Group.
+
+Copyright 1987 by Digital Equipment Corporation, Maynard, Massachusetts.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Digital not be
+used in advertising or publicity pertaining to distribution of the
+software without specific, written prior permission.
+
+DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+
+******************************************************************/
+/*
+ * Copyright © 1998, 2004 Keith Packard
+ * Copyright   2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PIXMAN_H__
+#define PIXMAN_H__
+
+#include <pixman-version.h>
+
+#ifdef  __cplusplus
+#define PIXMAN_BEGIN_DECLS extern "C" {
+#define PIXMAN_END_DECLS }
+#else
+#define PIXMAN_BEGIN_DECLS
+#define PIXMAN_END_DECLS
+#endif
+
+PIXMAN_BEGIN_DECLS
+
+/*
+ * Standard integers
+ */
+
+#if !defined (PIXMAN_DONT_DEFINE_STDINT)
+
+#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc)
+#  include <inttypes.h>
+/* VS 2010 (_MSC_VER 1600) has stdint.h */
+#elif defined (_MSC_VER) && _MSC_VER < 1600
+typedef __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#elif defined (_AIX)
+#  include <sys/inttypes.h>
+#else
+#  include <stdint.h>
+#endif
+
+#endif
+
+/*
+ * Boolean
+ */
+typedef int pixman_bool_t;
+
+/*
+ * Fixpoint numbers
+ */
+typedef int64_t                        pixman_fixed_32_32_t;
+typedef pixman_fixed_32_32_t   pixman_fixed_48_16_t;
+typedef uint32_t               pixman_fixed_1_31_t;
+typedef uint32_t               pixman_fixed_1_16_t;
+typedef int32_t                        pixman_fixed_16_16_t;
+typedef pixman_fixed_16_16_t   pixman_fixed_t;
+
+#define pixman_fixed_e                 ((pixman_fixed_t) 1)
+#define pixman_fixed_1                 (pixman_int_to_fixed(1))
+#define pixman_fixed_1_minus_e         (pixman_fixed_1 - pixman_fixed_e)
+#define pixman_fixed_minus_1           (pixman_int_to_fixed(-1))
+#define pixman_fixed_to_int(f)         ((int) ((f) >> 16))
+#define pixman_int_to_fixed(i)         ((pixman_fixed_t) ((i) << 16))
+#define pixman_fixed_to_double(f)      (double) ((f) / (double) pixman_fixed_1)
+#define pixman_double_to_fixed(d)      ((pixman_fixed_t) ((d) * 65536.0))
+#define pixman_fixed_frac(f)           ((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_floor(f)          ((f) & ~pixman_fixed_1_minus_e)
+#define pixman_fixed_ceil(f)           pixman_fixed_floor ((f) + pixman_fixed_1_minus_e)
+#define pixman_fixed_fraction(f)       ((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_mod_2(f)          ((f) & (pixman_fixed_1 | pixman_fixed_1_minus_e))
+#define pixman_max_fixed_48_16         ((pixman_fixed_48_16_t) 0x7fffffff)
+#define pixman_min_fixed_48_16         (-((pixman_fixed_48_16_t) 1 << 31))
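+
+/* Worked example: pixman_double_to_fixed (1.5) == 0x18000, and for that
+ * value pixman_fixed_to_int () == 1, pixman_fixed_frac () == 0x8000,
+ * pixman_fixed_floor () == 0x10000 (1.0) and
+ * pixman_fixed_ceil () == 0x20000 (2.0).
+ */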
+
+/*
+ * Misc structs
+ */
+typedef struct pixman_color pixman_color_t;
+typedef struct pixman_point_fixed pixman_point_fixed_t;
+typedef struct pixman_line_fixed pixman_line_fixed_t;
+typedef struct pixman_vector pixman_vector_t;
+typedef struct pixman_transform pixman_transform_t;
+
+struct pixman_color
+{
+    uint16_t   red;
+    uint16_t    green;
+    uint16_t    blue;
+    uint16_t    alpha;
+};
+
+struct pixman_point_fixed
+{
+    pixman_fixed_t     x;
+    pixman_fixed_t     y;
+};
+
+struct pixman_line_fixed
+{
+    pixman_point_fixed_t       p1, p2;
+};
+
+/*
+ * Fixed point matrices
+ */
+
+struct pixman_vector
+{
+    pixman_fixed_t     vector[3];
+};
+
+struct pixman_transform
+{
+    pixman_fixed_t     matrix[3][3];
+};
+
+/* forward declaration (sorry) */
+struct pixman_box16;
+typedef  union pixman_image            pixman_image_t;
+
+void          pixman_transform_init_identity    (struct pixman_transform       *matrix);
+pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform,
+                                                struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_point            (const struct pixman_transform *transform,
+                                                struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_multiply         (struct pixman_transform       *dst,
+                                                const struct pixman_transform *l,
+                                                const struct pixman_transform *r);
+void          pixman_transform_init_scale       (struct pixman_transform       *t,
+                                                pixman_fixed_t                 sx,
+                                                pixman_fixed_t                 sy);
+pixman_bool_t pixman_transform_scale            (struct pixman_transform       *forward,
+                                                struct pixman_transform       *reverse,
+                                                pixman_fixed_t                 sx,
+                                                pixman_fixed_t                 sy);
+void          pixman_transform_init_rotate      (struct pixman_transform       *t,
+                                                pixman_fixed_t                 cos,
+                                                pixman_fixed_t                 sin);
+pixman_bool_t pixman_transform_rotate           (struct pixman_transform       *forward,
+                                                struct pixman_transform       *reverse,
+                                                pixman_fixed_t                 c,
+                                                pixman_fixed_t                 s);
+void          pixman_transform_init_translate   (struct pixman_transform       *t,
+                                                pixman_fixed_t                 tx,
+                                                pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_translate        (struct pixman_transform       *forward,
+                                                struct pixman_transform       *reverse,
+                                                pixman_fixed_t                 tx,
+                                                pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_bounds           (const struct pixman_transform *matrix,
+                                                struct pixman_box16           *b);
+pixman_bool_t pixman_transform_invert           (struct pixman_transform       *dst,
+                                                const struct pixman_transform *src);
+pixman_bool_t pixman_transform_is_identity      (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_scale         (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_inverse       (const struct pixman_transform *a,
+                                                const struct pixman_transform *b);
+
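+/*
+ * A short usage sketch of the fixed-point transform calls above:
+ * scale a point by two in each direction.
+ *
+ *     struct pixman_transform t;
+ *     struct pixman_vector v = { {
+ *         pixman_int_to_fixed (5),
+ *         pixman_int_to_fixed (7),
+ *         pixman_fixed_1
+ *     } };
+ *
+ *     pixman_transform_init_scale (&t,
+ *                                  pixman_int_to_fixed (2),
+ *                                  pixman_int_to_fixed (2));
+ *
+ *     // On success, v holds (10, 14, 1) in 16.16 fixed point.
+ *     pixman_transform_point_3d (&t, &v);
+ */
+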
+/*
+ * Floating point matrices
+ */
+struct pixman_f_vector
+{
+    double  v[3];
+};
+
+struct pixman_f_transform
+{
+    double  m[3][3];
+};
+
+pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform         *t,
+                                                       const struct pixman_f_transform *ft);
+void          pixman_f_transform_from_pixman_transform (struct pixman_f_transform       *ft,
+                                                       const struct pixman_transform   *t);
+pixman_bool_t pixman_f_transform_invert                (struct pixman_f_transform       *dst,
+                                                       const struct pixman_f_transform *src);
+pixman_bool_t pixman_f_transform_point                 (const struct pixman_f_transform *t,
+                                                       struct pixman_f_vector          *v);
+void          pixman_f_transform_point_3d              (const struct pixman_f_transform *t,
+                                                       struct pixman_f_vector          *v);
+void          pixman_f_transform_multiply              (struct pixman_f_transform       *dst,
+                                                       const struct pixman_f_transform *l,
+                                                       const struct pixman_f_transform *r);
+void          pixman_f_transform_init_scale            (struct pixman_f_transform       *t,
+                                                       double                           sx,
+                                                       double                           sy);
+pixman_bool_t pixman_f_transform_scale                 (struct pixman_f_transform       *forward,
+                                                       struct pixman_f_transform       *reverse,
+                                                       double                           sx,
+                                                       double                           sy);
+void          pixman_f_transform_init_rotate           (struct pixman_f_transform       *t,
+                                                       double                           cos,
+                                                       double                           sin);
+pixman_bool_t pixman_f_transform_rotate                (struct pixman_f_transform       *forward,
+                                                       struct pixman_f_transform       *reverse,
+                                                       double                           c,
+                                                       double                           s);
+void          pixman_f_transform_init_translate        (struct pixman_f_transform       *t,
+                                                       double                           tx,
+                                                       double                           ty);
+pixman_bool_t pixman_f_transform_translate             (struct pixman_f_transform       *forward,
+                                                       struct pixman_f_transform       *reverse,
+                                                       double                           tx,
+                                                       double                           ty);
+pixman_bool_t pixman_f_transform_bounds                (const struct pixman_f_transform *t,
+                                                       struct pixman_box16             *b);
+void          pixman_f_transform_init_identity         (struct pixman_f_transform       *t);
+
+typedef enum
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_PAD,
+    PIXMAN_REPEAT_REFLECT
+} pixman_repeat_t;
+
+typedef enum
+{
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_CONVOLUTION
+} pixman_filter_t;
+
+typedef enum
+{
+    PIXMAN_OP_CLEAR                    = 0x00,
+    PIXMAN_OP_SRC                      = 0x01,
+    PIXMAN_OP_DST                      = 0x02,
+    PIXMAN_OP_OVER                     = 0x03,
+    PIXMAN_OP_OVER_REVERSE             = 0x04,
+    PIXMAN_OP_IN                       = 0x05,
+    PIXMAN_OP_IN_REVERSE               = 0x06,
+    PIXMAN_OP_OUT                      = 0x07,
+    PIXMAN_OP_OUT_REVERSE              = 0x08,
+    PIXMAN_OP_ATOP                     = 0x09,
+    PIXMAN_OP_ATOP_REVERSE             = 0x0a,
+    PIXMAN_OP_XOR                      = 0x0b,
+    PIXMAN_OP_ADD                      = 0x0c,
+    PIXMAN_OP_SATURATE                 = 0x0d,
+
+    PIXMAN_OP_DISJOINT_CLEAR           = 0x10,
+    PIXMAN_OP_DISJOINT_SRC             = 0x11,
+    PIXMAN_OP_DISJOINT_DST             = 0x12,
+    PIXMAN_OP_DISJOINT_OVER            = 0x13,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE    = 0x14,
+    PIXMAN_OP_DISJOINT_IN              = 0x15,
+    PIXMAN_OP_DISJOINT_IN_REVERSE      = 0x16,
+    PIXMAN_OP_DISJOINT_OUT             = 0x17,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE     = 0x18,
+    PIXMAN_OP_DISJOINT_ATOP            = 0x19,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE    = 0x1a,
+    PIXMAN_OP_DISJOINT_XOR             = 0x1b,
+
+    PIXMAN_OP_CONJOINT_CLEAR           = 0x20,
+    PIXMAN_OP_CONJOINT_SRC             = 0x21,
+    PIXMAN_OP_CONJOINT_DST             = 0x22,
+    PIXMAN_OP_CONJOINT_OVER            = 0x23,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE    = 0x24,
+    PIXMAN_OP_CONJOINT_IN              = 0x25,
+    PIXMAN_OP_CONJOINT_IN_REVERSE      = 0x26,
+    PIXMAN_OP_CONJOINT_OUT             = 0x27,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE     = 0x28,
+    PIXMAN_OP_CONJOINT_ATOP            = 0x29,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE    = 0x2a,
+    PIXMAN_OP_CONJOINT_XOR             = 0x2b,
+
+    PIXMAN_OP_MULTIPLY                  = 0x30,
+    PIXMAN_OP_SCREEN                    = 0x31,
+    PIXMAN_OP_OVERLAY                   = 0x32,
+    PIXMAN_OP_DARKEN                    = 0x33,
+    PIXMAN_OP_LIGHTEN                   = 0x34,
+    PIXMAN_OP_COLOR_DODGE               = 0x35,
+    PIXMAN_OP_COLOR_BURN                = 0x36,
+    PIXMAN_OP_HARD_LIGHT                = 0x37,
+    PIXMAN_OP_SOFT_LIGHT                = 0x38,
+    PIXMAN_OP_DIFFERENCE                = 0x39,
+    PIXMAN_OP_EXCLUSION                 = 0x3a,
+    PIXMAN_OP_HSL_HUE                  = 0x3b,
+    PIXMAN_OP_HSL_SATURATION           = 0x3c,
+    PIXMAN_OP_HSL_COLOR                        = 0x3d,
+    PIXMAN_OP_HSL_LUMINOSITY           = 0x3e
+
+#ifdef PIXMAN_USE_INTERNAL_API
+    ,
+    PIXMAN_N_OPERATORS,
+    PIXMAN_OP_NONE = PIXMAN_N_OPERATORS
+#endif
+} pixman_op_t;
+
+/*
+ * Regions
+ */
+typedef struct pixman_region16_data    pixman_region16_data_t;
+typedef struct pixman_box16            pixman_box16_t;
+typedef struct pixman_rectangle16      pixman_rectangle16_t;
+typedef struct pixman_region16         pixman_region16_t;
+
+struct pixman_region16_data {
+    long               size;
+    long               numRects;
+/*  pixman_box16_t     rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle16
+{
+    int16_t    x, y;
+    uint16_t   width, height;
+};
+
+struct pixman_box16
+{
+    int16_t x1, y1, x2, y2;
+};
+
+struct pixman_region16
+{
+    pixman_box16_t          extents;
+    pixman_region16_data_t *data;
+};
+
+typedef enum
+{
+    PIXMAN_REGION_OUT,
+    PIXMAN_REGION_IN,
+    PIXMAN_REGION_PART
+} pixman_region_overlap_t;
+
+/* This function exists only to make it possible to preserve
+ * the X ABI - it should go away at first opportunity.
+ */
+void pixman_region_set_static_pointers (pixman_box16_t         *empty_box,
+                                       pixman_region16_data_t *empty_data,
+                                       pixman_region16_data_t *broken_data);
+
+/* creation/destruction */
+void                    pixman_region_init               (pixman_region16_t *region);
+void                    pixman_region_init_rect          (pixman_region16_t *region,
+                                                         int                x,
+                                                         int                y,
+                                                         unsigned int       width,
+                                                         unsigned int       height);
+pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region,
+                                                         const pixman_box16_t *boxes,
+                                                         int                count);
+void                    pixman_region_init_with_extents  (pixman_region16_t *region,
+                                                         pixman_box16_t    *extents);
+void                    pixman_region_init_from_image    (pixman_region16_t *region,
+                                                         pixman_image_t    *image);
+void                    pixman_region_fini               (pixman_region16_t *region);
+
+
+/* manipulation */
+void                    pixman_region_translate          (pixman_region16_t *region,
+                                                         int                x,
+                                                         int                y);
+pixman_bool_t           pixman_region_copy               (pixman_region16_t *dest,
+                                                         pixman_region16_t *source);
+pixman_bool_t           pixman_region_intersect          (pixman_region16_t *new_reg,
+                                                         pixman_region16_t *reg1,
+                                                         pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union              (pixman_region16_t *new_reg,
+                                                         pixman_region16_t *reg1,
+                                                         pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union_rect         (pixman_region16_t *dest,
+                                                         pixman_region16_t *source,
+                                                         int                x,
+                                                         int                y,
+                                                         unsigned int       width,
+                                                         unsigned int       height);
+pixman_bool_t          pixman_region_intersect_rect     (pixman_region16_t *dest,
+                                                         pixman_region16_t *source,
+                                                         int                x,
+                                                         int                y,
+                                                         unsigned int       width,
+                                                         unsigned int       height);
+pixman_bool_t           pixman_region_subtract           (pixman_region16_t *reg_d,
+                                                         pixman_region16_t *reg_m,
+                                                         pixman_region16_t *reg_s);
+pixman_bool_t           pixman_region_inverse            (pixman_region16_t *new_reg,
+                                                         pixman_region16_t *reg1,
+                                                         pixman_box16_t    *inv_rect);
+pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *region,
+                                                         int                x,
+                                                         int                y,
+                                                         pixman_box16_t    *box);
+pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
+                                                         pixman_box16_t    *prect);
+pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region);
+int                     pixman_region_n_rects            (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_rectangles         (pixman_region16_t *region,
+                                                         int               *n_rects);
+pixman_bool_t           pixman_region_equal              (pixman_region16_t *region1,
+                                                         pixman_region16_t *region2);
+pixman_bool_t           pixman_region_selfcheck          (pixman_region16_t *region);
+void                    pixman_region_reset              (pixman_region16_t *region,
+                                                         pixman_box16_t    *box);
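+
+/*
+ * A brief sketch of the region calls above: intersect two rectangles
+ * and inspect the resulting boxes.
+ *
+ *     pixman_region16_t a, b, both;
+ *     int n;
+ *
+ *     pixman_region_init_rect (&a, 0, 0, 100, 100);
+ *     pixman_region_init_rect (&b, 50, 50, 100, 100);
+ *     pixman_region_init (&both);
+ *
+ *     if (pixman_region_intersect (&both, &a, &b))
+ *     {
+ *         // One box: (50, 50) - (100, 100)
+ *         pixman_box16_t *boxes = pixman_region_rectangles (&both, &n);
+ *     }
+ *
+ *     pixman_region_fini (&a);
+ *     pixman_region_fini (&b);
+ *     pixman_region_fini (&both);
+ */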
+/*
+ * 32 bit regions
+ */
+typedef struct pixman_region32_data    pixman_region32_data_t;
+typedef struct pixman_box32            pixman_box32_t;
+typedef struct pixman_rectangle32      pixman_rectangle32_t;
+typedef struct pixman_region32         pixman_region32_t;
+
+struct pixman_region32_data {
+    long               size;
+    long               numRects;
+/*  pixman_box32_t     rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle32
+{
+    int32_t x, y;
+    uint32_t width, height;
+};
+
+struct pixman_box32
+{
+    int32_t x1, y1, x2, y2;
+};
+
+struct pixman_region32
+{
+    pixman_box32_t          extents;
+    pixman_region32_data_t  *data;
+};
+
+/* creation/destruction */
+void                    pixman_region32_init               (pixman_region32_t *region);
+void                    pixman_region32_init_rect          (pixman_region32_t *region,
+                                                           int                x,
+                                                           int                y,
+                                                           unsigned int       width,
+                                                           unsigned int       height);
+pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region,
+                                                           const pixman_box32_t *boxes,
+                                                           int                count);
+void                    pixman_region32_init_with_extents  (pixman_region32_t *region,
+                                                           pixman_box32_t    *extents);
+void                    pixman_region32_init_from_image    (pixman_region32_t *region,
+                                                           pixman_image_t    *image);
+void                    pixman_region32_fini               (pixman_region32_t *region);
+
+
+/* manipulation */
+void                    pixman_region32_translate          (pixman_region32_t *region,
+                                                           int                x,
+                                                           int                y);
+pixman_bool_t           pixman_region32_copy               (pixman_region32_t *dest,
+                                                           pixman_region32_t *source);
+pixman_bool_t           pixman_region32_intersect          (pixman_region32_t *new_reg,
+                                                           pixman_region32_t *reg1,
+                                                           pixman_region32_t *reg2);
+pixman_bool_t           pixman_region32_union              (pixman_region32_t *new_reg,
+                                                           pixman_region32_t *reg1,
+                                                           pixman_region32_t *reg2);
+pixman_bool_t          pixman_region32_intersect_rect     (pixman_region32_t *dest,
+                                                           pixman_region32_t *source,
+                                                           int                x,
+                                                           int                y,
+                                                           unsigned int       width,
+                                                           unsigned int       height);
+pixman_bool_t           pixman_region32_union_rect         (pixman_region32_t *dest,
+                                                           pixman_region32_t *source,
+                                                           int                x,
+                                                           int                y,
+                                                           unsigned int       width,
+                                                           unsigned int       height);
+pixman_bool_t           pixman_region32_subtract           (pixman_region32_t *reg_d,
+                                                           pixman_region32_t *reg_m,
+                                                           pixman_region32_t *reg_s);
+pixman_bool_t           pixman_region32_inverse            (pixman_region32_t *new_reg,
+                                                           pixman_region32_t *reg1,
+                                                           pixman_box32_t    *inv_rect);
+pixman_bool_t           pixman_region32_contains_point     (pixman_region32_t *region,
+                                                           int                x,
+                                                           int                y,
+                                                           pixman_box32_t    *box);
+pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
+                                                           pixman_box32_t    *prect);
+pixman_bool_t           pixman_region32_not_empty          (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_extents            (pixman_region32_t *region);
+int                     pixman_region32_n_rects            (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_rectangles         (pixman_region32_t *region,
+                                                           int               *n_rects);
+pixman_bool_t           pixman_region32_equal              (pixman_region32_t *region1,
+                                                           pixman_region32_t *region2);
+pixman_bool_t           pixman_region32_selfcheck          (pixman_region32_t *region);
+void                    pixman_region32_reset              (pixman_region32_t *region,
+                                                           pixman_box32_t    *box);
+
+
+/* Copy / Fill / Misc */
+pixman_bool_t pixman_blt                (uint32_t           *src_bits,
+                                        uint32_t           *dst_bits,
+                                        int                 src_stride,
+                                        int                 dst_stride,
+                                        int                 src_bpp,
+                                        int                 dst_bpp,
+                                        int                 src_x,
+                                        int                 src_y,
+                                        int                 dest_x,
+                                        int                 dest_y,
+                                        int                 width,
+                                        int                 height);
+pixman_bool_t pixman_fill               (uint32_t           *bits,
+                                        int                 stride,
+                                        int                 bpp,
+                                        int                 x,
+                                        int                 y,
+                                        int                 width,
+                                        int                 height,
+                                        uint32_t            _xor);
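+
+/*
+ * A small sketch of pixman_fill() above: fill a 20x10 rectangle at
+ * (5, 5) with opaque red in a 64x64, 32 bpp buffer. The stride is
+ * measured in uint32_t units, matching the type of the bits pointer.
+ *
+ *     uint32_t buf[64 * 64];
+ *
+ *     pixman_fill (buf, 64, 32, 5, 5, 20, 10, 0xffff0000);
+ */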
+
+int           pixman_version            (void);
+const char*   pixman_version_string     (void);
+
+/*
+ * Images
+ */
+typedef struct pixman_indexed          pixman_indexed_t;
+typedef struct pixman_gradient_stop    pixman_gradient_stop_t;
+
+typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size);
+typedef void     (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size);
+
+typedef void     (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data);
+
+struct pixman_gradient_stop {
+    pixman_fixed_t x;
+    pixman_color_t color;
+};
+
+#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */
+
+#if PIXMAN_MAX_INDEXED <= 256
+typedef uint8_t pixman_index_type;
+#endif
+
+struct pixman_indexed
+{
+    pixman_bool_t       color;
+    uint32_t           rgba[PIXMAN_MAX_INDEXED];
+    pixman_index_type  ent[32768];
+};
+
+/*
+ * While the protocol is generous in format support, the
+ * sample implementation allows only packed RGB and GBR
+ * representations for data to simplify software rendering,
+ */
+#define PIXMAN_FORMAT(bpp,type,a,r,g,b)        (((bpp) << 24) |  \
+                                        ((type) << 16) | \
+                                        ((a) << 12) |    \
+                                        ((r) << 8) |     \
+                                        ((g) << 4) |     \
+                                        ((b)))
+
+#define PIXMAN_FORMAT_BPP(f)   (((f) >> 24)       )
+#define PIXMAN_FORMAT_TYPE(f)  (((f) >> 16) & 0xff)
+#define PIXMAN_FORMAT_A(f)     (((f) >> 12) & 0x0f)
+#define PIXMAN_FORMAT_R(f)     (((f) >>  8) & 0x0f)
+#define PIXMAN_FORMAT_G(f)     (((f) >>  4) & 0x0f)
+#define PIXMAN_FORMAT_B(f)     (((f)      ) & 0x0f)
+#define PIXMAN_FORMAT_RGB(f)   (((f)      ) & 0xfff)
+#define PIXMAN_FORMAT_VIS(f)   (((f)      ) & 0xffff)
+#define PIXMAN_FORMAT_DEPTH(f) (PIXMAN_FORMAT_A(f) +   \
+                                PIXMAN_FORMAT_R(f) +   \
+                                PIXMAN_FORMAT_G(f) +   \
+                                PIXMAN_FORMAT_B(f))
+
+#define PIXMAN_TYPE_OTHER      0
+#define PIXMAN_TYPE_A          1
+#define PIXMAN_TYPE_ARGB       2
+#define PIXMAN_TYPE_ABGR       3
+#define PIXMAN_TYPE_COLOR      4
+#define PIXMAN_TYPE_GRAY       5
+#define PIXMAN_TYPE_YUY2       6
+#define PIXMAN_TYPE_YV12       7
+#define PIXMAN_TYPE_BGRA       8
+#define PIXMAN_TYPE_RGBA       9
+
+#define PIXMAN_FORMAT_COLOR(f)                         \
+       (PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||   \
+        PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||   \
+        PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||   \
+        PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
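+
+/*
+ * Sketch: the encoding above makes format properties computable from
+ * the code itself. EXAMPLE_FMT is a hypothetical name; it happens to
+ * equal PIXMAN_a1r5g5b5 below.
+ *
+ *     #define EXAMPLE_FMT PIXMAN_FORMAT (16, PIXMAN_TYPE_ARGB, 1, 5, 5, 5)
+ *
+ *     // PIXMAN_FORMAT_BPP (EXAMPLE_FMT)   == 16
+ *     // PIXMAN_FORMAT_DEPTH (EXAMPLE_FMT) == 16   (1 + 5 + 5 + 5)
+ *     // PIXMAN_FORMAT_COLOR (EXAMPLE_FMT) is nonzero (type is ARGB)
+ */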
+
+/* 32bpp formats */
+typedef enum {
+    PIXMAN_a8r8g8b8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
+    PIXMAN_x8r8g8b8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_a8b8g8r8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
+    PIXMAN_x8b8g8r8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
+    PIXMAN_b8g8r8a8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8),
+    PIXMAN_b8g8r8x8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8),
+    PIXMAN_r8g8b8a8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8),
+    PIXMAN_r8g8b8x8 =   PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8),
+    PIXMAN_x14r6g6b6 =  PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6),
+    PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10),
+    PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10),
+    PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10),
+    PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10),
+
+/* 24bpp formats */
+    PIXMAN_r8g8b8 =     PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_b8g8r8 =     PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
+
+/* 16bpp formats */
+    PIXMAN_r5g6b5 =     PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
+    PIXMAN_b5g6r5 =     PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
+
+    PIXMAN_a1r5g5b5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
+    PIXMAN_x1r5g5b5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
+    PIXMAN_a1b5g5r5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
+    PIXMAN_x1b5g5r5 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
+    PIXMAN_a4r4g4b4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
+    PIXMAN_x4r4g4b4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
+    PIXMAN_a4b4g4r4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
+    PIXMAN_x4b4g4r4 =   PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
+
+/* 8bpp formats */
+    PIXMAN_a8 =                 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
+    PIXMAN_r3g3b2 =     PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
+    PIXMAN_b2g3r3 =     PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
+    PIXMAN_a2r2g2b2 =   PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
+    PIXMAN_a2b2g2r2 =   PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
+
+    PIXMAN_c8 =                 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g8 =                 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+    PIXMAN_x4a4 =       PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
+
+    PIXMAN_x4c4 =       PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_x4g4 =       PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 4bpp formats */
+    PIXMAN_a4 =                 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
+    PIXMAN_r1g2b1 =     PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
+    PIXMAN_b1g2r1 =     PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
+    PIXMAN_a1r1g1b1 =   PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
+    PIXMAN_a1b1g1r1 =   PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
+
+    PIXMAN_c4 =                 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g4 =                 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 1bpp formats */
+    PIXMAN_a1 =                 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
+
+    PIXMAN_g1 =                 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* YUV formats */
+    PIXMAN_yuy2 =       PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0),
+    PIXMAN_yv12 =       PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0)
+} pixman_format_code_t;
+
+/* Querying supported format values. */
+pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
+pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format);
+
+/* Constructors */
+pixman_image_t *pixman_image_create_solid_fill       (pixman_color_t               *color);
+pixman_image_t *pixman_image_create_linear_gradient  (pixman_point_fixed_t         *p1,
+                                                     pixman_point_fixed_t         *p2,
+                                                     const pixman_gradient_stop_t *stops,
+                                                     int                           n_stops);
+pixman_image_t *pixman_image_create_radial_gradient  (pixman_point_fixed_t         *inner,
+                                                     pixman_point_fixed_t         *outer,
+                                                     pixman_fixed_t                inner_radius,
+                                                     pixman_fixed_t                outer_radius,
+                                                     const pixman_gradient_stop_t *stops,
+                                                     int                           n_stops);
+pixman_image_t *pixman_image_create_conical_gradient (pixman_point_fixed_t         *center,
+                                                     pixman_fixed_t                angle,
+                                                     const pixman_gradient_stop_t *stops,
+                                                     int                           n_stops);
+pixman_image_t *pixman_image_create_bits             (pixman_format_code_t          format,
+                                                     int                           width,
+                                                     int                           height,
+                                                     uint32_t                     *bits,
+                                                     int                           rowstride_bytes);
+
+/* Destructor */
+pixman_image_t *pixman_image_ref                     (pixman_image_t               *image);
+pixman_bool_t   pixman_image_unref                   (pixman_image_t               *image);
+
+void           pixman_image_set_destroy_function    (pixman_image_t               *image,
+                                                     pixman_image_destroy_func_t   function,
+                                                     void                         *data);
+void *         pixman_image_get_destroy_data        (pixman_image_t               *image);
+
+/* Set properties */
+pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image,
+                                                     pixman_region16_t            *region);
+pixman_bool_t   pixman_image_set_clip_region32       (pixman_image_t               *image,
+                                                     pixman_region32_t            *region);
+void           pixman_image_set_has_client_clip     (pixman_image_t               *image,
+                                                     pixman_bool_t                 client_clip);
+pixman_bool_t   pixman_image_set_transform           (pixman_image_t               *image,
+                                                     const pixman_transform_t     *transform);
+void            pixman_image_set_repeat              (pixman_image_t               *image,
+                                                     pixman_repeat_t               repeat);
+pixman_bool_t   pixman_image_set_filter              (pixman_image_t               *image,
+                                                     pixman_filter_t               filter,
+                                                     const pixman_fixed_t         *filter_params,
+                                                     int                           n_filter_params);
+void           pixman_image_set_source_clipping     (pixman_image_t               *image,
+                                                     pixman_bool_t                 source_clipping);
+void            pixman_image_set_alpha_map           (pixman_image_t               *image,
+                                                     pixman_image_t               *alpha_map,
+                                                     int16_t                       x,
+                                                     int16_t                       y);
+void            pixman_image_set_component_alpha     (pixman_image_t               *image,
+                                                     pixman_bool_t                 component_alpha);
+pixman_bool_t   pixman_image_get_component_alpha     (pixman_image_t               *image);
+void           pixman_image_set_accessors           (pixman_image_t               *image,
+                                                     pixman_read_memory_func_t     read_func,
+                                                     pixman_write_memory_func_t    write_func);
+void           pixman_image_set_indexed             (pixman_image_t               *image,
+                                                     const pixman_indexed_t       *indexed);
+uint32_t       *pixman_image_get_data                (pixman_image_t               *image);
+int            pixman_image_get_width               (pixman_image_t               *image);
+int             pixman_image_get_height              (pixman_image_t               *image);
+int            pixman_image_get_stride              (pixman_image_t               *image); /* in bytes */
+int            pixman_image_get_depth               (pixman_image_t               *image);
+pixman_format_code_t pixman_image_get_format        (pixman_image_t               *image);
+pixman_bool_t  pixman_image_fill_rectangles         (pixman_op_t                   op,
+                                                     pixman_image_t               *image,
+                                                     pixman_color_t               *color,
+                                                     int                           n_rects,
+                                                     const pixman_rectangle16_t   *rects);
+pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op,
+                                                      pixman_image_t               *dest,
+                                                      pixman_color_t               *color,
+                                                      int                           n_boxes,
+                                                      const pixman_box32_t         *boxes);
+
+/* Composite */
+pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
+                                              pixman_image_t    *src_image,
+                                              pixman_image_t    *mask_image,
+                                              pixman_image_t    *dest_image,
+                                              int16_t            src_x,
+                                              int16_t            src_y,
+                                              int16_t            mask_x,
+                                              int16_t            mask_y,
+                                              int16_t            dest_x,
+                                              int16_t            dest_y,
+                                              uint16_t           width,
+                                              uint16_t           height);
+void          pixman_image_composite          (pixman_op_t        op,
+                                              pixman_image_t    *src,
+                                              pixman_image_t    *mask,
+                                              pixman_image_t    *dest,
+                                              int16_t            src_x,
+                                              int16_t            src_y,
+                                              int16_t            mask_x,
+                                              int16_t            mask_y,
+                                              int16_t            dest_x,
+                                              int16_t            dest_y,
+                                              uint16_t           width,
+                                              uint16_t           height);
+void          pixman_image_composite32        (pixman_op_t        op,
+                                              pixman_image_t    *src,
+                                              pixman_image_t    *mask,
+                                              pixman_image_t    *dest,
+                                              int32_t            src_x,
+                                              int32_t            src_y,
+                                              int32_t            mask_x,
+                                              int32_t            mask_y,
+                                              int32_t            dest_x,
+                                              int32_t            dest_y,
+                                              int32_t            width,
+                                              int32_t            height);
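+
+/*
+ * A minimal end-to-end sketch: wrap two caller-initialized a8r8g8b8
+ * buffers and composite one OVER the other. The last argument to
+ * pixman_image_create_bits() is the row stride in bytes.
+ *
+ *     uint32_t src_bits[16 * 16];   // assumed filled with pixel data
+ *     uint32_t dst_bits[32 * 32];
+ *
+ *     pixman_image_t *src = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+ *                                                     16, 16, src_bits,
+ *                                                     16 * 4);
+ *     pixman_image_t *dst = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+ *                                                     32, 32, dst_bits,
+ *                                                     32 * 4);
+ *
+ *     pixman_image_composite32 (PIXMAN_OP_OVER, src, NULL, dst,
+ *                               0, 0,     // src_x, src_y
+ *                               0, 0,     // mask_x, mask_y
+ *                               8, 8,     // dest_x, dest_y
+ *                               16, 16);  // width, height
+ *
+ *     pixman_image_unref (src);
+ *     pixman_image_unref (dst);
+ */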
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
+ * function is a no-op.
+ */
+void pixman_disable_out_of_bounds_workaround (void);
+
+/*
+ * Trapezoids
+ */
+typedef struct pixman_edge pixman_edge_t;
+typedef struct pixman_trapezoid pixman_trapezoid_t;
+typedef struct pixman_trap pixman_trap_t;
+typedef struct pixman_span_fix pixman_span_fix_t;
+typedef struct pixman_triangle pixman_triangle_t;
+
+/*
+ * An edge structure.  This represents a single polygon edge
+ * and can be quickly stepped across small or large gaps in the
+ * sample grid
+ */
+struct pixman_edge
+{
+    pixman_fixed_t     x;
+    pixman_fixed_t     e;
+    pixman_fixed_t     stepx;
+    pixman_fixed_t     signdx;
+    pixman_fixed_t     dy;
+    pixman_fixed_t     dx;
+
+    pixman_fixed_t     stepx_small;
+    pixman_fixed_t     stepx_big;
+    pixman_fixed_t     dx_small;
+    pixman_fixed_t     dx_big;
+};
+
+struct pixman_trapezoid
+{
+    pixman_fixed_t     top, bottom;
+    pixman_line_fixed_t        left, right;
+};
+
+struct pixman_triangle
+{
+    pixman_point_fixed_t p1, p2, p3;
+};
+
+/* whether 't' is a well-defined, not obviously empty trapezoid */
+#define pixman_trapezoid_valid(t)                                 \
+    ((t)->left.p1.y != (t)->left.p2.y &&                          \
+     (t)->right.p1.y != (t)->right.p2.y &&                        \
+     (int) ((t)->bottom - (t)->top) > 0)
+
+struct pixman_span_fix
+{
+    pixman_fixed_t     l, r, y;
+};
+
+struct pixman_trap
+{
+    pixman_span_fix_t  top, bot;
+};
+
+pixman_fixed_t pixman_sample_ceil_y        (pixman_fixed_t             y,
+                                           int                        bpp);
+pixman_fixed_t pixman_sample_floor_y       (pixman_fixed_t             y,
+                                           int                        bpp);
+void           pixman_edge_step            (pixman_edge_t             *e,
+                                           int                        n);
+void           pixman_edge_init            (pixman_edge_t             *e,
+                                           int                        bpp,
+                                           pixman_fixed_t             y_start,
+                                           pixman_fixed_t             x_top,
+                                           pixman_fixed_t             y_top,
+                                           pixman_fixed_t             x_bot,
+                                           pixman_fixed_t             y_bot);
+void           pixman_line_fixed_edge_init (pixman_edge_t             *e,
+                                           int                        bpp,
+                                           pixman_fixed_t             y,
+                                           const pixman_line_fixed_t *line,
+                                           int                        x_off,
+                                           int                        y_off);
+void           pixman_rasterize_edges      (pixman_image_t            *image,
+                                           pixman_edge_t             *l,
+                                           pixman_edge_t             *r,
+                                           pixman_fixed_t             t,
+                                           pixman_fixed_t             b);
+void           pixman_add_traps            (pixman_image_t            *image,
+                                           int16_t                    x_off,
+                                           int16_t                    y_off,
+                                           int                        ntrap,
+                                           pixman_trap_t             *traps);
+void           pixman_add_trapezoids       (pixman_image_t            *image,
+                                           int16_t                    x_off,
+                                           int                        y_off,
+                                           int                        ntraps,
+                                           const pixman_trapezoid_t  *traps);
+void           pixman_rasterize_trapezoid  (pixman_image_t            *image,
+                                           const pixman_trapezoid_t  *trap,
+                                           int                        x_off,
+                                           int                        y_off);
+void          pixman_composite_trapezoids (pixman_op_t                op,
+                                          pixman_image_t *            src,
+                                          pixman_image_t *            dst,
+                                          pixman_format_code_t        mask_format,
+                                          int                         x_src,
+                                          int                         y_src,
+                                          int                         x_dst,
+                                          int                         y_dst,
+                                          int                         n_traps,
+                                          const pixman_trapezoid_t *  traps);
+void          pixman_composite_triangles (pixman_op_t                 op,
+                                         pixman_image_t *             src,
+                                         pixman_image_t *             dst,
+                                         pixman_format_code_t         mask_format,
+                                         int                          x_src,
+                                         int                          y_src,
+                                         int                          x_dst,
+                                         int                          y_dst,
+                                         int                          n_tris,
+                                         const pixman_triangle_t *    tris);
+void         pixman_add_triangles       (pixman_image_t              *image,
+                                         int32_t                      x_off,
+                                         int32_t                      y_off,
+                                         int                          n_tris,
+                                         const pixman_triangle_t     *tris);
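+
+/*
+ * Sketch: rasterize one rectangular trapezoid into an 8-bit alpha
+ * image. All trapezoid coordinates are 16.16 fixed point.
+ *
+ *     uint32_t bits[8 * 32] = { 0 };   // 32x32 a8, stride 32 bytes
+ *     pixman_image_t *mask =
+ *         pixman_image_create_bits (PIXMAN_a8, 32, 32, bits, 32);
+ *
+ *     pixman_trapezoid_t trap = {
+ *         pixman_int_to_fixed (4),  pixman_int_to_fixed (28),  // top, bottom
+ *         { { pixman_int_to_fixed (4),  pixman_int_to_fixed (4)  },
+ *           { pixman_int_to_fixed (4),  pixman_int_to_fixed (28) } },  // left
+ *         { { pixman_int_to_fixed (28), pixman_int_to_fixed (4)  },
+ *           { pixman_int_to_fixed (28), pixman_int_to_fixed (28) } }   // right
+ *     };
+ *
+ *     if (pixman_trapezoid_valid (&trap))
+ *         pixman_rasterize_trapezoid (mask, &trap, 0, 0);
+ *
+ *     pixman_image_unref (mask);
+ */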
+
+PIXMAN_END_DECLS
+
+#endif /* PIXMAN_H__ */
diff --git a/pixman/refactor b/pixman/refactor
new file mode 100644 (file)
index 0000000..52fceab
--- /dev/null
@@ -0,0 +1,478 @@
+Roadmap
+
+- Move all the fetchers etc. into pixman-image to make pixman-compose.c
+  less intimidating.
+
+  DONE
+
+- Make combiners for unified alpha take a mask argument. That way
+  we won't need two separate paths for unified vs component in the
+  general compositing code.
+
+  DONE, except that the Altivec code needs to be updated. Luca is
+  looking into that.
+
+- Delete separate 'unified alpha' path
+  DONE
+
+- Split images into their own files
+
+  DONE
+
+- Split the gradient walker code out into its own file
+
+  DONE
+
+- Add scanline getters per image
+
+  DONE
+
+- Generic 64 bit fetcher 
+
+  DONE
+
+- Split fast path tables into their respective architecture dependent
+  files.
+
+See "Render Algorithm" below for rationale.
+
+Images will eventually have these virtual functions:
+
+       get_scanline()
+       get_scanline_wide()
+       get_pixel()
+       get_pixel_wide()
+       get_untransformed_pixel()
+       get_untransformed_pixel_wide()
+       get_unfiltered_pixel()
+       get_unfiltered_pixel_wide()
+
+       store_scanline()
+       store_scanline_wide()
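+
+       As a rough sketch in C (all names and signatures hypothetical):
+
+               typedef struct image_vtable
+               {
+                   void     (* get_scanline)      (pixman_image_t *image,
+                                                   int x, int y, int width,
+                                                   uint32_t *buffer);
+                   void     (* get_scanline_wide) (pixman_image_t *image,
+                                                   int x, int y, int width,
+                                                   uint64_t *buffer);
+                   uint32_t (* get_pixel)         (pixman_image_t *image,
+                                                   int x, int y);
+                   void     (* store_scanline)    (pixman_image_t *image,
+                                                   int x, int y, int width,
+                                                   const uint32_t *values);
+               } image_vtable_t;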
+
+1.
+
+Initially we will just have get_scanline() and get_scanline_wide();
+these will be based on the ones in pixman-compose. Hopefully this will
+reduce the complexity in pixman_composite_rect_general().
+
+Note that there are access considerations - the compose function is
+being compiled twice (once with memory accessors, once without).
+
+
+2.
+
+Split image types into their own source files. Export noop virtual
+reinit() call.  Call this whenever a property of the image changes.
+
+
+3. 
+
+Split the get_scanline() call into smaller functions that are
+initialized by the reinit() call.
+
+The Render Algorithm:
+       (first repeat, then filter, then transform, then clip)
+
+Starting from a destination pixel (x, y), do
+
+       1 x = x - xDst + xSrc
+         y = y - yDst + ySrc
+
+       2 reject pixel that is outside the clip
+
+       This treats clipping as something that happens after
+       transformation, which I think is correct for client clips. For
+       hierarchy clips it is wrong, but who really cares? Without
+       GraphicsExposes hierarchy clips are basically irrelevant. Yes,
+       you could imagine cases where the pixels of a subwindow of a
+       redirected, transformed window should be treated as
+       transparent. I don't really care
+
+       Basically, I think the render spec should say that pixels that
+       are unavailable due to the hierarchy have undefined content,
+       and that GraphicsExposes are not generated. Ie., basically
+       that using non-redirected windows as sources is broken. This is
+       at least consistent with the current implementation and we can
+       update the spec later if someone makes it work.
+
+       The implication for render is that it should stop passing the
+       hierarchy clip to pixman. In pixman, if a source image has a
+       clip it should be used in computing the composite region and
+       nowhere else, regardless of what "has_client_clip" says. The
+       default should be for there to not be any clip.
+
+       I would really like to get rid of the client clip as well for
+       source images, but unfortunately there is at least one
+       application in the wild that uses them.
+
+       3 Transform pixel: (x, y) = T(x, y)
+
+       4 Call p = GetUntransformedPixel (x, y)
+
+       5 If the image has an alpha map, then
+
+               Call GetUntransformedPixel (x, y) on the alpha map
+               
+               add resulting alpha channel to p
+
+          return p
+
+       Where GetUntransformedPixel is:
+
+       6 switch (filter)
+         {
+         case NEAREST:
+               return GetUnfilteredPixel (x, y);
+               break;
+
+         case BILINEAR:
+               return GetUnfilteredPixel (...) // 4 times 
+               break;
+
+         case CONVOLUTION:
+               return GetUnfilteredPixel (...) // as many times as necessary.
+               break;
+         }
+
+       Where GetUnfilteredPixel (x, y) is
+
+       7 switch (repeat)
+          {
+          case REPEAT_NORMAL:
+          case REPEAT_PAD:
+          case REPEAT_REFLECT:
+               // adjust x, y as appropriate (see the sketch below)
+               break;
+
+          case REPEAT_NONE:
+               if (x, y) is outside image bounds
+                    return 0;
+               break;
+          }
+
+          return GetRawPixel(x, y)
+
+       Where GetRawPixel (x, y) is
+
+       8 Compute the pixel in question, depending on image type.
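+
+       For the coordinate adjustment in step 7, a rough C sketch;
+       repeat_coord is a hypothetical helper and size is the image
+       extent along one axis:
+
+               static int
+               repeat_coord (int coord, int size, pixman_repeat_t repeat)
+               {
+                   switch (repeat)
+                   {
+                   case PIXMAN_REPEAT_NORMAL:
+                       coord %= size;
+                       return coord < 0 ? coord + size : coord;
+
+                   case PIXMAN_REPEAT_PAD:
+                       return coord < 0 ? 0 :
+                              coord >= size ? size - 1 : coord;
+
+                   case PIXMAN_REPEAT_REFLECT:
+                       coord %= 2 * size;
+                       if (coord < 0)
+                           coord += 2 * size;
+                       return coord < size ? coord : 2 * size - 1 - coord;
+
+                   default: /* PIXMAN_REPEAT_NONE is handled by the caller */
+                       return coord;
+                   }
+               }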
+
+For gradients, repeat has a totally different meaning, so
+UnfilteredPixel() and RawPixel() must be the same function so that
+gradients can do their own repeat algorithm.
+
+So, the GetRawPixel
+
+       for bits must deal with repeats
+       for gradients must deal with repeats (differently)
+       for solids, should ignore repeats.
+
+       for polygons, when we add them, either ignore repeats or do
+       something similar to bits (in which case, we may want an extra
+       layer of indirection to modify the coordinates).
+
+It is then possible to build things like "get scanline" or "get tile" on
+top of this. In the simplest case, just repeatedly calling GetPixel()
+would work, but specialized get_scanline()s or get_tile()s could be
+plugged in for common cases. 
+
+By not plugging anything in for images with access functions, we only
+have to compile the pixel functions twice, not the scanline functions.
+
+And we can get rid of fetchers for the bizarre formats that no one
+uses. Such as b2g3r3 etc. r1g2b1? Seriously? It is also worth
+considering a generic format-based pixel fetcher for these edge cases.
+
+Since the actual routines depend on the image attributes, the images
+must be notified when those change and update their function pointers
+appropriately. So there should probably be a virtual function called
+(* reinit) or something like that.
+
+There will also be wide fetchers for both pixels and lines. The line
+fetcher will just call the wide pixel fetcher. The wide pixel fetcher
+will just call expand, except for 10 bit formats.
+
+Rendering pipeline:
+
+Drawable:
+       0. if (picture has alpha map)
+               0.1. Position alpha map according to the alpha_x/alpha_y
+               0.2. Replace the alpha channel of the source with the
+                    one from the alpha map. Replacement only takes
+                    place in the intersection of the two drawables'
+                    geometries.
+       1. Repeat the drawable according to the repeat attribute
+       2. Reconstruct a continuous image according to the filter
+       3. Transform according to the transform attribute
+       4. Position image such that src_x, src_y is over dst_x, dst_y
+       5. Sample once per destination pixel 
+       6. Clip. If a pixel is not within the source clip, then no
+          compositing takes place at that pixel. (Ie., it's *not*
+          treated as 0).
+
+       Sampling a drawable: 
+
+       - If the drawable does not have an alpha channel, the pixels in it
+         are treated as opaque.
+
+       Note on reconstruction:
+
+       - The top left pixel has coordinates (0.5, 0.5) and pixels are
+         spaced 1 apart.
+
+Gradient:
+       1. Unless gradient type is conical, repeat the underlying (0, 1)
+               gradient according to the repeat attribute
+       2. Integrate the gradient across the plane according to type.
+       3. Transform according to transform attribute
+       4. Position gradient 
+       5. Sample once per destination pixel.
+       6. Clip
+
+Solid Fill:
+       1. Repeat has no effect
+       2. Image is already continuous and defined for the entire plane
+       3. Transform has no effect
+       4. Positioning has no effect
+       5. Sample once per destination pixel.
+       6. Clip
+
+Polygon:
+       1. Repeat has no effect
+       2. Image is already continuous and defined on the whole plane
+       3. Transform according to transform attribute
+       4. Position image
+       5. Supersample 15x17 per destination pixel.
+       6. Clip
+
+Possibly interesting additions:
+       - More general transformations, such as warping, or general
+         shading.
+
+       - Shader image where a function is called to generate the
+          pixel (ie., uploading assembly code).
+
+       - Resampling kernels
+
+         In principle the polygon image uses a 15x17 box filter for
+         resampling. If we allow general resampling filters, then we
+         get all the various antialiasing types for free. 
+
+         Bilinear downsampling looks terrible and could be much 
+         improved by a resampling filter. NEAREST reconstruction
+         combined with a box resampling filter is what GdkPixbuf
+         does, I believe.
+
+         Useful for high frequency gradients as well.
+
+         (Note that the difference between a reconstruction and a
+         resampling filter is mainly where in the pipeline they
+         occur. High quality resampling should use a correctly
+         oriented kernel so it should happen after transformation.
+
+         An implementation can transform the resampling kernel and
+         convolve it with the reconstruction if it so desires, but it
+         will need to deal with the fact that the resampling kernel
+         will not necessarily be pixel aligned.
+
+         "Output kernels"
+
+         One could imagine doing the resampling after compositing,
+         ie., for each destination pixel sample each source image 16
+         times, then composite those subpixels individually, then
+         finally apply a kernel.
+
+         However, this is effectively the same as full screen
+         antialiasing, which is a simpler way to think about it. So
+         resampling kernels may make sense for individual images, but
+         not as a post-compositing step.
+         
+         Fullscreen AA is inefficient without chained compositing
+         though. Consider an (image scaled up to oversample size IN
+         some polygon) scaled down to screen size. With the current
+         implementation, there will be a huge temporary. With chained
+         compositing, the whole thing ends up being equivalent to the
+         output kernel from above.
+
+       - Color space conversion
+
+         The complete model here is that each surface has a color
+         space associated with it and that the compositing operation
+         also has one associated with it. Note also that gradients
+         should have associated colorspaces.
+
+       - Dithering
+
+         If people dither something that is already dithered, it will
+         look terrible, but don't do that, then. (Dithering happens
+         after resampling if at all - what is the relationship
+         with color spaces? Presumably dithering should happen in linear
+         intensity space).
+
+       - Floating point surfaces, 16, 32 and possibly 64 bit per
+         channel.
+
+       Maybe crack:
+
+       - Glyph polygons
+
+         If glyphs could be given as polygons, they could be
+         positioned and rasterized more accurately. The glyph
+         structure would need subpixel positioning though.
+
+       - Luminance vs. coverage for the alpha channel
+
+         Whether the alpha channel should be interpreted as luminance
+         modulation or as coverage (intensity modulation). This is a
+         bit of a departure from the rendering model, though. It could
+         also be considered whether it should be possible to have
+         both kinds of channel in the same drawable.
+
+       - Alternative for component alpha
+
+         - Set component-alpha on the output image.
+
+           - This means each of the components is sampled
+             independently and composited in the corresponding
+             channel only.
+
+         - Have a 3x oversampled mask
+
+         - Scale it down by 3 horizontally, with a [ 1/3, 1/3, 1/3 ]
+           resampling filter (sketched below).
+
+           Is this equivalent to just using a component-alpha mask?
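+
+           The downscale step, sketched (an assumption, not pixman
+           code): reduce a 3x horizontally oversampled a8 mask with
+           a [ 1/3, 1/3, 1/3 ] resampling filter, one output sample
+           per destination pixel.
+
+               #include <stdint.h>
+
+               static void
+               downscale_mask_3x (const uint8_t *over, /* 3 * width */
+                                  uint8_t *mask, int width)
+               {
+                   int x;
+
+                   for (x = 0; x < width; x++)
+                   {
+                       mask[x] = (over[3 * x + 0] +
+                                  over[3 * x + 1] +
+                                  over[3 * x + 2]) / 3;
+                   }
+               }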
+
+       Incompatible changes:
+
+       - Gradients could be specified with premultiplied colors. (You
+         can use a mask to get things like gradients from solid red to
+         transparent red.)
+
+Refactoring pixman
+
+The pixman code is not particularly nice, to put it mildly. Among the
+issues are:
+
+- inconsistent naming style (fb vs Fb, camelCase vs
+  underscore_naming). Sometimes there is even inconsistency *within*
+  one name.
+
+      fetchProc32 ACCESS(pixman_fetchProcForPicture32)
+
+  may be one of the ugliest names ever created.
+
+  coding style: 
+        use the one from cairo except that pixman uses this brace style:
+        
+               while (blah)
+               {
+               }
+
+       Format do/while like this:
+
+              do 
+              {
+
+              } 
+              while (...);
+
+- PIXMAN_COMPOSITE_RECT_GENERAL() is horribly complex
+
+- switch/case logic in pixman-access.c
+
+  Instead it would be better to just store function pointers in the
+  image objects themselves (see the sketch below):
+
+       get_pixel()
+       get_scanline()
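+
+  A sketch of the idea (the names are illustrative, not the eventual
+  pixman API):
+
+       #include <stdint.h>
+
+       typedef struct image image_t;
+
+       struct image
+       {
+           uint32_t (* get_pixel)    (image_t *image, int x, int y);
+           void     (* get_scanline) (image_t *image, int x, int y,
+                                      int width, uint32_t *buffer);
+           /* format-specific fields follow */
+       };
+
+  Each format would install its own fetchers at image creation time,
+  instead of being looked up through switch (format) on every access.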
+
+- Much of the scanline fetching code is for formats that no one
+  ever uses; a2r2g2b2, anyone?
+
+  It would probably be worthwhile having a generic fetcher for any
+  pixman format whatsoever.
+
+- Code related to particular image types should be split into individual
+  files.
+
+       pixman-bits-image.c
+       pixman-linear-gradient-image.c
+       pixman-radial-gradient-image.c
+       pixman-solid-image.c
+
+- Fast path code should be split into files based on architecture:
+
+       pixman-mmx-fastpath.c
+       pixman-sse2-fastpath.c
+       pixman-c-fastpath.c
+
+       etc.
+
+  Each of these files should then export a fastpath table, which would
+  be declared in pixman-private.h. This should allow us to get rid
+  of the pixman-mmx.h files.
+
+  The fast path table should describe each fast path. I.e., there
+  should be bitfields indicating what the fast path can handle, rather
+  than, as now, allowing only one format per src/mask/dest. E.g.,
+
+  { 
+    FAST_a8r8g8b8 | FAST_x8r8g8b8,
+    FAST_null,
+    FAST_x8r8g8b8,
+    FAST_repeat_normal | FAST_repeat_none,
+    the_fast_path
+  }
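+
+  Sketched more concretely (an assumption - the FAST_* flags and the
+  field names are illustrative):
+
+       #include <stdint.h>
+
+       typedef struct
+       {
+           uint32_t src_formats;   /* FAST_a8r8g8b8 | FAST_x8r8g8b8 */
+           uint32_t mask_formats;  /* FAST_null when no mask */
+           uint32_t dest_formats;
+           uint32_t repeat_modes;  /* FAST_repeat_normal | ... */
+           void (* func) (void);   /* the fast path itself */
+       } fast_path_entry_t;
+
+  Lookup then walks the table matching with bit tests rather than
+  exact format comparisons.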
+
+There should then be *one* file that implements pixman_image_composite().
+It should do the following (sketched in code below):
+
+     optimize_operator();
+
+     convert 1x1 repeat to solid (actually this should be done at
+     image creation time).
+     
+     is there a useful fastpath?
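+
+A sketch of that dispatch (an assumption - the helper names are
+illustrative, not existing pixman functions):
+
+     typedef struct image image_t;
+     typedef void (* fast_path_func_t) (int op, image_t *src,
+                                        image_t *mask, image_t *dest);
+
+     /* assumed helpers, defined elsewhere */
+     int              optimize_operator (int op, image_t *src,
+                                         image_t *mask, image_t *dest);
+     fast_path_func_t lookup_fast_path  (int op, image_t *src,
+                                         image_t *mask, image_t *dest);
+     void             composite_general (int op, image_t *src,
+                                         image_t *mask, image_t *dest);
+
+     void
+     image_composite_sketch (int op, image_t *src,
+                             image_t *mask, image_t *dest)
+     {
+         fast_path_func_t func;
+
+         op = optimize_operator (op, src, mask, dest);
+
+         /* (converting 1x1 repeats to solid should already have
+          * happened at image creation time)
+          */
+         if ((func = lookup_fast_path (op, src, mask, dest)))
+             func (op, src, mask, dest);
+         else
+             composite_general (op, src, mask, dest);
+     }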
+
+There should be a file called pixman-cpu.c that contains all the
+architecture specific stuff to detect what CPU features we have.
+
+Issues that must be kept in mind:
+
+       - we need accessor code to be preserved
+
+       - maybe there should be a "store_scanline" too?
+
+         Is this sufficient?
+
+        We should preserve the optimization where the
+        compositing happens directly in the destination
+        whenever possible.
+
+       - It should be possible to create GPU samplers from the
+         images.
+
+The "horizontal" classification should be a bit in the image, the
+"vertical" classification should just happen inside the gradient
+file. Note though that
+
+      (a) these will change if the transformation/repeat changes.
+
+      (b) at the moment the optimization for linear gradients
+          takes the source rectangle into account. Presumably
+         this is to also optimize the case where the gradient
+         is close enough to horizontal?
+
+Who is responsible for repeats? In principle it should be the scanline
+fetch. Right now NORMAL repeats are handled by walk_composite_region()
+while other repeats are handled by the scanline code.
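+
+A sketch of scanline-level repeat handling (an assumption, not current
+pixman code): map an arbitrary x into [0, width) per repeat mode.
+
+     static int
+     repeat_coord (int x, int width, int mode)
+     {
+         switch (mode)
+         {
+         case 0:                     /* NORMAL: tile */
+             x %= width;
+             return x < 0 ? x + width : x;
+
+         case 1:                     /* PAD: clamp to the edge */
+             return x < 0 ? 0 : (x >= width ? width - 1 : x);
+
+         default:                    /* REFLECT: mirror */
+             x %= 2 * width;
+             if (x < 0)
+                 x += 2 * width;
+             return x < width ? x : 2 * width - 1 - x;
+         }
+     }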
+
+
+(Random note on filtering: do you filter before or after
+transformation? Hardware is going to filter after transformation;
+this is also what pixman does currently.) It's not completely clear
+what filtering *after* transformation means. One thing that might look
+good would be to do *supersampling*, i.e., compute multiple subpixels
+per destination pixel, then average them together.
diff --git a/pixman/solaris-hwcap.mapfile b/pixman/solaris-hwcap.mapfile
new file mode 100644 (file)
index 0000000..87efce1
--- /dev/null
@@ -0,0 +1,30 @@
+###############################################################################
+#
+# Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+#
+# Override the linker's detection of CMOV/MMX/SSE instructions so this
+# library isn't flagged as only usable on CPUs with those ISAs, since it
+# checks at runtime for availability before calling them.
+
+hwcap_1 = V0x0 FPU OVERRIDE;
diff --git a/test/Makefile.am b/test/Makefile.am
new file mode 100755 (executable)
index 0000000..eeb3679
--- /dev/null
@@ -0,0 +1,13 @@
+include $(top_srcdir)/test/Makefile.sources
+
+AM_CFLAGS = $(OPENMP_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS)
+INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
+
+libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
+
+noinst_LTLIBRARIES = libutils.la
+noinst_PROGRAMS = $(TESTPROGRAMS) $(BENCHMARKS)
+
+TESTS = $(TESTPROGRAMS)
diff --git a/test/Makefile.sources b/test/Makefile.sources
new file mode 100644 (file)
index 0000000..99eb705
--- /dev/null
@@ -0,0 +1,36 @@
+# Tests (sorted by expected completion time)
+TESTPROGRAMS =                 \
+       a1-trap-test            \
+       pdf-op-test             \
+       region-test             \
+       region-translate-test   \
+       fetch-test              \
+       oob-test                \
+       trap-crasher            \
+       alpha-loop              \
+       scaling-crash-test      \
+       scaling-helpers-test    \
+       gradient-crash-test     \
+       region-contains-test    \
+       alphamap                \
+       stress-test             \
+       composite-traps-test    \
+       blitters-test           \
+       scaling-test            \
+       affine-test             \
+       composite               \
+       $(NULL)
+
+# Benchmarks
+BENCHMARKS =                   \
+       lowlevel-blt-bench      \
+       $(NULL)
+
+# Utility functions
+libutils_sources =             \
+       utils.c                 \
+       $(NULL)
+
+libutils_headers =             \
+       utils.h                 \
+       $(NULL)
diff --git a/test/Makefile.win32 b/test/Makefile.win32
new file mode 100755 (executable)
index 0000000..307ba0c
--- /dev/null
@@ -0,0 +1,31 @@
+default: all
+
+top_srcdir = ..
+include $(top_srcdir)/test/Makefile.sources
+include $(top_srcdir)/Makefile.win32.common
+
+TEST_LDADD = \
+       $(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib \
+       $(CFG_VAR)/libutils.lib \
+       $(NULL)
+
+libutils_OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libutils_sources))
+
+SOURCES = $(patsubst %,   %.c,              $(TESTPROGRAMS) $(BENCHMARKS))
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
+TESTS   = $(patsubst %,   $(CFG_VAR)/%.exe, $(TESTPROGRAMS))
+BENCHS  = $(patsubst %,   $(CFG_VAR)/%.exe, $(BENCHMARKS))
+
+all: inform $(TESTS) $(BENCHS)
+
+check: inform $(TESTS)
+       @for test in $(TESTS) ; do ./$$test && echo "PASS: $$test" || echo "FAIL: $$test" ; done
+
+$(CFG_VAR)/libutils.lib: $(libutils_OBJECTS)
+       @$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
+
+$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj $(TEST_LDADD)
+       @$(LD) $(PIXMAN_LDFLAGS) -OUT:$@ $^
+
+$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib:
+       @$(MAKE) -C $(top_builddir)/pixman -f Makefile.win32
diff --git a/test/a1-trap-test.c b/test/a1-trap-test.c
new file mode 100644 (file)
index 0000000..6163e7c
--- /dev/null
@@ -0,0 +1,50 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 20
+#define HEIGHT 20
+
+    pixman_image_t *src_img;
+    pixman_image_t *mask_img;
+    pixman_image_t *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);
+
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    
+    trap.top.l = pixman_double_to_fixed (0.5);
+    trap.top.r = pixman_double_to_fixed (1.5);
+    trap.top.y = pixman_double_to_fixed (0.5);
+
+    trap.bot.l = pixman_double_to_fixed (0.5);
+    trap.bot.r = pixman_double_to_fixed (1.5);
+    trap.bot.y = pixman_double_to_fixed (1.5);
+
+    mask_img = pixman_image_create_bits (
+       PIXMAN_a1, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&red);
+    dest_img = pixman_image_create_bits (
+       PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+
+    pixman_image_composite (PIXMAN_OP_OVER,
+                           src_img, mask_img, dest_img,
+                           0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    assert (bits[0] == 0xffff0000);
+    assert (bits[1] == 0xffffffff);
+    assert (bits[1 * WIDTH + 0] == 0xffffffff);
+    assert (bits[1 * WIDTH + 1] == 0xffffffff);
+    
+    return 0;
+}
diff --git a/test/affine-test.c b/test/affine-test.c
new file mode 100755 (executable)
index 0000000..a4ceed3
--- /dev/null
@@ -0,0 +1,311 @@
+/*
+ * Test program, which can detect some problems with affine transformations
+ * in pixman. Testing is done by running lots of random SRC and OVER
+ * compositing operations with a8r8g8b8, x8r8g8b8 and r5g6b5 color formats,
+ * using random scaled, rotated and translated transforms.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  16
+#define MAX_SRC_HEIGHT 16
+#define MAX_DST_WIDTH  16
+#define MAX_DST_HEIGHT 16
+#define MAX_STRIDE     4
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+               int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    pixman_region16_t  clip;
+    int                src_width, src_height;
+    int                dst_width, dst_height;
+    int                src_stride, dst_stride;
+    int                src_x, src_y;
+    int                dst_x, dst_y;
+    int                src_bpp;
+    int                dst_bpp;
+    int                w, h;
+    pixman_fixed_t     scale_x = 65536, scale_y = 65536;
+    pixman_fixed_t     translate_x = 0, translate_y = 0;
+    pixman_op_t        op;
+    pixman_repeat_t    repeat = PIXMAN_REPEAT_NONE;
+    pixman_format_code_t src_fmt, dst_fmt;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    uint32_t           crc32;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+
+    src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER;
+
+    src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+    src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+    dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+    dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+    src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+    dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+
+    if (src_stride & 3)
+       src_stride += 2;
+
+    if (dst_stride & 3)
+       dst_stride += 2;
+
+    src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+    src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+    dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+    dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+    w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
+    h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
+
+    srcbuf = (uint32_t *)malloc (src_stride * src_height);
+    dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+    for (i = 0; i < src_stride * src_height; i++)
+       *((uint8_t *)srcbuf + i) = lcg_rand_n (256);
+
+    for (i = 0; i < dst_stride * dst_height; i++)
+       *((uint8_t *)dstbuf + i) = lcg_rand_n (256);
+
+    src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    src_img = pixman_image_create_bits (
+        src_fmt, src_width, src_height, srcbuf, src_stride);
+
+    dst_img = pixman_image_create_bits (
+        dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+    image_endian_swap (src_img);
+    image_endian_swap (dst_img);
+
+    pixman_transform_init_identity (&transform);
+
+    if (lcg_rand_n (3) > 0)
+    {
+       scale_x = -65536 * 3 + lcg_rand_N (65536 * 6);
+       if (lcg_rand_n (2))
+           scale_y = -65536 * 3 + lcg_rand_N (65536 * 6);
+       else
+           scale_y = scale_x;
+       pixman_transform_init_scale (&transform, scale_x, scale_y);
+    }
+    if (lcg_rand_n (3) > 0)
+    {
+       translate_x = -65536 * 3 + lcg_rand_N (6 * 65536);
+       if (lcg_rand_n (2))
+           translate_y = -65536 * 3 + lcg_rand_N (6 * 65536);
+       else
+           translate_y = translate_x;
+       pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+    }
+
+    if (lcg_rand_n (4) > 0)
+    {
+       int c, s, tx = 0, ty = 0;
+       switch (lcg_rand_n (4))
+       {
+       case 0:
+           /* 90 degrees */
+           c = 0;
+           s = pixman_fixed_1;
+           tx = pixman_int_to_fixed (MAX_SRC_HEIGHT);
+           break;
+       case 1:
+           /* 180 degrees */
+           c = -pixman_fixed_1;
+           s = 0;
+           tx = pixman_int_to_fixed (MAX_SRC_WIDTH);
+           ty = pixman_int_to_fixed (MAX_SRC_HEIGHT);
+           break;
+       case 2:
+           /* 270 degrees */
+           c = 0;
+           s = -pixman_fixed_1;
+           ty = pixman_int_to_fixed (MAX_SRC_WIDTH);
+           break;
+       default:
+           /* arbitrary rotation */
+           c = lcg_rand_N (2 * 65536) - 65536;
+           s = lcg_rand_N (2 * 65536) - 65536;
+           break;
+       }
+       pixman_transform_rotate (&transform, NULL, c, s);
+       pixman_transform_translate (&transform, NULL, tx, ty);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+       /* Flip random bits */
+       int maxflipcount = 8;
+       while (maxflipcount--)
+       {
+           int i = lcg_rand_n (2);
+           int j = lcg_rand_n (3);
+           int bitnum = lcg_rand_n (32);
+           transform.matrix[i][j] ^= 1 << bitnum;
+           if (lcg_rand_n (2))
+               break;
+       }
+    }
+
+    pixman_image_set_transform (src_img, &transform);
+
+    switch (lcg_rand_n (4))
+    {
+    case 0:
+       repeat = PIXMAN_REPEAT_NONE;
+       break;
+
+    case 1:
+       repeat = PIXMAN_REPEAT_NORMAL;
+       break;
+
+    case 2:
+       repeat = PIXMAN_REPEAT_PAD;
+       break;
+
+    case 3:
+       repeat = PIXMAN_REPEAT_REFLECT;
+       break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (src_img, repeat);
+
+    if (lcg_rand_n (2))
+       pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+       pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (verbose)
+    {
+       printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
+       printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
+               op, scale_x, scale_y, repeat);
+       printf ("translate_x=%d, translate_y=%d\n",
+               translate_x, translate_y);
+       printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+               src_width, src_height, dst_width, dst_height);
+       printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+               src_x, src_y, dst_x, dst_y);
+       printf ("w=%d, h=%d\n", w, h);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+       pixman_box16_t clip_boxes[2];
+       int            n = lcg_rand_n (2) + 1;
+
+       for (i = 0; i < n; i++)
+       {
+           clip_boxes[i].x1 = lcg_rand_n (src_width);
+           clip_boxes[i].y1 = lcg_rand_n (src_height);
+           clip_boxes[i].x2 =
+               clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+           clip_boxes[i].y2 =
+               clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+
+           if (verbose)
+           {
+               printf ("source clip box: [%d,%d-%d,%d]\n",
+                       clip_boxes[i].x1, clip_boxes[i].y1,
+                       clip_boxes[i].x2, clip_boxes[i].y2);
+           }
+       }
+
+       pixman_region_init_rects (&clip, clip_boxes, n);
+       pixman_image_set_clip_region (src_img, &clip);
+       pixman_image_set_source_clipping (src_img, 1);
+       pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+       pixman_box16_t clip_boxes[2];
+       int            n = lcg_rand_n (2) + 1;
+       for (i = 0; i < n; i++)
+       {
+           clip_boxes[i].x1 = lcg_rand_n (dst_width);
+           clip_boxes[i].y1 = lcg_rand_n (dst_height);
+           clip_boxes[i].x2 =
+               clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+           clip_boxes[i].y2 =
+               clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+           if (verbose)
+           {
+               printf ("destination clip box: [%d,%d-%d,%d]\n",
+                       clip_boxes[i].x1, clip_boxes[i].y1,
+                       clip_boxes[i].x2, clip_boxes[i].y2);
+           }
+       }
+       pixman_region_init_rects (&clip, clip_boxes, n);
+       pixman_image_set_clip_region (dst_img, &clip);
+       pixman_region_fini (&clip);
+    }
+
+    pixman_image_composite (op, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+
+    if (dst_fmt == PIXMAN_x8r8g8b8)
+    {
+       /* ignore unused part */
+       for (i = 0; i < dst_stride * dst_height / 4; i++)
+           dstbuf[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+       int j;
+
+       for (i = 0; i < dst_height; i++)
+       {
+           for (j = 0; j < dst_stride; j++)
+               printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+
+           printf ("\n");
+       }
+    }
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+    free (srcbuf);
+    free (dstbuf);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    pixman_disable_out_of_bounds_workaround ();
+
+    return fuzzer_test_main ("affine", 8000000, 0x1EF2175A,
+                            test_composite, argc, argv);
+}
diff --git a/test/alpha-loop.c b/test/alpha-loop.c
new file mode 100644 (file)
index 0000000..e4d90a9
--- /dev/null
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 400
+#define HEIGHT 200
+
+int
+main (int argc, char **argv)
+{
+    uint8_t *alpha = make_random_bytes (WIDTH * HEIGHT);
+    uint32_t *src = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+    uint32_t *dest = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+
+    pixman_image_t *a = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, (uint32_t *)alpha, WIDTH);
+    pixman_image_t *d = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+    pixman_image_t *s = pixman_image_create_bits (PIXMAN_a2r10g10b10, WIDTH, HEIGHT, src, WIDTH * 4);
+
+    fail_after (5, "Infinite loop detected: 5 seconds without progress\n");
+
+    pixman_image_set_alpha_map (s, a, 0, 0);
+    pixman_image_set_alpha_map (a, s, 0, 0);
+
+    pixman_image_composite (PIXMAN_OP_SRC, s, NULL, d, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    pixman_image_unref (s);
+
+    return 0;
+}
diff --git a/test/alphamap.c b/test/alphamap.c
new file mode 100644 (file)
index 0000000..554b309
--- /dev/null
@@ -0,0 +1,256 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 100
+#define HEIGHT 100
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_a8
+};
+
+static const pixman_format_code_t alpha_formats[] =
+{
+    PIXMAN_null,
+    PIXMAN_a8,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a4r4g4b4
+};
+
+static const int origins[] =
+{
+    0, 10, -100
+};
+
+static const char *
+format_name (pixman_format_code_t format)
+{
+    if (format == PIXMAN_a8)
+       return "a8";
+    else if (format == PIXMAN_a2r10g10b10)
+       return "a2r10g10b10";
+    else if (format == PIXMAN_a8r8g8b8)
+       return "a8r8g8b8";
+    else if (format == PIXMAN_a4r4g4b4)
+       return "a4r4g4b4";
+    else if (format == PIXMAN_null)
+       return "none";
+    else
+       assert (0);
+
+    return "<unknown - bug in alphamap.c>";
+}
+
+static void
+on_destroy (pixman_image_t *image, void *data)
+{
+    uint32_t *bits = pixman_image_get_data (image);
+
+    fence_free (bits);
+}
+
+static pixman_image_t *
+make_image (pixman_format_code_t format)
+{
+    uint32_t *bits;
+    uint8_t bpp = PIXMAN_FORMAT_BPP (format) / 8;
+    pixman_image_t *image;
+
+    bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+
+    image = pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
+
+    if (image && bits)
+       pixman_image_set_destroy_function (image, on_destroy, NULL);
+
+    return image;
+}
+
+static pixman_image_t *
+create_image (pixman_format_code_t format, pixman_format_code_t alpha_format,
+             int alpha_origin_x, int alpha_origin_y)
+{
+    pixman_image_t *image = make_image (format);
+
+    if (alpha_format != PIXMAN_null)
+    {
+       pixman_image_t *alpha = make_image (alpha_format);
+
+       pixman_image_set_alpha_map (image, alpha,
+                                   alpha_origin_x, alpha_origin_y);
+       pixman_image_unref (alpha);
+    }
+
+    return image;
+}
+
+static uint8_t
+get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
+{
+    uint8_t *bits;
+    uint8_t r;
+
+    if (image->common.alpha_map)
+    {
+       if (x - orig_x >= 0 && x - orig_x < WIDTH &&
+           y - orig_y >= 0 && y - orig_y < HEIGHT)
+       {
+           image = (pixman_image_t *)image->common.alpha_map;
+
+           x -= orig_x;
+           y -= orig_y;
+       }
+       else
+       {
+           return 0;
+       }
+    }
+
+    bits = (uint8_t *)image->bits.bits;
+
+    if (image->bits.format == PIXMAN_a8)
+    {
+       r = bits[y * WIDTH + x];
+    }
+    else if (image->bits.format == PIXMAN_a2r10g10b10)
+    {
+       r = ((uint32_t *)bits)[y * WIDTH + x] >> 30;
+       r |= r << 2;
+       r |= r << 4;
+    }
+    else if (image->bits.format == PIXMAN_a8r8g8b8)
+    {
+       r = ((uint32_t *)bits)[y * WIDTH + x] >> 24;
+    }
+    else if (image->bits.format == PIXMAN_a4r4g4b4)
+    {
+       r = ((uint16_t *)bits)[y * WIDTH + x] >> 12;
+       r |= r << 4;
+    }
+    else
+    {
+       assert (0);
+    }
+
+    return r;
+}
+
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+
+static int
+run_test (int s, int d, int sa, int da, int soff, int doff)
+{
+    pixman_format_code_t sf = formats[s];
+    pixman_format_code_t df = formats[d];
+    pixman_format_code_t saf = alpha_formats[sa];
+    pixman_format_code_t daf = alpha_formats[da];
+    pixman_image_t *src, *dst, *orig_dst;
+    pixman_transform_t t1;
+    int j, k;
+    int n_alpha_bits;
+
+    soff = origins[soff];
+    doff = origins[doff];
+
+    n_alpha_bits = PIXMAN_FORMAT_A (df);
+    if (daf != PIXMAN_null)
+       n_alpha_bits = PIXMAN_FORMAT_A (daf);
+
+
+    src = create_image (sf, saf, soff, soff);
+    orig_dst = create_image (df, daf, doff, doff);
+    dst = create_image (df, daf, doff, doff);
+
+    /* Transformations, repeats and filters on destinations should be ignored,
+     * so just set some random ones.
+     */
+    pixman_transform_init_identity (&t1);
+    pixman_transform_scale (&t1, NULL, pixman_int_to_fixed (100), pixman_int_to_fixed (11));
+    pixman_transform_rotate (&t1, NULL, pixman_double_to_fixed (0.5), pixman_double_to_fixed (0.11));
+    pixman_transform_translate (&t1, NULL, pixman_int_to_fixed (11), pixman_int_to_fixed (17));
+
+    pixman_image_set_transform (dst, &t1);
+    pixman_image_set_filter (dst, PIXMAN_FILTER_BILINEAR, NULL, 0);
+    pixman_image_set_repeat (dst, PIXMAN_REPEAT_REFLECT);
+
+    pixman_image_composite (PIXMAN_OP_SRC, orig_dst, NULL, dst,
+                           0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    pixman_image_composite (PIXMAN_OP_ADD, src, NULL, dst,
+                           0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    for (j = MAX (doff, 0); j < MIN (HEIGHT, HEIGHT + doff); ++j)
+    {
+       for (k = MAX (doff, 0); k < MIN (WIDTH, WIDTH + doff); ++k)
+       {
+           uint8_t sa, da, oda, ref;
+
+           sa = get_alpha (src, k, j, soff, soff);
+           da = get_alpha (dst, k, j, doff, doff);
+           oda = get_alpha (orig_dst, k, j, doff, doff);
+
+           if (sa + oda > 255)
+               ref = 255;
+           else
+               ref = sa + oda;
+
+           if (da >> (8 - n_alpha_bits) != ref >> (8 - n_alpha_bits))
+           {
+               printf ("\nWrong alpha value at (%d, %d). Should be 0x%x; got 0x%x. Source was 0x%x, original dest was 0x%x\n",
+                       k, j, ref, da, sa, oda);
+
+               printf ("src: %s, alpha: %s, origin %d %d\ndst: %s, alpha: %s, origin: %d %d\n\n",
+                       format_name (sf),
+                       format_name (saf),
+                       soff, soff,
+                       format_name (df),
+                       format_name (daf),
+                       doff, doff);
+               return 1;
+           }
+       }
+    }
+
+    pixman_image_set_alpha_map (src, NULL, 0, 0);
+    pixman_image_set_alpha_map (dst, NULL, 0, 0);
+    pixman_image_set_alpha_map (orig_dst, NULL, 0, 0);
+
+    pixman_image_unref (src);
+    pixman_image_unref (dst);
+    pixman_image_unref (orig_dst);
+
+    return 0;
+}
+
+int
+main (int argc, char **argv)
+{
+    int i, j, a, b, x, y;
+
+    for (i = 0; i < ARRAY_LENGTH (formats); ++i)
+    {
+       for (j = 0; j < ARRAY_LENGTH (formats); ++j)
+       {
+           for (a = 0; a < ARRAY_LENGTH (alpha_formats); ++a)
+           {
+               for (b = 0; b < ARRAY_LENGTH (alpha_formats); ++b)
+               {
+                   for (x = 0; x < ARRAY_LENGTH (origins); ++x)
+                   {
+                       for (y = 0; y < ARRAY_LENGTH (origins); ++y)
+                       {
+                           if (run_test (i, j, a, b, x, y) != 0)
+                               return 1;
+                       }
+                   }
+               }
+           }
+       }
+    }
+
+    return 0;
+}
diff --git a/test/blitters-test.c b/test/blitters-test.c
new file mode 100755 (executable)
index 0000000..4f931c4
--- /dev/null
@@ -0,0 +1,430 @@
+/*
+ * Test program, which stresses the use of different color formats and
+ * compositing operations.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static pixman_indexed_t rgb_palette[9];
+static pixman_indexed_t y_palette[9];
+
+/* The first eight formats in the list are by far the most widely
+ * used formats, so we test those more than the others
+ */
+#define N_MOST_LIKELY_FORMATS 8
+
+/* Create random image for testing purposes */
+static pixman_image_t *
+create_random_image (pixman_format_code_t *allowed_formats,
+                    int                   max_width,
+                    int                   max_height,
+                    int                   max_extra_stride,
+                    pixman_format_code_t *used_fmt)
+{
+    int n = 0, i, width, height, stride;
+    pixman_format_code_t fmt;
+    uint32_t *buf;
+    pixman_image_t *img;
+
+    while (allowed_formats[n] != PIXMAN_null)
+       n++;
+
+    if (n > N_MOST_LIKELY_FORMATS && lcg_rand_n (4) != 0)
+       n = N_MOST_LIKELY_FORMATS;
+    fmt = allowed_formats[lcg_rand_n (n)];
+
+    width = lcg_rand_n (max_width) + 1;
+    height = lcg_rand_n (max_height) + 1;
+    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
+       lcg_rand_n (max_extra_stride + 1);
+    stride = (stride + 3) & ~3;
+
+    /* do the allocation */
+    buf = aligned_malloc (64, stride * height);
+
+    /* initialize image with random data */
+    for (i = 0; i < stride * height; i++)
+    {
+       /* generation is biased to having more 0 or 255 bytes as
+        * they are more likely to be special-cased in code
+        */
+       *((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) :
+           (lcg_rand_n (2) ? 0 : 255);
+    }
+
+    img = pixman_image_create_bits (fmt, width, height, buf, stride);
+
+    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+    {
+       pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+    else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+    {
+       pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+
+    if (lcg_rand_n (16) == 0)
+       pixman_image_set_filter (img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    image_endian_swap (img);
+
+    if (used_fmt) *used_fmt = fmt;
+    return img;
+}
+
+/* Free random image, and optionally update crc32 based on its data */
+static uint32_t
+free_random_image (uint32_t initcrc,
+                  pixman_image_t *img,
+                  pixman_format_code_t fmt)
+{
+    uint32_t crc32 = 0;
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+
+    if (fmt != PIXMAN_null)
+    {
+       /* mask unused 'x' part */
+       if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
+           PIXMAN_FORMAT_DEPTH (fmt) != 0)
+       {
+           int i;
+           uint32_t *data = pixman_image_get_data (img);
+           uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+
+           if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
+               PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA)
+           {
+               mask <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt));
+           }
+
+           for (i = 0; i < 32; i++)
+               mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
+
+           for (i = 0; i < stride * height / 4; i++)
+               data[i] &= mask;
+       }
+
+       /* swap endianness in order to provide identical results on both big
+        * and little endian systems
+        */
+       image_endian_swap (img);
+       crc32 = compute_crc32 (initcrc, data, stride * height);
+    }
+
+    pixman_image_unref (img);
+    free (data);
+
+    return crc32;
+}
+
+static pixman_op_t op_list[] = {
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating point math and are not always bitexact on different platforms */
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+static pixman_format_code_t img_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a8,
+    PIXMAN_a1,
+    PIXMAN_r3g3b2,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_null
+};
+
+static pixman_format_code_t mask_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_null
+};
+
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int testnum, int verbose)
+{
+    int i;
+    pixman_image_t *src_img = NULL;
+    pixman_image_t *dst_img = NULL;
+    pixman_image_t *mask_img = NULL;
+    int src_width, src_height;
+    int dst_width, dst_height;
+    int src_stride, dst_stride;
+    int src_x, src_y;
+    int dst_x, dst_y;
+    int mask_x, mask_y;
+    int w, h;
+    pixman_op_t op;
+    pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+    uint32_t *dstbuf, *srcbuf, *maskbuf;
+    uint32_t crc32;
+    int max_width, max_height, max_extra_stride;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    max_width = max_height = 24 + testnum / 10000;
+    max_extra_stride = 4 + testnum / 1000000;
+
+    if (max_width > 256)
+       max_width = 256;
+
+    if (max_height > 16)
+       max_height = 16;
+
+    if (max_extra_stride > 8)
+       max_extra_stride = 8;
+
+    lcg_srand (testnum);
+
+    op = op_list[lcg_rand_n (sizeof (op_list) / sizeof (op_list[0]))];
+
+    if (lcg_rand_n (8))
+    {
+       /* normal image */
+       src_img = create_random_image (img_fmt_list, max_width, max_height,
+                                      max_extra_stride, &src_fmt);
+    }
+    else
+    {
+       /* solid case */
+       src_img = create_random_image (img_fmt_list, 1, 1,
+                                      max_extra_stride, &src_fmt);
+
+       pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    dst_img = create_random_image (img_fmt_list, max_width, max_height,
+                                  max_extra_stride, &dst_fmt);
+
+    src_width = pixman_image_get_width (src_img);
+    src_height = pixman_image_get_height (src_img);
+    src_stride = pixman_image_get_stride (src_img);
+
+    dst_width = pixman_image_get_width (dst_img);
+    dst_height = pixman_image_get_height (dst_img);
+    dst_stride = pixman_image_get_stride (dst_img);
+
+    dstbuf = pixman_image_get_data (dst_img);
+    srcbuf = pixman_image_get_data (src_img);
+
+    src_x = lcg_rand_n (src_width);
+    src_y = lcg_rand_n (src_height);
+    dst_x = lcg_rand_n (dst_width);
+    dst_y = lcg_rand_n (dst_height);
+
+    mask_img = NULL;
+    mask_fmt = PIXMAN_null;
+    mask_x = 0;
+    mask_y = 0;
+    maskbuf = NULL;
+
+    if ((src_fmt == PIXMAN_x8r8g8b8 || src_fmt == PIXMAN_x8b8g8r8) &&
+       (lcg_rand_n (4) == 0))
+    {
+       /* PIXBUF */
+       mask_fmt = lcg_rand_n (2) ? PIXMAN_a8r8g8b8 : PIXMAN_a8b8g8r8;
+       mask_img = pixman_image_create_bits (mask_fmt,
+                                            src_width,
+                                            src_height,
+                                            srcbuf,
+                                            src_stride);
+       mask_x = src_x;
+       mask_y = src_y;
+       maskbuf = srcbuf;
+    }
+    else if (lcg_rand_n (2))
+    {
+       if (lcg_rand_n (2))
+       {
+           mask_img = create_random_image (mask_fmt_list, max_width, max_height,
+                                          max_extra_stride, &mask_fmt);
+       }
+       else
+       {
+           /* solid case */
+           mask_img = create_random_image (mask_fmt_list, 1, 1,
+                                          max_extra_stride, &mask_fmt);
+           pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+       }
+
+       if (lcg_rand_n (2))
+           pixman_image_set_component_alpha (mask_img, 1);
+
+       mask_x = lcg_rand_n (pixman_image_get_width (mask_img));
+       mask_y = lcg_rand_n (pixman_image_get_height (mask_img));
+    }
+
+
+    w = lcg_rand_n (dst_width - dst_x + 1);
+    h = lcg_rand_n (dst_height - dst_y + 1);
+
+    if (verbose)
+    {
+       printf ("op=%d, src_fmt=%08X, dst_fmt=%08X, mask_fmt=%08X\n",
+           op, src_fmt, dst_fmt, mask_fmt);
+       printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+           src_width, src_height, dst_width, dst_height);
+       printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+           src_x, src_y, dst_x, dst_y);
+       printf ("src_stride=%d, dst_stride=%d\n",
+           src_stride, dst_stride);
+       printf ("w=%d, h=%d\n", w, h);
+    }
+
+    pixman_image_composite (op, src_img, mask_img, dst_img,
+                           src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    if (verbose)
+    {
+       int j;
+
+       printf ("---\n");
+       for (i = 0; i < dst_height; i++)
+       {
+           for (j = 0; j < dst_stride; j++)
+           {
+               if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
+                   printf ("| ");
+
+               printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+           }
+           printf ("\n");
+       }
+       printf ("---\n");
+    }
+
+    free_random_image (0, src_img, PIXMAN_null);
+    crc32 = free_random_image (0, dst_img, dst_fmt);
+
+    if (mask_img)
+    {
+       if (srcbuf == maskbuf)
+           pixman_image_unref(mask_img);
+       else
+           free_random_image (0, mask_img, PIXMAN_null);
+    }
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    int i;
+
+    for (i = 1; i <= 8; i++)
+    {
+       initialize_palette (&(rgb_palette[i]), i, TRUE);
+       initialize_palette (&(y_palette[i]), i, FALSE);
+    }
+
+    return fuzzer_test_main("blitters", 2000000,
+                           0x29137844,
+                           test_composite, argc, argv);
+}
diff --git a/test/composite-traps-test.c b/test/composite-traps-test.c
new file mode 100755 (executable)
index 0000000..fa6d8a9
--- /dev/null
@@ -0,0 +1,257 @@
+/* Based loosely on scaling-test */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 48
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 48
+#define MAX_STRIDE     4
+
+static pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_r5g6b5, PIXMAN_a1, PIXMAN_a4
+};
+
+static pixman_format_code_t mask_formats[] =
+{
+    PIXMAN_a1, PIXMAN_a4, PIXMAN_a8,
+};
+
+static pixman_op_t operators[] =
+{
+    PIXMAN_OP_OVER, PIXMAN_OP_ADD, PIXMAN_OP_SRC, PIXMAN_OP_IN
+};
+
+#define RANDOM_ELT(array)                                              \
+    ((array)[lcg_rand_n(ARRAY_LENGTH((array)))])
+
+static void
+destroy_bits (pixman_image_t *image, void *data)
+{
+    fence_free (data);
+}
+
+static pixman_fixed_t
+random_fixed (int n)
+{
+    return lcg_rand_N (n << 16);
+}
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+               int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_region16_t  clip;
+    int                dst_width, dst_height;
+    int                dst_stride;
+    int                dst_x, dst_y;
+    int                dst_bpp;
+    pixman_op_t        op;
+    uint32_t *         dst_bits;
+    uint32_t           crc32;
+    pixman_format_code_t mask_format, dst_format;
+    pixman_trapezoid_t *traps;
+    int src_x, src_y;
+    int n_traps;
+
+    static pixman_color_t colors[] =
+    {
+       { 0xffff, 0xffff, 0xffff, 0xffff },
+       { 0x0000, 0x0000, 0x0000, 0x0000 },
+       { 0xabcd, 0xabcd, 0x0000, 0xabcd },
+       { 0x0000, 0x0000, 0x0000, 0xffff },
+       { 0x0101, 0x0101, 0x0101, 0x0101 },
+       { 0x7777, 0x6666, 0x5555, 0x9999 },
+    };
+    
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+
+    op = RANDOM_ELT (operators);
+    mask_format = RANDOM_ELT (mask_formats);
+
+    /* Create source image */
+    
+    if (lcg_rand_n (4) == 0)
+    {
+       src_img = pixman_image_create_solid_fill (
+           &(colors[lcg_rand_n (ARRAY_LENGTH (colors))]));
+
+       src_x = 10;
+       src_y = 234;
+    }
+    else
+    {
+       pixman_format_code_t src_format = RANDOM_ELT(formats);
+       int src_bpp = (PIXMAN_FORMAT_BPP (src_format) + 7) / 8;
+       int src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+       int src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+       int src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+       uint32_t *bits;
+
+       src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+       src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+
+       src_stride = (src_stride + 3) & ~3;
+       
+       bits = (uint32_t *)make_random_bytes (src_stride * src_height);
+
+       src_img = pixman_image_create_bits (
+           src_format, src_width, src_height, bits, src_stride);
+
+       pixman_image_set_destroy_function (src_img, destroy_bits, bits);
+
+       if (lcg_rand_n (8) == 0)
+       {
+           pixman_box16_t clip_boxes[2];
+           int            n = lcg_rand_n (2) + 1;
+           
+           for (i = 0; i < n; i++)
+           {
+               clip_boxes[i].x1 = lcg_rand_n (src_width);
+               clip_boxes[i].y1 = lcg_rand_n (src_height);
+               clip_boxes[i].x2 =
+                   clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+               clip_boxes[i].y2 =
+                   clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+               
+               if (verbose)
+               {
+                   printf ("source clip box: [%d,%d-%d,%d]\n",
+                           clip_boxes[i].x1, clip_boxes[i].y1,
+                           clip_boxes[i].x2, clip_boxes[i].y2);
+               }
+           }
+           
+           pixman_region_init_rects (&clip, clip_boxes, n);
+           pixman_image_set_clip_region (src_img, &clip);
+           pixman_image_set_source_clipping (src_img, 1);
+           pixman_region_fini (&clip);
+       }
+
+       image_endian_swap (src_img);
+    }
+
+    /* Create destination image */
+    {
+       dst_format = RANDOM_ELT(formats);
+       dst_bpp = (PIXMAN_FORMAT_BPP (dst_format) + 7) / 8;
+       dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+       dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+       dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+       dst_stride = (dst_stride + 3) & ~3;
+       
+       dst_bits = (uint32_t *)make_random_bytes (dst_stride * dst_height);
+
+       dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+       dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+       
+       dst_img = pixman_image_create_bits (
+           dst_format, dst_width, dst_height, dst_bits, dst_stride);
+
+       image_endian_swap (dst_img);
+    }
+
+    /* Create traps */
+    {
+       int i;
+
+       n_traps = lcg_rand_n (25);
+       traps = fence_malloc (n_traps * sizeof (pixman_trapezoid_t));
+
+       for (i = 0; i < n_traps; ++i)
+       {
+           pixman_trapezoid_t *t = &(traps[i]);
+           
+           t->top = random_fixed (MAX_DST_HEIGHT) - MAX_DST_HEIGHT / 2;
+           t->bottom = t->top + random_fixed (MAX_DST_HEIGHT);
+           t->left.p1.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+           t->left.p1.y = t->top - random_fixed (50);
+           t->left.p2.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+           t->left.p2.y = t->bottom + random_fixed (50);
+           t->right.p1.x = t->left.p1.x + random_fixed (MAX_DST_WIDTH);
+           t->right.p1.y = t->top - random_fixed (50);
+           t->right.p2.x = t->left.p2.x + random_fixed (MAX_DST_WIDTH);
+           t->right.p2.y = t->bottom - random_fixed (50);
+       }
+    }
+    
+    if (lcg_rand_n (8) == 0)
+    {
+       pixman_box16_t clip_boxes[2];
+       int            n = lcg_rand_n (2) + 1;
+       for (i = 0; i < n; i++)
+       {
+           clip_boxes[i].x1 = lcg_rand_n (dst_width);
+           clip_boxes[i].y1 = lcg_rand_n (dst_height);
+           clip_boxes[i].x2 =
+               clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+           clip_boxes[i].y2 =
+               clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+           if (verbose)
+           {
+               printf ("destination clip box: [%d,%d-%d,%d]\n",
+                       clip_boxes[i].x1, clip_boxes[i].y1,
+                       clip_boxes[i].x2, clip_boxes[i].y2);
+           }
+       }
+       pixman_region_init_rects (&clip, clip_boxes, n);
+       pixman_image_set_clip_region (dst_img, &clip);
+       pixman_region_fini (&clip);
+    }
+
+    pixman_composite_trapezoids (op, src_img, dst_img, mask_format,
+                                src_x, src_y, dst_x, dst_y, n_traps, traps);
+
+    if (dst_format == PIXMAN_x8r8g8b8)
+    {
+       /* ignore unused part */
+       for (i = 0; i < dst_stride * dst_height / 4; i++)
+           dst_bits[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+       int j;
+       
+       for (i = 0; i < dst_height; i++)
+       {
+           for (j = 0; j < dst_stride; j++)
+               printf ("%02X ", *((uint8_t *)dst_bits + i * dst_stride + j));
+
+           printf ("\n");
+       }
+    }
+
+    crc32 = compute_crc32 (0, dst_bits, dst_stride * dst_height);
+
+    fence_free (dst_bits);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    fence_free (traps);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main("composite traps", 40000, 0xE3112106,
+                           test_composite, argc, argv);
+}
diff --git a/test/composite.c b/test/composite.c
new file mode 100755 (executable)
index 0000000..408c363
--- /dev/null
@@ -0,0 +1,920 @@
+/*
+ * Copyright © 2005 Eric Anholt
+ * Copyright © 2009 Chris Wilson
+ * Copyright © 2010 Soeren Sandmann
+ * Copyright © 2010 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Eric Anholt not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Eric Anholt makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * ERIC ANHOLT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL ERIC ANHOLT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+#define PIXMAN_USE_INTERNAL_API
+#include <pixman.h>
+#include <stdio.h>
+#include <stdlib.h> /* abort() */
+#include <math.h>
+#include <time.h>
+#include "utils.h"
+
+typedef struct color_t color_t;
+typedef struct format_t format_t;
+typedef struct image_t image_t;
+typedef struct operator_t operator_t;
+
+struct color_t
+{
+    double r, g, b, a;
+};
+
+struct format_t
+{
+    pixman_format_code_t format;
+    const char *name;
+};
+
+static const color_t colors[] =
+{
+    { 1.0, 1.0, 1.0, 1.0 },
+    { 1.0, 1.0, 1.0, 0.0 },
+    { 0.0, 0.0, 0.0, 1.0 },
+    { 0.0, 0.0, 0.0, 0.0 },
+    { 1.0, 0.0, 0.0, 1.0 },
+    { 0.0, 1.0, 0.0, 1.0 },
+    { 0.0, 0.0, 1.0, 1.0 },
+    { 0.5, 0.0, 0.0, 0.5 },
+};
+
+static uint16_t
+_color_double_to_short (double d)
+{
+    uint32_t i;
+
+    i = (uint32_t) (d * 65536);
+    i -= (i >> 16);
+
+    return i;
+}
+
+static void
+compute_pixman_color (const color_t *color,
+                     pixman_color_t *out)
+{
+    out->red   = _color_double_to_short (color->r);
+    out->green = _color_double_to_short (color->g);
+    out->blue  = _color_double_to_short (color->b);
+    out->alpha = _color_double_to_short (color->a);
+}
+
+#define REPEAT 0x01000000
+#define FLAGS  0xff000000
+
+static const int sizes[] =
+{
+    0,
+    1,
+    1 | REPEAT,
+    10
+};
+
+static const format_t formats[] =
+{
+#define P(x) { PIXMAN_##x, #x }
+
+    /* 32 bpp formats */
+    P(a8r8g8b8),
+    P(x8r8g8b8),
+    P(a8b8g8r8),
+    P(x8b8g8r8),
+    P(b8g8r8a8),
+    P(b8g8r8x8),
+    P(r8g8b8a8),
+    P(r8g8b8x8),
+    P(x2r10g10b10),
+    P(x2b10g10r10),
+    P(a2r10g10b10),
+    P(a2b10g10r10),
+
+    /* 24 bpp formats */
+    P(r8g8b8),
+    P(b8g8r8),
+
+    /* 16 bpp formats */
+    P(r5g6b5),
+    P(b5g6r5),
+    P(x1r5g5b5),
+    P(x1b5g5r5),
+    P(a1r5g5b5),
+    P(a1b5g5r5),
+    P(a4b4g4r4),
+    P(x4b4g4r4),
+    P(a4r4g4b4),
+    P(x4r4g4b4),
+
+    /* 8 bpp formats */
+    P(a8),
+    P(r3g3b2),
+    P(b2g3r3),
+    P(a2r2g2b2),
+    P(a2b2g2r2),
+    P(x4a4),
+
+    /* 4 bpp formats */
+    P(a4),
+    P(r1g2b1),
+    P(b1g2r1),
+    P(a1r1g1b1),
+    P(a1b1g1r1),
+
+    /* 1 bpp formats */
+    P(a1)
+#undef P
+};
+
+struct image_t
+{
+    pixman_image_t *image;
+    const format_t *format;
+    const color_t *color;
+    pixman_repeat_t repeat;
+    int size;
+};
+
+struct operator_t
+{
+    pixman_op_t op;
+    const char *name;
+};
+
+static const operator_t operators[] =
+{
+#define P(x) { PIXMAN_OP_##x, #x }
+    P(CLEAR),
+    P(SRC),
+    P(DST),
+    P(OVER),
+    P(OVER_REVERSE),
+    P(IN),
+    P(IN_REVERSE),
+    P(OUT),
+    P(OUT_REVERSE),
+    P(ATOP),
+    P(ATOP_REVERSE),
+    P(XOR),
+    P(ADD),
+    P(SATURATE),
+
+    P(DISJOINT_CLEAR),
+    P(DISJOINT_SRC),
+    P(DISJOINT_DST),
+    P(DISJOINT_OVER),
+    P(DISJOINT_OVER_REVERSE),
+    P(DISJOINT_IN),
+    P(DISJOINT_IN_REVERSE),
+    P(DISJOINT_OUT),
+    P(DISJOINT_OUT_REVERSE),
+    P(DISJOINT_ATOP),
+    P(DISJOINT_ATOP_REVERSE),
+    P(DISJOINT_XOR),
+
+    P(CONJOINT_CLEAR),
+    P(CONJOINT_SRC),
+    P(CONJOINT_DST),
+    P(CONJOINT_OVER),
+    P(CONJOINT_OVER_REVERSE),
+    P(CONJOINT_IN),
+    P(CONJOINT_IN_REVERSE),
+    P(CONJOINT_OUT),
+    P(CONJOINT_OUT_REVERSE),
+    P(CONJOINT_ATOP),
+    P(CONJOINT_ATOP_REVERSE),
+    P(CONJOINT_XOR),
+#undef P
+};
+
+static double
+calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
+{
+#define mult_chan(src, dst, Fa, Fb) MIN ((src) * (Fa) + (dst) * (Fb), 1.0)
+
+    double Fa, Fb;
+
+    switch (op)
+    {
+    case PIXMAN_OP_CLEAR:
+    case PIXMAN_OP_DISJOINT_CLEAR:
+    case PIXMAN_OP_CONJOINT_CLEAR:
+       return mult_chan (src, dst, 0.0, 0.0);
+
+    case PIXMAN_OP_SRC:
+    case PIXMAN_OP_DISJOINT_SRC:
+    case PIXMAN_OP_CONJOINT_SRC:
+       return mult_chan (src, dst, 1.0, 0.0);
+
+    case PIXMAN_OP_DST:
+    case PIXMAN_OP_DISJOINT_DST:
+    case PIXMAN_OP_CONJOINT_DST:
+       return mult_chan (src, dst, 0.0, 1.0);
+
+    case PIXMAN_OP_OVER:
+       return mult_chan (src, dst, 1.0, 1.0 - srca);
+
+    case PIXMAN_OP_OVER_REVERSE:
+       return mult_chan (src, dst, 1.0 - dsta, 1.0);
+
+    case PIXMAN_OP_IN:
+       return mult_chan (src, dst, dsta, 0.0);
+
+    case PIXMAN_OP_IN_REVERSE:
+       return mult_chan (src, dst, 0.0, srca);
+
+    case PIXMAN_OP_OUT:
+       return mult_chan (src, dst, 1.0 - dsta, 0.0);
+
+    case PIXMAN_OP_OUT_REVERSE:
+       return mult_chan (src, dst, 0.0, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP:
+       return mult_chan (src, dst, dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP_REVERSE:
+       return mult_chan (src, dst, 1.0 - dsta,  srca);
+
+    case PIXMAN_OP_XOR:
+       return mult_chan (src, dst, 1.0 - dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ADD:
+       return mult_chan (src, dst, 1.0, 1.0);
+
+    case PIXMAN_OP_SATURATE:
+    case PIXMAN_OP_DISJOINT_OVER_REVERSE:
+       if (srca == 0.0)
+           Fa = 1.0;
+       else
+           Fa = MIN (1.0, (1.0 - dsta) / srca);
+       return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_DISJOINT_OVER:
+       if (dsta == 0.0)
+           Fb = 1.0;
+       else
+           Fb = MIN (1.0, (1.0 - srca) / dsta);
+       return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_IN:
+       if (srca == 0.0)
+           Fa = 0.0;
+       else
+           Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+       return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_IN_REVERSE:
+       if (dsta == 0.0)
+           Fb = 0.0;
+       else
+           Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+       return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_OUT:
+       if (srca == 0.0)
+           Fa = 1.0;
+       else
+           Fa = MIN (1.0, (1.0 - dsta) / srca);
+       return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_OUT_REVERSE:
+       if (dsta == 0.0)
+           Fb = 1.0;
+       else
+           Fb = MIN (1.0, (1.0 - srca) / dsta);
+       return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP:
+       if (srca == 0.0)
+           Fa = 0.0;
+       else
+           Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+       if (dsta == 0.0)
+           Fb = 1.0;
+       else
+           Fb = MIN (1.0, (1.0 - srca) / dsta);
+       return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP_REVERSE:
+       if (srca == 0.0)
+           Fa = 1.0;
+       else
+           Fa = MIN (1.0, (1.0 - dsta) / srca);
+       if (dsta == 0.0)
+           Fb = 0.0;
+       else
+           Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+       return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_XOR:
+       if (srca == 0.0)
+           Fa = 1.0;
+       else
+           Fa = MIN (1.0, (1.0 - dsta) / srca);
+       if (dsta == 0.0)
+           Fb = 1.0;
+       else
+           Fb = MIN (1.0, (1.0 - srca) / dsta);
+       return mult_chan (src, dst, Fa, Fb);
+
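+    /* The CONJOINT_* operators make the opposite assumption (source
+     * and destination coverage overlap as much as possible), so their
+     * blend factors are built from dsta / srca and srca / dsta, again
+     * clamped to [0, 1].
+     */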
+    case PIXMAN_OP_CONJOINT_OVER:
+       if (dsta == 0.0)
+           Fb = 0.0;
+       else
+           Fb = MAX (0.0, 1.0 - srca / dsta);
+       return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER_REVERSE:
+       if (srca == 0.0)
+           Fa = 0.0;
+       else
+           Fa = MAX (0.0, 1.0 - dsta / srca);
+       return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_CONJOINT_IN:
+       if (srca == 0.0)
+           Fa = 1.0;
+       else
+           Fa = MIN (1.0, dsta / srca);
+       return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_IN_REVERSE:
+       if (dsta == 0.0)
+           Fb = 1.0;
+       else
+           Fb = MIN (1.0, srca / dsta);
+       return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OUT:
+       if (srca == 0.0)
+           Fa = 0.0;
+       else
+           Fa = MAX (0.0, 1.0 - dsta / srca);
+       return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_OUT_REVERSE:
+       if (dsta == 0.0)
+           Fb = 0.0;
+       else
+           Fb = MAX (0.0, 1.0 - srca / dsta);
+       return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP:
+       if (srca == 0.0)
+           Fa = 1.0;
+       else
+           Fa = MIN (1.0, dsta / srca);
+       if (dsta == 0.0)
+           Fb = 0.0;
+       else
+           Fb = MAX (0.0, 1.0 - srca / dsta);
+       return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP_REVERSE:
+       if (srca == 0.0)
+           Fa = 0.0;
+       else
+           Fa = MAX (0.0, 1.0 - dsta / srca);
+       if (dsta == 0.0)
+           Fb = 1.0;
+       else
+           Fb = MIN (1.0, srca / dsta);
+       return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_XOR:
+       if (srca == 0.0)
+           Fa = 0.0;
+       else
+           Fa = MAX (0.0, 1.0 - dsta / srca);
+       if (dsta == 0.0)
+           Fb = 0.0;
+       else
+           Fb = MAX (0.0, 1.0 - srca / dsta);
+       return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_MULTIPLY:
+    case PIXMAN_OP_SCREEN:
+    case PIXMAN_OP_OVERLAY:
+    case PIXMAN_OP_DARKEN:
+    case PIXMAN_OP_LIGHTEN:
+    case PIXMAN_OP_COLOR_DODGE:
+    case PIXMAN_OP_COLOR_BURN:
+    case PIXMAN_OP_HARD_LIGHT:
+    case PIXMAN_OP_SOFT_LIGHT:
+    case PIXMAN_OP_DIFFERENCE:
+    case PIXMAN_OP_EXCLUSION:
+    case PIXMAN_OP_HSL_HUE:
+    case PIXMAN_OP_HSL_SATURATION:
+    case PIXMAN_OP_HSL_COLOR:
+    case PIXMAN_OP_HSL_LUMINOSITY:
+    default:
+       abort();
+       return 0; /* silence MSVC */
+    }
+#undef mult_chan
+}
+
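+/* Compute the expected result of a composite in floating point. With
+ * no mask the source is used as is. With a component-alpha mask, each
+ * source channel is multiplied by the corresponding mask channel and
+ * the effective source alpha becomes per-channel (src->a * mask->r,
+ * etc.); with a unified-alpha mask everything is scaled by mask->a.
+ */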
+static void
+do_composite (pixman_op_t op,
+             const color_t *src,
+             const color_t *mask,
+             const color_t *dst,
+             color_t *result,
+             pixman_bool_t component_alpha)
+{
+    color_t srcval, srcalpha;
+
+    if (mask == NULL)
+    {
+       srcval = *src;
+
+       srcalpha.r = src->a;
+       srcalpha.g = src->a;
+       srcalpha.b = src->a;
+       srcalpha.a = src->a;
+    }
+    else if (component_alpha)
+    {
+       srcval.r = src->r * mask->r;
+       srcval.g = src->g * mask->g;
+       srcval.b = src->b * mask->b;
+       srcval.a = src->a * mask->a;
+
+       srcalpha.r = src->a * mask->r;
+       srcalpha.g = src->a * mask->g;
+       srcalpha.b = src->a * mask->b;
+       srcalpha.a = src->a * mask->a;
+    }
+    else
+    {
+       srcval.r = src->r * mask->a;
+       srcval.g = src->g * mask->a;
+       srcval.b = src->b * mask->a;
+       srcval.a = src->a * mask->a;
+
+       srcalpha.r = src->a * mask->a;
+       srcalpha.g = src->a * mask->a;
+       srcalpha.b = src->a * mask->a;
+       srcalpha.a = src->a * mask->a;
+    }
+
+    result->r = calc_op (op, srcval.r, dst->r, srcalpha.r, dst->a);
+    result->g = calc_op (op, srcval.g, dst->g, srcalpha.g, dst->a);
+    result->b = calc_op (op, srcval.b, dst->b, srcalpha.b, dst->a);
+    result->a = calc_op (op, srcval.a, dst->a, srcalpha.a, dst->a);
+}
+
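+/* Quantize a floating point color to what an image in the given format
+ * can actually store: each channel is rounded to the nearest multiple
+ * of 1 / (2^depth - 1). For a 5-bit channel, for instance, 0.5 becomes
+ * (int)(0.5 * 31 + 0.5) / 31.0 = 16 / 31, roughly 0.516. Missing color
+ * channels read back as 0.0 and a missing alpha channel as 1.0.
+ */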
+static void
+color_correct (pixman_format_code_t format,
+              color_t *color)
+{
+#define MASK(x) ((1 << (x)) - 1)
+#define round_pix(pix, m)                                              \
+    ((int)((pix) * (MASK(m)) + .5) / (double) (MASK(m)))
+
+    if (PIXMAN_FORMAT_R (format) == 0)
+    {
+       color->r = 0.0;
+       color->g = 0.0;
+       color->b = 0.0;
+    }
+    else
+    {
+       color->r = round_pix (color->r, PIXMAN_FORMAT_R (format));
+       color->g = round_pix (color->g, PIXMAN_FORMAT_G (format));
+       color->b = round_pix (color->b, PIXMAN_FORMAT_B (format));
+    }
+
+    if (PIXMAN_FORMAT_A (format) == 0)
+       color->a = 1.0;
+    else
+       color->a = round_pix (color->a, PIXMAN_FORMAT_A (format));
+
+#undef round_pix
+#undef MASK
+}
+
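+/* Read back the top-left pixel of an image and convert it to a
+ * floating point color. The per-channel shifts are reconstructed from
+ * the format code; for PIXMAN_a8r8g8b8, for example, this yields
+ * bs = 0, gs = 8, rs = 16 and as = 24.
+ */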
+static void
+get_pixel (pixman_image_t *image,
+          pixman_format_code_t format,
+          color_t *color)
+{
+#define MASK(N) ((1UL << (N))-1)
+
+    unsigned long rs, gs, bs, as;
+    int a, r, g, b;
+    unsigned long val;
+
+    val = *(unsigned long *) pixman_image_get_data (image);
+#ifdef WORDS_BIGENDIAN
+    val >>= 8 * sizeof(val) - PIXMAN_FORMAT_BPP (format);
+#endif
+
+    /* Number of bits in each channel */
+    a = PIXMAN_FORMAT_A (format);
+    r = PIXMAN_FORMAT_R (format);
+    g = PIXMAN_FORMAT_G (format);
+    b = PIXMAN_FORMAT_B (format);
+
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_ARGB:
+        bs = 0;
+        gs = b + bs;
+        rs = g + gs;
+        as = r + rs;
+       break;
+
+    case PIXMAN_TYPE_ABGR:
+        rs = 0;
+        gs = r + rs;
+        bs = g + gs;
+        as = b + bs;
+       break;
+
+    case PIXMAN_TYPE_BGRA:
+        as = 0;
+       rs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+        gs = r + rs;
+        bs = g + gs;
+       break;
+
+    case PIXMAN_TYPE_RGBA:
+       as = 0;
+       bs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+       gs = b + bs;
+       rs = g + gs;
+       break;
+
+    case PIXMAN_TYPE_A:
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+       break;
+
+    case PIXMAN_TYPE_OTHER:
+    case PIXMAN_TYPE_COLOR:
+    case PIXMAN_TYPE_GRAY:
+    case PIXMAN_TYPE_YUY2:
+    case PIXMAN_TYPE_YV12:
+    default:
+       abort ();
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+       break;
+    }
+
+    if (MASK (a) != 0)
+       color->a = ((val >> as) & MASK (a)) / (double) MASK (a);
+    else
+       color->a = 1.0;
+
+    if (MASK (r) != 0)
+    {
+       color->r = ((val >> rs) & MASK (r)) / (double) MASK (r);
+       color->g = ((val >> gs) & MASK (g)) / (double) MASK (g);
+       color->b = ((val >> bs) & MASK (b)) / (double) MASK (b);
+    }
+    else
+    {
+       color->r = 0.0;
+       color->g = 0.0;
+       color->b = 0.0;
+    }
+
+#undef MASK
+}
+
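+/* Return the largest per-channel difference between the expected and
+ * actual colors, scaled to units of the destination format's least
+ * significant channel bit. composite_test () below accepts errors of
+ * up to 3.0 such units.
+ */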
+static double
+eval_diff (color_t *expected, color_t *test, pixman_format_code_t format)
+{
+    double rscale, gscale, bscale, ascale;
+    double rdiff, gdiff, bdiff, adiff;
+
+    rscale = 1.0 * ((1 << PIXMAN_FORMAT_R (format)) - 1);
+    gscale = 1.0 * ((1 << PIXMAN_FORMAT_G (format)) - 1);
+    bscale = 1.0 * ((1 << PIXMAN_FORMAT_B (format)) - 1);
+    ascale = 1.0 * ((1 << PIXMAN_FORMAT_A (format)) - 1);
+
+    rdiff = fabs (test->r - expected->r) * rscale;
+    gdiff = fabs (test->g - expected->g) * gscale;
+    bdiff = fabs (test->b - expected->b) * bscale;
+    adiff = fabs (test->a - expected->a) * ascale;
+
+    return MAX (MAX (MAX (rdiff, gdiff), bdiff), adiff);
+}
+
+static char *
+describe_image (image_t *info, char *buf)
+{
+    if (info->size)
+    {
+       sprintf (buf, "%s %dx%d%s",
+                info->format->name,
+                info->size, info->size,
+                info->repeat ? "R" :"");
+    }
+    else
+    {
+       sprintf (buf, "solid");
+    }
+
+    return buf;
+}
+
+/* Test a composite of a given operation, source, mask, and destination
+ * picture.
+ * Fills the destination with a solid color, composites, and samples
+ * the (0, 0) pixel of the result.
+ */
+static pixman_bool_t
+composite_test (image_t *dst,
+               const operator_t *op,
+               image_t *src,
+               image_t *mask,
+               pixman_bool_t component_alpha)
+{
+    pixman_color_t fill;
+    pixman_rectangle16_t rect;
+    color_t expected, result, tdst, tsrc, tmsk;
+    double diff;
+    pixman_bool_t success = TRUE;
+
+    compute_pixman_color (dst->color, &fill);
+    rect.x = rect.y = 0;
+    rect.width = rect.height = dst->size;
+    pixman_image_fill_rectangles (PIXMAN_OP_SRC, dst->image,
+                                 &fill, 1, &rect);
+
+    if (mask != NULL)
+    {
+       pixman_image_set_component_alpha (mask->image, component_alpha);
+       pixman_image_composite (op->op, src->image, mask->image, dst->image,
+                               0, 0,
+                               0, 0,
+                               0, 0,
+                               dst->size, dst->size);
+
+       tmsk = *mask->color;
+       if (mask->size)
+       {
+           color_correct (mask->format->format, &tmsk);
+
+           if (component_alpha &&
+               PIXMAN_FORMAT_R (mask->format->format) == 0)
+           {
+               /* Ax component-alpha masks expand alpha into
+                * all color channels.
+                */
+               tmsk.r = tmsk.g = tmsk.b = tmsk.a;
+           }
+       }
+    }
+    else
+    {
+       pixman_image_composite (op->op, src->image, NULL, dst->image,
+                               0, 0,
+                               0, 0,
+                               0, 0,
+                               dst->size, dst->size);
+    }
+    get_pixel (dst->image, dst->format->format, &result);
+
+    tdst = *dst->color;
+    color_correct (dst->format->format, &tdst);
+    tsrc = *src->color;
+    if (src->size)
+       color_correct (src->format->format, &tsrc);
+    do_composite (op->op, &tsrc, mask ? &tmsk : NULL, &tdst,
+                 &expected, component_alpha);
+    color_correct (dst->format->format, &expected);
+
+    diff = eval_diff (&expected, &result, dst->format->format);
+
+    /* FIXME: We should find out what deviation is acceptable. 3.0
+     * is clearly absurd for 2 bit formats for example. On the other
+     * hand currently 1.0 does not work.
+     */
+    if (diff > 3.0)
+    {
+       char buf[40];
+
+       sprintf (buf, "%s %scomposite",
+                op->name,
+                component_alpha ? "CA " : "");
+
+       printf ("%s test error of %.4f --\n"
+               "           R    G    B    A\n"
+               "got:       %.2f %.2f %.2f %.2f [%08lx]\n"
+               "expected:  %.2f %.2f %.2f %.2f\n",
+               buf, diff,
+               result.r, result.g, result.b, result.a,
+               *(unsigned long *) pixman_image_get_data (dst->image),
+               expected.r, expected.g, expected.b, expected.a);
+
+       if (mask != NULL)
+       {
+           printf ("src color: %.2f %.2f %.2f %.2f\n"
+                   "msk color: %.2f %.2f %.2f %.2f\n"
+                   "dst color: %.2f %.2f %.2f %.2f\n",
+                   src->color->r, src->color->g,
+                   src->color->b, src->color->a,
+                   mask->color->r, mask->color->g,
+                   mask->color->b, mask->color->a,
+                   dst->color->r, dst->color->g,
+                   dst->color->b, dst->color->a);
+           printf ("src: %s, ", describe_image (src, buf));
+           printf ("mask: %s, ", describe_image (mask, buf));
+           printf ("dst: %s\n\n", describe_image (dst, buf));
+       }
+       else
+       {
+           printf ("src color: %.2f %.2f %.2f %.2f\n"
+                   "dst color: %.2f %.2f %.2f %.2f\n",
+                   src->color->r, src->color->g,
+                   src->color->b, src->color->a,
+                   dst->color->r, dst->color->g,
+                   dst->color->b, dst->color->a);
+           printf ("src: %s, ", describe_image (src, buf));
+           printf ("dst: %s\n\n", describe_image (dst, buf));
+       }
+
+       success = FALSE;
+    }
+
+    return success;
+}
+
+static void
+image_init (image_t *info,
+           int color,
+           int format,
+           int size)
+{
+    pixman_color_t fill;
+
+    info->color = &colors[color];
+    compute_pixman_color (info->color, &fill);
+
+    info->format = &formats[format];
+    info->size = sizes[size] & ~FLAGS;
+    info->repeat = PIXMAN_REPEAT_NONE;
+
+    if (info->size)
+    {
+       pixman_rectangle16_t rect;
+
+       info->image = pixman_image_create_bits (info->format->format,
+                                               info->size, info->size,
+                                               NULL, 0);
+
+       rect.x = rect.y = 0;
+       rect.width = rect.height = info->size;
+       pixman_image_fill_rectangles (PIXMAN_OP_SRC, info->image, &fill,
+                                     1, &rect);
+
+       if (size & REPEAT)
+       {
+           pixman_image_set_repeat (info->image, PIXMAN_REPEAT_NORMAL);
+           info->repeat = PIXMAN_REPEAT_NORMAL;
+       }
+    }
+    else
+    {
+       info->image = pixman_image_create_solid_fill (&fill);
+    }
+}
+
+static void
+image_fini (image_t *info)
+{
+    pixman_image_unref (info->image);
+}
+
+static int
+random_size (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (sizes));
+}
+
+static int
+random_color (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (colors));
+}
+
+static int
+random_format (void)
+{
+    return lcg_rand_n (ARRAY_LENGTH (formats));
+}
+
+static pixman_bool_t
+run_test (uint32_t seed)
+{
+    image_t src, mask, dst;
+    const operator_t *op;
+    int ca;
+    int ok;
+
+    lcg_srand (seed);
+    
+    image_init (&dst, random_color(), random_format(), 1);
+    image_init (&src, random_color(), random_format(), random_size());
+    image_init (&mask, random_color(), random_format(), random_size());
+
+    op = &(operators [lcg_rand_n (ARRAY_LENGTH (operators))]);
+
+    ca = lcg_rand_n (3);
+
+    switch (ca)
+    {
+    case 0:
+       ok = composite_test (&dst, op, &src, NULL, FALSE);
+       break;
+    case 1:
+       ok = composite_test (&dst, op, &src, &mask, FALSE);
+       break;
+    case 2:
+       ok = composite_test (&dst, op, &src, &mask,
+                            mask.size? TRUE : FALSE);
+       break;
+    default:
+       ok = FALSE;
+       break;
+    }
+
+    image_fini (&src);
+    image_fini (&mask);
+    image_fini (&dst);
+
+    return ok;
+}
+
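+/* Each test is fully determined by its seed, so a failure can be
+ * reproduced by passing the reported test number on the command line.
+ * Setting the PIXMAN_RANDOMIZE_TESTS environment variable starts from
+ * a random base seed instead of the default of 1.
+ */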
+int
+main (int argc, char **argv)
+{
+#define N_TESTS (8 * 1024 * 1024)
+    int result = 0;
+    uint32_t i, seed;
+
+    if (argc > 1)
+    {
+       char *end;
+       
+       i = strtol (argv[1], &end, 0);
+
+       if (end != argv[1])
+       {
+           if (!run_test (i))
+               return 1;
+           else
+               return 0;
+       }
+       else
+       {
+           printf ("Usage:\n\n   %s <number>\n\n", argv[0]);
+           return -1;
+       }
+    }
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+       seed = get_random_seed();
+    else
+       seed = 1;
+    
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(result, argv, seed)
+#endif
+    for (i = 0; i <= N_TESTS; ++i)
+    {
+       if (!result && !run_test (i + seed))
+       {
+           printf ("Test 0x%08X failed.\n", seed + i);
+           
+           result = seed + i;
+       }
+    }
+    
+    return result;
+}
diff --git a/test/fetch-test.c b/test/fetch-test.c
new file mode 100755 (executable)
index 0000000..9f80eec
--- /dev/null
@@ -0,0 +1,209 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+
+#define SIZE 1024
+
+static pixman_indexed_t mono_palette =
+{
+    0, { 0x00000000, 0x00ffffff },
+};
+
+
+typedef struct {
+    pixman_format_code_t format;
+    int width, height;
+    int stride;
+    uint32_t src[SIZE];
+    uint32_t dst[SIZE];
+    pixman_indexed_t *indexed;
+} testcase_t;
+
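+/* Each test case provides source pixels in the format under test,
+ * together with the pixel values expected after fetching them and
+ * converting to PIXMAN_a8r8g8b8, plus an optional palette for indexed
+ * formats.
+ */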
+static testcase_t testcases[] =
+{
+    {
+       PIXMAN_a8r8g8b8,
+       2, 2,
+       8,
+       { 0x00112233, 0x44556677,
+         0x8899aabb, 0xccddeeff },
+       { 0x00112233, 0x44556677,
+         0x8899aabb, 0xccddeeff },
+       NULL,
+    },
+    {
+       PIXMAN_r8g8b8a8,
+       2, 2,
+       8,
+       { 0x11223300, 0x55667744,
+         0x99aabb88, 0xddeeffcc },
+       { 0x00112233, 0x44556677,
+         0x8899aabb, 0xccddeeff },
+       NULL,
+    },
+    {
+       PIXMAN_g1,
+       8, 2,
+       4,
+#ifdef WORDS_BIGENDIAN
+       {
+           0xaa000000,
+           0x55000000
+       },
+#else
+       {
+           0x00000055,
+           0x000000aa
+       },
+#endif
+       {
+           0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
+           0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
+       },
+       &mono_palette,
+    },
+#if 0
+    {
+       PIXMAN_g8,
+       4, 2,
+       4,
+       { 0x01234567,
+         0x89abcdef },
+       { 0x00010101, 0x00232323, 0x00454545, 0x00676767,
+         0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
+    },
+#endif
+    /* FIXME: make this work on big endian */
+    {
+       PIXMAN_yv12,
+       8, 2,
+       8,
+#ifdef WORDS_BIGENDIAN
+       {
+           0x00ff00ff, 0x00ff00ff,
+           0xff00ff00, 0xff00ff00,
+           0x80ff8000,
+           0x800080ff
+       },
+#else
+       {
+           0xff00ff00, 0xff00ff00,
+           0x00ff00ff, 0x00ff00ff,
+           0x0080ff80,
+           0xff800080
+       },
+#endif
+       {
+           0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
+           0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
+           0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
+           0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
+       },
+    },
+};
+
+int n_test_cases = sizeof(testcases)/sizeof(testcases[0]);
+
+
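+/* Raw memory accessors. Once installed with pixman_image_set_accessors (),
+ * pixman routes image reads and writes through these callbacks instead
+ * of touching memory directly; the second pass over each test case
+ * (j == 1 below) exercises those code paths.
+ */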
+static uint32_t
+reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+       return *(uint8_t *)src;
+    case 2:
+       return *(uint16_t *)src;
+    case 4:
+       return *(uint32_t *)src;
+    default:
+       assert(0);
+       return 0; /* silence MSVC */
+    }
+}
+
+
+static void
+writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+       *(uint8_t *)src = value;
+       break;
+    case 2:
+       *(uint16_t *)src = value;
+       break;
+    case 4:
+       *(uint32_t *)src = value;
+       break;
+    default:
+       assert(0);
+    }
+}
+
+
+int
+main (int argc, char **argv)
+{
+    uint32_t dst[SIZE];
+    pixman_image_t *src_img;
+    pixman_image_t *dst_img;
+    int i, j, x, y;
+    int ret = 0;
+
+    for (i = 0; i < n_test_cases; ++i)
+    {
+       for (j = 0; j < 2; ++j)
+       {
+           src_img = pixman_image_create_bits (testcases[i].format,
+                                               testcases[i].width,
+                                               testcases[i].height,
+                                               testcases[i].src,
+                                               testcases[i].stride);
+           pixman_image_set_indexed(src_img, testcases[i].indexed);
+
+           dst_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                               testcases[i].width,
+                                               testcases[i].height,
+                                               dst,
+                                               testcases[i].width*4);
+
+           if (j)
+           {
+               pixman_image_set_accessors (src_img, reader, writer);
+               pixman_image_set_accessors (dst_img, reader, writer);
+           }
+
+           pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+                                   0, 0, 0, 0, 0, 0, testcases[i].width, testcases[i].height);
+
+           pixman_image_unref (src_img);
+           pixman_image_unref (dst_img);
+
+           for (y = 0; y < testcases[i].height; ++y)
+           {
+               for (x = 0; x < testcases[i].width; ++x)
+               {
+                   int offset = y * testcases[i].width + x;
+
+                   if (dst[offset] != testcases[i].dst[offset])
+                   {
+                       printf ("test %i%c: pixel mismatch at (x=%d,y=%d): %08x expected, %08x obtained\n",
+                               i + 1, 'a' + j,
+                               x, y,
+                               testcases[i].dst[offset], dst[offset]);
+                       ret = 1;
+                   }
+               }
+           }
+       }
+    }
+
+    return ret;
+}
diff --git a/test/fuzzer-find-diff.pl b/test/fuzzer-find-diff.pl
new file mode 100644 (file)
index 0000000..53d9b8d
--- /dev/null
@@ -0,0 +1,68 @@
+#!/usr/bin/env perl
+
+$usage = "Usage:
+  fuzzer-find-diff.pl reference_binary new_binary [number_of_tests_to_run]
+
+The first two input arguments are the commands to run the test programs
+based on the fuzzer_test_main() function from 'utils.c' (preferably they
+should be statically compiled, which can be achieved via the
+'--disable-shared' pixman configure option). The third, optional, argument
+is the number of test rounds to run (if not specified, testing runs
+indefinitely or until some problem is detected).
+
+Usage examples:
+  fuzzer-find-diff.pl ./blitters-test-with-sse-disabled ./blitters-test 9000000
+  fuzzer-find-diff.pl ./blitters-test \"ssh ppc64_host /path/to/blitters-test\"
+";
+
+$#ARGV >= 1 or die $usage;
+
+$batch_size = 10000;
+
+if ($#ARGV >= 2) {
+    $number_of_tests = int($ARGV[2]);
+} else {
+    $number_of_tests = -1;
+}
+
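+# Compare the output of the two test programs over the range [$min, $max].
+# If they agree, return undef; otherwise bisect the range until a single
+# test number whose outputs differ is isolated, and return it.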
+sub test_range {
+    my $min = shift;
+    my $max = shift;
+
+    if (`$ARGV[0] $min $max 2>/dev/null` eq `$ARGV[1] $min $max 2>/dev/null`) {
+        return;
+    }
+
+    while ($max != $min + 1) {
+        my $avg = int(($min + $max) / 2);
+        my $res1 = `$ARGV[0] $min $avg 2>/dev/null`;
+        my $res2 = `$ARGV[1] $min $avg 2>/dev/null`;
+        if ($res1 ne $res2) {
+            $max = $avg;
+        } else {
+            $min = $avg;
+        }
+    }
+    return $max;
+}
+
+$base = 1;
+while ($number_of_tests <= 0 || $base <= $number_of_tests) {
+    printf("testing %-12d\r", $base + $batch_size - 1);
+    my $res = test_range($base, $base + $batch_size - 1);
+    if ($res) {
+        printf("Failure: results are different for test %d:\n", $res);
+
+        printf("\n-- ref --\n");
+        print `$ARGV[0] $res`;
+        printf("-- new --\n");
+        print `$ARGV[1] $res`;
+
+        printf("The problematic conditions can be reproduced by running:\n");
+        printf("$ARGV[1] %d\n", $res);
+
+        exit(1);
+    }
+    $base += $batch_size;
+}
+printf("Success: %d tests finished\n", $base - 1);
diff --git a/test/gradient-crash-test.c b/test/gradient-crash-test.c
new file mode 100644 (file)
index 0000000..c85712d
--- /dev/null
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
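+/* Exercise degenerate gradients (a single stop, two stops at the same
+ * position, coincident gradient points) combined with singular and
+ * otherwise unusual transformation matrices. The test passes as long
+ * as nothing crashes; enable_fp_exceptions () makes sure that invalid
+ * floating point operations, such as division by zero, trap instead of
+ * passing silently.
+ */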
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i, j, k, p;
+
+    typedef struct
+    {
+       pixman_point_fixed_t p0;
+       pixman_point_fixed_t p1;
+    } point_pair_t;
+    
+    pixman_gradient_stop_t onestop[1] =
+       {
+           { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+       };
+
+    pixman_gradient_stop_t subsetstops[2] =
+       {
+           { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+           { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+       };
+
+    pixman_gradient_stop_t stops01[2] =
+       {
+           { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+           { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } }
+       };
+
+    point_pair_t point_pairs [] =
+       { { { pixman_double_to_fixed (0), 0 },
+           { pixman_double_to_fixed (WIDTH / 8.), pixman_int_to_fixed (0) } },
+         { { pixman_double_to_fixed (WIDTH / 2.0), pixman_double_to_fixed (HEIGHT / 2.0) },
+           { pixman_double_to_fixed (WIDTH / 2.0), pixman_double_to_fixed (HEIGHT / 2.0) } }
+       };
+    
+    pixman_transform_t transformations[] = {
+       {
+           { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+             { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+           }
+       },
+       {
+           { { pixman_double_to_fixed (1), pixman_double_to_fixed (0), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+           }
+       },
+       {
+           { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (2), pixman_double_to_fixed (1.000), pixman_double_to_fixed (1.0) } 
+           }
+       },
+       {
+           { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (0) } 
+           }
+       },
+       {
+           { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (2), pixman_double_to_fixed (-1), pixman_double_to_fixed (0) } 
+           }
+       },
+       {
+           { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (3), },
+             { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+             { pixman_double_to_fixed (2), pixman_double_to_fixed (-1), pixman_double_to_fixed (0) } 
+           }
+       },
+    };
+    
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+
+    enable_fp_exceptions();
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+       dest[i] = 0x4f00004f; /* pale blue */
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+                                        WIDTH, HEIGHT, 
+                                        dest,
+                                        WIDTH * 4);
+
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    for (i = 0; i < 3; ++i)
+    {
+       pixman_gradient_stop_t *stops;
+        int num_stops;
+
+       if (i == 0)
+       {
+           stops = onestop;
+           num_stops = sizeof(onestop) / sizeof(onestop[0]);
+       }
+       else if (i == 1)
+       {
+           stops = subsetstops;
+           num_stops = sizeof(subsetstops) / sizeof(subsetstops[0]);
+       }
+       else
+       {
+           stops = stops01;
+           num_stops = sizeof(stops01) / sizeof(stops01[0]);
+       }
+       
+       for (j = 0; j < 3; ++j)
+       {
+           for (p = 0; p < ARRAY_LENGTH (point_pairs); ++p)
+           {
+               point_pair_t *pair = &(point_pairs[p]);
+
+               if (j == 0)
+                   src_img = pixman_image_create_conical_gradient (&(pair->p0), r_inner,
+                                                                   stops, num_stops);
+               else if (j == 1)
+                   src_img = pixman_image_create_radial_gradient  (&(pair->p0), &(pair->p1),
+                                                                   r_inner, r_outer,
+                                                                   stops, num_stops);
+               else
+                   src_img = pixman_image_create_linear_gradient  (&(pair->p0), &(pair->p1),
+                                                                   stops, num_stops);
+               
+               for (k = 0; k < ARRAY_LENGTH (transformations); ++k)
+               {
+                   pixman_image_set_transform (src_img, &transformations[k]);
+                   
+                   pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NONE);
+                   pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+                                           0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+               }
+
+               pixman_image_unref (src_img);
+           }
+
+       }
+    }
+
+    pixman_image_unref (dest_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
new file mode 100644 (file)
index 0000000..bdafb35
--- /dev/null
@@ -0,0 +1,727 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ * Copyright © 2010 Movial Creative Technologies Oy
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define PIXMAN_USE_INTERNAL_API
+#include <pixman.h>
+
+#include "utils.h"
+
+#define SOLID_FLAG 1
+#define CA_FLAG    2
+
+#define L1CACHE_SIZE (8 * 1024)
+#define L2CACHE_SIZE (128 * 1024)
+
+#define WIDTH  1920
+#define HEIGHT 1080
+#define BUFSIZE (WIDTH * HEIGHT * 4)
+#define XWIDTH 256
+#define XHEIGHT 256
+#define TILEWIDTH 32
+#define TINYWIDTH 8
+
+#define EXCLUDE_OVERHEAD 1
+
+uint32_t *dst;
+uint32_t *src;
+uint32_t *mask;
+
+double bandwidth = 0;
+
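+/* Estimate achievable memory bandwidth with back-and-forth memcpy
+ * calls: the first loop calibrates how much work fits in roughly half
+ * a second, the second measures five times that amount using slightly
+ * misaligned pointers. The result serves as the 100% reference for the
+ * percentages printed next to the M: numbers in bench_composite ().
+ */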
+double
+bench_memcpy ()
+{
+    int64_t n = 0, total;
+    double  t1, t2;
+    int     x = 0;
+
+    t1 = gettime ();
+    while (1)
+    {
+       memcpy (dst, src, BUFSIZE - 64);
+       memcpy (src, dst, BUFSIZE - 64);
+       n += 4 * (BUFSIZE - 64);
+       t2 = gettime ();
+       if (t2 - t1 > 0.5)
+           break;
+    }
+    n = total = n * 5;
+    t1 = gettime ();
+    while (n > 0)
+    {
+       if (++x >= 64)
+           x = 0;
+       memcpy ((char *)dst + 1, (char *)src + x, BUFSIZE - 64);
+       memcpy ((char *)src + 1, (char *)dst + x, BUFSIZE - 64);
+       n -= 4 * (BUFSIZE - 64);
+    }
+    t2 = gettime ();
+    return (double)total / (t2 - t1);
+}
+
+static void
+pixman_image_composite_wrapper (pixman_implementation_t *impl,
+                               pixman_composite_info_t *info)
+{
+    pixman_image_composite (info->op,
+                           info->src_image, info->mask_image, info->dest_image,
+                           info->src_x, info->src_y,
+                           info->mask_x, info->mask_y,
+                           info->dest_x, info->dest_y,
+                           info->width, info->height);
+}
+
+static void
+pixman_image_composite_empty (pixman_implementation_t *impl,
+                             pixman_composite_info_t *info)
+{
+    pixman_image_composite (info->op,
+                           info->src_image, info->mask_image, info->dest_image,
+                           0, 0, 0, 0, 0, 0, 1, 1);
+}
+
+static inline void
+call_func (pixman_composite_func_t func,
+          pixman_op_t             op,
+          pixman_image_t *        src_image,
+          pixman_image_t *        mask_image,
+          pixman_image_t *        dest_image,
+          int32_t                 src_x,
+          int32_t                 src_y,
+          int32_t                 mask_x,
+          int32_t                 mask_y,
+          int32_t                 dest_x,
+          int32_t                 dest_y,
+          int32_t                 width,
+          int32_t                 height)
+{
+    pixman_composite_info_t info;
+
+    info.op = op;
+    info.src_image = src_image;
+    info.mask_image = mask_image;
+    info.dest_image = dest_image;
+    info.src_x = src_x;
+    info.src_y = src_y;
+    info.mask_x = mask_x;
+    info.mask_y = mask_y;
+    info.dest_x = dest_x;
+    info.dest_y = dest_y;
+    info.width = width;
+    info.height = height;
+
+    func (0, &info);
+}
+
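+/* Benchmark compositing of narrow strips that fit in the L1 or L2
+ * cache. The destination and source are read before every call so they
+ * stay cache resident, the x offset drifts to vary alignment, and the
+ * volatile qx keeps the compiler from optimizing the cache-warming
+ * reads away.
+ */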
+void
+noinline
+bench_L  (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func,
+          int                      width,
+          int                      lines_count)
+{
+    int64_t      i, j;
+    int          x = 0;
+    int          q = 0;
+    volatile int qx;
+
+    for (i = 0; i < n; i++)
+    {
+       /* touch the destination and source buffers to pull them into L1 cache */
+       for (j = 0; j < width + 64; j += 16) {
+           q += dst[j];
+           q += src[j];
+       }
+       if (++x >= 64)
+           x = 0;
+       call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 63 - x, 0, width, lines_count);
+    }
+    qx = q;
+}
+
+void
+noinline
+bench_M (pixman_op_t              op,
+         pixman_image_t *         src_img,
+         pixman_image_t *         mask_img,
+         pixman_image_t *         dst_img,
+         int64_t                  n,
+         pixman_composite_func_t  func)
+{
+    int64_t i;
+    int     x = 0;
+
+    for (i = 0; i < n; i++)
+    {
+       if (++x >= 64)
+           x = 0;
+       call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 1, 0, WIDTH - 64, HEIGHT);
+    }
+}
+
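+/* The HT and VT benchmarks composite random small tiles while sweeping
+ * across the destination horizontally or vertically; bench_R and
+ * bench_RT composite at fully random positions, with normal (TILEWIDTH)
+ * and tiny (TINYWIDTH) tile sizes respectively. Each returns the number
+ * of pixels composited so the caller can compute a rate.
+ */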
+double
+noinline
+bench_HT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func)
+{
+    double  pix_cnt = 0;
+    int     x = 0;
+    int     y = 0;
+    int64_t i;
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+       int w = (rand () % (TILEWIDTH * 2)) + 1;
+       int h = (rand () % (TILEWIDTH * 2)) + 1;
+       if (x + w > WIDTH)
+       {
+           x = 0;
+           y += TILEWIDTH * 2;
+       }
+       if (y + h > HEIGHT)
+       {
+           y = 0;
+       }
+       call_func (func, op, src_img, mask_img, dst_img, x, y, x, y, x, y, w, h);
+       x += w;
+       pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+double
+noinline
+bench_VT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func)
+{
+    double  pix_cnt = 0;
+    int     x = 0;
+    int     y = 0;
+    int64_t i;
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+       int w = (rand () % (TILEWIDTH * 2)) + 1;
+       int h = (rand () % (TILEWIDTH * 2)) + 1;
+       if (y + h > HEIGHT)
+       {
+           y = 0;
+           x += TILEWIDTH * 2;
+       }
+       if (x + w > WIDTH)
+       {
+           x = 0;
+       }
+       call_func (func, op, src_img, mask_img, dst_img, x, y, x, y, x, y, w, h);
+       y += h;
+       pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+double
+noinline
+bench_R (pixman_op_t              op,
+         pixman_image_t *         src_img,
+         pixman_image_t *         mask_img,
+         pixman_image_t *         dst_img,
+         int64_t                  n,
+         pixman_composite_func_t  func,
+         int                      maxw,
+         int                      maxh)
+{
+    double  pix_cnt = 0;
+    int64_t i;
+
+    if (maxw <= TILEWIDTH * 2 || maxh <= TILEWIDTH * 2)
+    {
+       printf("error: maxw <= TILEWIDTH * 2 || maxh <= TILEWIDTH * 2\n");
+        return 0;
+    }
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+       int w = (rand () % (TILEWIDTH * 2)) + 1;
+       int h = (rand () % (TILEWIDTH * 2)) + 1;
+       int sx = rand () % (maxw - TILEWIDTH * 2);
+       int sy = rand () % (maxh - TILEWIDTH * 2);
+       int dx = rand () % (maxw - TILEWIDTH * 2);
+       int dy = rand () % (maxh - TILEWIDTH * 2);
+       call_func (func, op, src_img, mask_img, dst_img, sx, sy, sx, sy, dx, dy, w, h);
+       pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+double
+noinline
+bench_RT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func,
+          int                      maxw,
+          int                      maxh)
+{
+    double  pix_cnt = 0;
+    int64_t i;
+
+    if (maxw <= TINYWIDTH * 2 || maxh <= TINYWIDTH * 2)
+    {
+       printf("error: maxw <= TINYWIDTH * 2 || maxh <= TINYWIDTH * 2\n");
+        return 0;
+    }
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+       int w = (rand () % (TINYWIDTH * 2)) + 1;
+       int h = (rand () % (TINYWIDTH * 2)) + 1;
+       int sx = rand () % (maxw - TINYWIDTH * 2);
+       int sy = rand () % (maxh - TINYWIDTH * 2);
+       int dx = rand () % (maxw - TINYWIDTH * 2);
+       int dy = rand () % (maxh - TINYWIDTH * 2);
+       call_func (func, op, src_img, mask_img, dst_img, sx, sy, sx, sy, dx, dy, w, h);
+       pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+void
+bench_composite (char * testname,
+                 int    src_fmt,
+                 int    src_flags,
+                 int    op,
+                 int    mask_fmt,
+                 int    mask_flags,
+                 int    dst_fmt,
+                 double npix)
+{
+    pixman_image_t *                src_img;
+    pixman_image_t *                dst_img;
+    pixman_image_t *                mask_img;
+    pixman_image_t *                xsrc_img;
+    pixman_image_t *                xdst_img;
+    pixman_image_t *                xmask_img;
+    double                          t1, t2, t3, pix_cnt;
+    int64_t                         n, l1test_width, nlines;
+    double                          bytes_per_pix = 0;
+
+    pixman_composite_func_t func = pixman_image_composite_wrapper;
+
+    if (!(src_flags & SOLID_FLAG))
+    {
+        bytes_per_pix += (src_fmt >> 24) / 8.0;
+        src_img = pixman_image_create_bits (src_fmt,
+                                            WIDTH, HEIGHT,
+                                            src,
+                                            WIDTH * 4);
+        xsrc_img = pixman_image_create_bits (src_fmt,
+                                             XWIDTH, XHEIGHT,
+                                             src,
+                                             XWIDTH * 4);
+    }
+    else
+    {
+        src_img = pixman_image_create_bits (src_fmt,
+                                            1, 1,
+                                            src,
+                                            4);
+        xsrc_img = pixman_image_create_bits (src_fmt,
+                                             1, 1,
+                                             src,
+                                             4);
+        pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+        pixman_image_set_repeat (xsrc_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    bytes_per_pix += (dst_fmt >> 24) / 8.0;
+    dst_img = pixman_image_create_bits (dst_fmt,
+                                        WIDTH, HEIGHT,
+                                        dst,
+                                        WIDTH * 4);
+
+    mask_img = NULL;
+    xmask_img = NULL;
+    if (!(mask_flags & SOLID_FLAG) && mask_fmt != PIXMAN_null)
+    {
+        bytes_per_pix += (mask_fmt >> 24) / ((op == PIXMAN_OP_SRC) ? 8.0 : 4.0);
+        mask_img = pixman_image_create_bits (mask_fmt,
+                                             WIDTH, HEIGHT,
+                                             mask,
+                                             WIDTH * 4);
+        xmask_img = pixman_image_create_bits (mask_fmt,
+                                             XWIDTH, XHEIGHT,
+                                             mask,
+                                             XWIDTH * 4);
+    }
+    else if (mask_fmt != PIXMAN_null)
+    {
+        mask_img = pixman_image_create_bits (mask_fmt,
+                                             1, 1,
+                                             mask,
+                                             4);
+        xmask_img = pixman_image_create_bits (mask_fmt,
+                                             1, 1,
+                                             mask,
+                                             4 * 4);
+       pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+       pixman_image_set_repeat (xmask_img, PIXMAN_REPEAT_NORMAL);
+    }
+    if ((mask_flags & CA_FLAG) && mask_fmt != PIXMAN_null)
+    {
+       pixman_image_set_component_alpha (mask_img, 1);
+    }
+    xdst_img = pixman_image_create_bits (dst_fmt,
+                                         XWIDTH, XHEIGHT,
+                                         dst,
+                                         XWIDTH * 4);
+
+
+    printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
+            '-' : '=');
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    l1test_width = L1CACHE_SIZE / 8 - 64;
+    if (l1test_width < 1)
+       l1test_width = 1;
+    if (l1test_width > WIDTH - 64)
+       l1test_width = WIDTH - 64;
+    n = 1 + npix / (l1test_width * 8);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
+#endif
+    t2 = gettime ();
+    bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
+    t3 = gettime ();
+    printf ("  L1:%7.2f", (double)n * l1test_width * 1 /
+            ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    nlines = (L2CACHE_SIZE / l1test_width) /
+       ((PIXMAN_FORMAT_BPP(src_fmt) + PIXMAN_FORMAT_BPP(dst_fmt)) / 8);
+    if (nlines < 1)
+       nlines = 1;
+    n = 1 + npix / (l1test_width * nlines);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
+#endif
+    t2 = gettime ();
+    bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
+    t3 = gettime ();
+    printf ("  L2:%7.2f", (double)n * l1test_width * nlines /
+            ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (WIDTH * HEIGHT);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    bench_M (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  M:%6.2f (%6.2f%%)",
+        ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1))) / 1000000.,
+        ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  HT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  VT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, WIDTH, HEIGHT);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
+    t3 = gettime ();
+    printf ("  R:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (src, dst, BUFSIZE);
+    memcpy (dst, src, BUFSIZE);
+
+    n = 1 + npix / (16 * TINYWIDTH * TINYWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, WIDTH, HEIGHT);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
+    t3 = gettime ();
+    printf ("  RT:%6.2f (%4.0fKops/s)\n", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000., (double) n / ((t3 - t2) * 1000));
+
+    if (mask_img) {
+       pixman_image_unref (mask_img);
+       pixman_image_unref (xmask_img);
+    }
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    pixman_image_unref (xsrc_img);
+    pixman_image_unref (xdst_img);
+}
+
+#define PIXMAN_OP_OUT_REV (PIXMAN_OP_OUT_REVERSE)
+
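+/* Table of benchmarked operations. The test names follow the scheme
+ * op_src[_mask]_dst, where "n" stands for a solid (repeating 1x1)
+ * image, "8" for a8, "8888" for a8r8g8b8, "0565" for r5g6b5 and so on,
+ * and a "_ca" suffix marks component-alpha masks. In the flags columns,
+ * 1 (SOLID_FLAG) requests a solid image and 2 (CA_FLAG) enables
+ * component alpha.
+ */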
+struct
+{
+    char *testname;
+    int   src_fmt;
+    int   src_flags;
+    int   op;
+    int   mask_fmt;
+    int   mask_flags;
+    int   dst_fmt;
+}
+tests_tbl[] =
+{
+    { "add_8_8_8",             PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8",             PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "add_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "add_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "add_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "add_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "add_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "add_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "add_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "add_n_8",               PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "add_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "add_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_n_1555",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_n_4444",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "add_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "add_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "add_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "add_8_8",               PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "add_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "add_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_8888_1555",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "add_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "add_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_1555_1555",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_0565_2x10",         PIXMAN_r5g6b5,      0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "add_2a10_2a10",         PIXMAN_a2r10g10b10, 0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "src_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_n_1555",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_n_4444",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "src_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "src_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "src_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "src_8888_2x10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "src_8888_2a10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_0888_0565",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0888_8888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_0888_x888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_1555_0565",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0565_1555",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "src_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "src_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "src_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "src_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "src_8888_8_0565",       PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0888_8_0565",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0888_8_8888",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_0888_8_x888",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8_x888",       PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8_8888",       PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_0565_8_0565",       PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_1555_8_0565",       PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0565_8_1555",       PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "over_n_x888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_n_8888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "over_n_0565",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "over_n_1555",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "over_8888_0565",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "over_8888_x888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_x888_8_0565",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "over_x888_8_8888",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "over_n_8_0565",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "over_n_8_1555",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "over_n_8_4444",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "over_n_8_2222",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "over_n_8_x888",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "over_n_8_8888",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "over_n_8_2x10",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "over_n_8_2a10",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "over_n_8888_8888_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+    { "over_n_8888_x888_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
+    { "over_n_8888_0565_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_r5g6b5 },
+    { "over_n_8888_1555_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
+    { "over_n_8888_4444_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a4r4g4b4 },
+    { "over_n_8888_2222_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a2r2g2b2 },
+    { "over_n_8888_2x10_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_x2r10g10b10 },
+    { "over_n_8888_2a10_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a2r10g10b10 },
+    { "over_8888_n_8888",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a8r8g8b8 },
+    { "over_8888_n_x888",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_x8r8g8b8 },
+    { "over_8888_n_0565",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_r5g6b5 },
+    { "over_8888_n_1555",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8_0565",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "outrev_n_8_1555",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8_x888",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "outrev_n_8_8888",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "outrev_n_8888_0565_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_r5g6b5 },
+    { "outrev_n_8888_1555_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
+    { "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+};
+
+int
+main (int argc, char *argv[])
+{
+    double x;
+    int i;
+    char *pattern = argc > 1 ? argv[1] : "all";
+
+    src = aligned_malloc (4096, BUFSIZE * 3);
+    memset (src, 0xCC, BUFSIZE * 3);
+    dst = src + (BUFSIZE / 4);
+    mask = dst + (BUFSIZE / 4);
+
+    printf ("Benchmark for a set of most commonly used functions\n");
+    printf ("---\n");
+    printf ("All results are presented in millions of pixels per second\n");
+    printf ("L1  - small Xx1 rectangle (fitting L1 cache), always blitted at the same\n");
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("L2  - small XxY rectangle (fitting L2 cache), always blitted at the same\n");
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("M   - large %dx%d rectangle, always blitted at the same\n",
+            WIDTH - 64, HEIGHT);
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("HT  - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      one %dx%d buffer to another, traversing from left to right\n",
+            WIDTH, HEIGHT);
+    printf ("      and from top to bottom\n");
+    printf ("VT  - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      one %dx%d buffer to another, traversing from top to bottom\n",
+            WIDTH, HEIGHT);
+    printf ("      and from left to right\n");
+    printf ("R   - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      random locations of one %dx%d buffer to another\n",
+            WIDTH, HEIGHT);
+    printf ("RT  - as R, but %dx%d average sized rectangles are copied\n",
+            TINYWIDTH, TINYWIDTH);
+    printf ("---\n");
+    bandwidth = x = bench_memcpy ();
+    printf ("reference memcpy speed = %.1fMB/s (%.1fMP/s for 32bpp fills)\n",
+            x / 1000000., x / 4000000);
+    printf ("---\n");
+
+    for (i = 0; i < sizeof(tests_tbl) / sizeof(tests_tbl[0]); i++)
+    {
+       if (strcmp (pattern, "all") == 0 || strstr (tests_tbl[i].testname, pattern))
+       {
+           bench_composite (tests_tbl[i].testname,
+                            tests_tbl[i].src_fmt,
+                            tests_tbl[i].src_flags,
+                            tests_tbl[i].op,
+                            tests_tbl[i].mask_fmt,
+                            tests_tbl[i].mask_flags,
+                            tests_tbl[i].dst_fmt,
+                            bandwidth/8);
+       }
+    }
+
+    free (src);
+    return 0;
+}
diff --git a/test/oob-test.c b/test/oob-test.c
new file mode 100644 (file)
index 0000000..4f9e5a2
--- /dev/null
@@ -0,0 +1,106 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+
+typedef struct
+{
+    int                                width;
+    int                                height;
+    int                                stride;
+    pixman_format_code_t       format;
+    
+} image_info_t;
+
+typedef struct
+{
+    pixman_op_t                op;
+    
+    image_info_t       src;
+    image_info_t       dest;
+
+    int                        src_x;
+    int                        src_y;
+    int                        dest_x;
+    int                        dest_y;
+    int                        width;
+    int                        height;
+} composite_info_t;
+
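+/* Composite cases, most of which position the requested area partly
+ * outside the source or destination bounds, to check that such accesses
+ * are clipped instead of running off the buffers.
+ */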
+const composite_info_t info[] =
+{
+    {
+       PIXMAN_OP_SRC,
+       {  3, 6, 16, PIXMAN_a8r8g8b8 },
+       {  5, 7, 20, PIXMAN_x8r8g8b8 },
+       1, 8,
+       1, -1,
+       1, 8
+    },
+    {
+       PIXMAN_OP_SRC,
+       { 7, 5, 36, PIXMAN_a8r8g8b8 },
+       { 6, 5, 28, PIXMAN_x8r8g8b8 },
+       8, 5,
+       5, 3,
+       1, 2
+    },
+    {
+       PIXMAN_OP_OVER,
+       { 10, 10, 40, PIXMAN_a2b10g10r10 },
+       { 10, 10, 40, PIXMAN_a2b10g10r10 },
+       0, 0,
+       0, 0,
+       10, 10
+    },
+    {
+       PIXMAN_OP_OVER,
+       { 10, 10, 40, PIXMAN_x2b10g10r10 },
+       { 10, 10, 40, PIXMAN_x2b10g10r10 },
+       0, 0,
+       0, 0,
+       10, 10
+    },
+};
+
+static pixman_image_t *
+make_image (const image_info_t *info)
+{
+    char *data = malloc (info->stride * info->height);
+    int i;
+
+    for (i = 0; i < info->height * info->stride; ++i)
+       data[i] = (i % 255) ^ (((i % 16) << 4) | (i & 0xf0));
+
+    return pixman_image_create_bits (info->format, info->width, info->height, (uint32_t *)data, info->stride);
+}
+    
+static void
+test_composite (const composite_info_t *info)
+{
+    pixman_image_t *src = make_image (&info->src);
+    pixman_image_t *dest = make_image (&info->dest);
+
+    pixman_image_composite (info->op, src, NULL, dest,
+                           info->src_x, info->src_y,
+                           0, 0,
+                           info->dest_x, info->dest_y,
+                           info->width, info->height);
+
+    pixman_image_unref (src);
+    pixman_image_unref (dest);
+}
+
+int
+main (int argc, char **argv)
+{
+    int i;
+
+    for (i = 0; i < sizeof (info) / sizeof (info[0]); ++i)
+       test_composite (&info[i]);
+    
+    return 0;
+}
diff --git a/test/pdf-op-test.c b/test/pdf-op-test.c
new file mode 100644 (file)
index 0000000..99cb7df
--- /dev/null
@@ -0,0 +1,87 @@
+#include <stdlib.h>
+#include "utils.h"
+
+static const pixman_op_t pdf_ops[] =
+{
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY
+};
+
+static const uint32_t pixels[] =
+{
+    0x00808080,
+    0x80123456,
+    0x00000000,
+    0xffffffff,
+    0x00ffffff,
+    0x80808080,
+    0x00123456,
+};
+
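+/* Composite every PDF blend-mode operator with every (source, mask,
+ * destination) combination of the pixels above, using 1x1 a8r8g8b8
+ * images. Floating point exceptions are enabled, so division by zero
+ * and similar problems in the blend-mode code will trap. */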
+int
+main ()
+{
+    int o, s, m, d;
+
+    enable_fp_exceptions();
+
+    for (o = 0; o < ARRAY_LENGTH (pdf_ops); ++o)
+    {
+       pixman_op_t op = pdf_ops[o];
+
+       for (s = 0; s < ARRAY_LENGTH (pixels); ++s)
+       {
+           pixman_image_t *src;
+
+           src = pixman_image_create_bits (
+               PIXMAN_a8r8g8b8, 1, 1, (uint32_t *)&(pixels[s]), 4);
+
+           for (m = -1; m < ARRAY_LENGTH (pixels); ++m)
+           {
+               pixman_image_t *msk = NULL;
+               if (m >= 0)
+               {
+                   msk = pixman_image_create_bits (
+                       PIXMAN_a8r8g8b8, 1, 1, (uint32_t *)&(pixels[m]), 4);
+               }
+
+               for (d = 0; d < ARRAY_LENGTH (pixels); ++d)
+               {
+                   pixman_image_t *dst;
+                   uint32_t dp = pixels[d];
+
+                   dst = pixman_image_create_bits (
+                       PIXMAN_a8r8g8b8, 1, 1, &dp, 4);
+
+                   pixman_image_composite (op, src, msk, dst,
+                                           0, 0, 0, 0, 0, 0, 1, 1);
+
+                   pixman_image_unref (dst);
+               }
+               if (msk)
+                   pixman_image_unref (msk);
+           }
+
+           pixman_image_unref (src);
+       }
+    }
+
+    return 0;
+}
diff --git a/test/region-contains-test.c b/test/region-contains-test.c
new file mode 100644 (file)
index 0000000..b660fdf
--- /dev/null
@@ -0,0 +1,177 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static void
+make_random_region (pixman_region32_t *region)
+{
+    int n_boxes;
+
+    pixman_region32_init (region);
+
+    n_boxes = lcg_rand_n (64);
+    while (n_boxes--)
+    {
+       int32_t x, y;
+       uint32_t w, h;
+
+       x = (int32_t)lcg_rand_u32() >> 2;
+       y = (int32_t)lcg_rand_u32() >> 2;
+       w = lcg_rand_u32() >> 2;
+       h = lcg_rand_u32() >> 2;
+
+       pixman_region32_union_rect (region, region, x, y, w, h);
+    }
+}
+
+static void
+print_box (pixman_box32_t *box)
+{
+    printf ("    %d %d %d %d\n", box->x1, box->y1, box->x2, box->y2);
+}
+
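+/* Pick an x or y coordinate that relates to the region: usually an edge
+ * of a random box (or of the extents), possibly moved outside it or to
+ * its midpoint. */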
+static int32_t
+random_coord (pixman_region32_t *region, pixman_bool_t x)
+{
+    pixman_box32_t *b, *bb;
+    int n_boxes;
+    int begin, end;
+
+    if (lcg_rand_n (14))
+    {
+       bb = pixman_region32_rectangles (region, &n_boxes);
+       if (n_boxes == 0)
+           goto use_extent;
+       b = bb + lcg_rand_n (n_boxes);
+    }
+    else
+    {
+    use_extent:
+       b = pixman_region32_extents (region);
+       n_boxes = 1;
+    }
+
+    if (x)
+    {
+       begin = b->x1;
+       end = b->x2;
+    }
+    else
+    {
+       begin = b->y1;
+       end = b->y2;
+    }
+
+    switch (lcg_rand_n (5))
+    {
+    case 0:
+       return begin - lcg_rand_u32();
+    case 1:
+       return end + lcg_rand_u32 ();
+    case 2:
+       return end;
+    case 3:
+       return begin;
+    default:
+       return (begin + end) / 2;
+    }
+    return 0;
+}
+
+static uint32_t
+compute_crc32_u32 (uint32_t crc32, uint32_t v)
+{
+    if (!is_little_endian())
+    {
+       v = ((v & 0xff000000) >> 24)    |
+           ((v & 0x00ff0000) >> 8)     |
+           ((v & 0x0000ff00) << 8)     |
+           ((v & 0x000000ff) << 24);
+    }
+
+    return compute_crc32 (crc32, &v, sizeof (int32_t));
+}
+
+static uint32_t
+crc32_box32 (uint32_t crc32, pixman_box32_t *box)
+{
+    crc32 = compute_crc32_u32 (crc32, box->x1);
+    crc32 = compute_crc32_u32 (crc32, box->y1);
+    crc32 = compute_crc32_u32 (crc32, box->x2);
+    crc32 = compute_crc32_u32 (crc32, box->y2);
+
+    return crc32;
+}
+
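+/* Test the four corners of a random box with
+ * pixman_region32_contains_point () and the whole box with
+ * pixman_region32_contains_rectangle (), folding all results into a
+ * CRC32 for the fuzzer harness to verify. */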
+static uint32_t
+test_region_contains_rectangle (int i, int verbose)
+{
+    pixman_box32_t box;
+    pixman_box32_t rbox = { 0, 0, 0, 0 };
+    pixman_region32_t region;
+    uint32_t r, r1, r2, r3, r4, crc32;
+
+    lcg_srand (i);
+
+    make_random_region (&region);
+
+    box.x1 = random_coord (&region, TRUE);
+    box.x2 = box.x1 + lcg_rand_u32 ();
+    box.y1 = random_coord (&region, FALSE);
+    box.y2 = box.y1 + lcg_rand_u32 ();
+
+    if (verbose)
+    {
+       int n_rects;
+       pixman_box32_t *boxes;
+
+       boxes = pixman_region32_rectangles (&region, &n_rects);
+
+       printf ("region:\n");
+       while (n_rects--)
+           print_box (boxes++);
+       printf ("box:\n");
+       print_box (&box);
+    }
+
+    crc32 = 0;
+
+    r1 = pixman_region32_contains_point (&region, box.x1, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r2 = pixman_region32_contains_point (&region, box.x1, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r3 = pixman_region32_contains_point (&region, box.x2, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r4 = pixman_region32_contains_point (&region, box.x2, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+
+    r = pixman_region32_contains_rectangle (&region, &box);
+    r = (i << 8) | (r << 4) | (r1 << 3) | (r2 << 2) | (r3 << 1) | (r4 << 0);
+
+    crc32 = compute_crc32_u32 (crc32, r);
+
+    if (verbose)
+       printf ("results: %d %d %d %d %d\n", (r & 0xf0) >> 4, r1, r2, r3, r4);
+
+    pixman_region32_fini (&region);
+
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("region_contains",
+                            1000000,
+                            0xD7C297CC,
+                            test_region_contains_rectangle,
+                            argc, argv);
+}
diff --git a/test/region-test.c b/test/region-test.c
new file mode 100644 (file)
index 0000000..9d5a41e
--- /dev/null
@@ -0,0 +1,125 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+int
+main ()
+{
+    pixman_region32_t r1;
+    pixman_region32_t r2;
+    pixman_region32_t r3;
+    pixman_box32_t boxes[] = {
+       { 10, 10, 20, 20 },
+       { 30, 30, 30, 40 },
+       { 50, 45, 60, 44 },
+    };
+    pixman_box32_t boxes2[] = {
+       { 2, 6, 7, 6 },
+       { 4, 1, 6, 7 },
+    };
+    pixman_box32_t boxes3[] = {
+       { 2, 6, 7, 6 },
+       { 4, 1, 6, 1 },
+    };
+    int i, j;
+    pixman_box32_t *b;
+    pixman_image_t *image, *fill;
+    pixman_color_t white = {
+       0xffff,
+       0xffff,
+       0xffff,
+       0xffff
+    };
+
+    /* This used to go into an infinite loop before pixman-region.c
+     * was fixed to not use explicit "short" variables
+     */
+    pixman_region32_init_rect (&r1, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r2, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r3, 0, 0, 20, 64000);
+
+    pixman_region32_subtract (&r1, &r2, &r3);
+
+    /* This would produce a region containing an empty
+     * rectangle in it. Such regions are considered malformed,
+     * but using an empty rectangle for initialization should
+     * work.
+     */
+    pixman_region32_init_rects (&r1, boxes, 3);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+    
+    while (i--)
+    {
+       assert (b[i].x1 < b[i].x2);
+       assert (b[i].y1 < b[i].y2);
+    }
+
+    /* This would produce a rectangle containing the bounding box
+     * of the two rectangles. The correct result is to eliminate
+     * the broken rectangle.
+     */
+    pixman_region32_init_rects (&r1, boxes2, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+
+    assert (b[0].x1 == 4);
+    assert (b[0].y1 == 1);
+    assert (b[0].x2 == 6);
+    assert (b[0].y2 == 7);
+
+    /* This should produce an empty region */
+    pixman_region32_init_rects (&r1, boxes3, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 0);
+
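+    /* Render a random region through an a1 mask and check that
+     * pixman_region32_init_from_image () recovers exactly the same
+     * region. */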
+    fill = pixman_image_create_solid_fill (&white);
+    for (i = 0; i < 100; i++)
+    {
+       int image_size = 128;
+
+       pixman_region32_init (&r1);
+
+       /* Add some random rectangles */
+       for (j = 0; j < 64; j++)
+           pixman_region32_union_rect (&r1, &r1,
+                                       lcg_rand_n (image_size),
+                                       lcg_rand_n (image_size),
+                                       lcg_rand_n (25),
+                                       lcg_rand_n (25));
+
+       /* Clip to image size */
+       pixman_region32_init_rect (&r2, 0, 0, image_size, image_size);
+       pixman_region32_intersect (&r1, &r1, &r2);
+       pixman_region32_fini (&r2);
+
+       /* render region to a1 mask */
+       image = pixman_image_create_bits (PIXMAN_a1, image_size, image_size, NULL, 0);
+       pixman_image_set_clip_region32 (image, &r1);
+       pixman_image_composite32 (PIXMAN_OP_SRC,
+                                 fill, NULL, image,
+                                 0, 0, 0, 0, 0, 0,
+                                 image_size, image_size);
+       pixman_region32_init_from_image (&r2, image);
+
+       pixman_image_unref (image);
+
+       assert (pixman_region32_equal (&r1, &r2));
+       pixman_region32_fini (&r1);
+       pixman_region32_fini (&r2);
+
+    }
+    pixman_image_unref (fill);
+
+    return 0;
+}
diff --git a/test/region-translate-test.c b/test/region-translate-test.c
new file mode 100644 (file)
index 0000000..0e96a5e
--- /dev/null
@@ -0,0 +1,31 @@
+#include <pixman.h>
+#include <assert.h>
+
+/* Pixman had a bug where 32-bit regions were clipped to 16-bit sizes when
+ * pixman_region32_translate() was called. This test exercises that bug:
+ * translating a large region away and back should leave it unchanged.
+ */
+
+#define LARGE 32000
+
+int
+main (int argc, char **argv)
+{
+  pixman_box32_t rect = { -LARGE, -LARGE, LARGE, LARGE };
+  pixman_region32_t r1, r2;
+
+  pixman_region32_init_rects (&r1, &rect, 1);
+  pixman_region32_init_rect (&r2, rect.x1, rect.y1, rect.x2 - rect.x1, rect.y2 - rect.y1);
+
+  assert (pixman_region32_equal (&r1,  &r2));
+
+  pixman_region32_translate (&r1, -LARGE, LARGE);
+  pixman_region32_translate (&r1, LARGE, -LARGE);
+
+  assert (pixman_region32_equal (&r1,  &r2));
+
+  pixman_region32_fini (&r1);
+  pixman_region32_fini (&r2);
+
+  return 0;
+}
diff --git a/test/scaling-crash-test.c b/test/scaling-crash-test.c
new file mode 100644 (file)
index 0000000..40323d4
--- /dev/null
@@ -0,0 +1,220 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "pixman.h"
+
+/*
+ * We have a source image filled with a solid color, NORMAL, PAD or
+ * REFLECT repeat set, and some transform which results in scaling.
+ *
+ * The expected result is either that the destination image is filled with
+ * this solid color or, if the transformation is such that we can't
+ * composite anything at all, that nothing has changed in the destination.
+ *
+ * The surrounding memory of the source image is a different solid color
+ * so that we are sure to get failures if we access it.
+ */
+static int
+run_test (int32_t              dst_width,
+         int32_t               dst_height,
+         int32_t               src_width,
+         int32_t               src_height,
+         int32_t               src_x,
+         int32_t               src_y,
+         int32_t               scale_x,
+         int32_t               scale_y,
+         pixman_filter_t       filter,
+         pixman_repeat_t       repeat)
+{
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    pixman_box32_t     box = { 0, 0, src_width, src_height };
+    pixman_color_t     color_cc = { 0xcccc, 0xcccc, 0xcccc, 0xcccc };
+    int result;
+    int i;
+
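+    /* 5x5 convolution kernel: the first two entries give the kernel size,
+     * and each weight is one fixed-point unit above 1/25, so the weights
+     * sum to slightly more than pixman_fixed_1. */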
+    static const pixman_fixed_t kernel[] =
+    {
+#define D(f)   (pixman_double_to_fixed (f) + 0x0001)
+
+       pixman_int_to_fixed (5),
+       pixman_int_to_fixed (5),
+       D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+       D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+       D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+       D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+       D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0)
+    };
+
+    result = 0;
+
+    srcbuf = (uint32_t *)malloc ((src_width + 10) * (src_height + 10) * 4);
+    dstbuf = (uint32_t *)malloc (dst_width * dst_height * 4);
+
+    memset (srcbuf, 0x88, (src_width + 10) * (src_height + 10) * 4);
+    memset (dstbuf, 0x33, dst_width * dst_height * 4);
+
+    src_img = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, src_width, src_height,
+       srcbuf + (src_width + 10) * 5 + 5, (src_width + 10) * 4);
+
+    pixman_image_fill_boxes (PIXMAN_OP_SRC, src_img, &color_cc, 1, &box);
+
+    dst_img = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, dst_width, dst_height, dstbuf, dst_width * 4);
+
+    pixman_transform_init_scale (&transform, scale_x, scale_y);
+    pixman_image_set_transform (src_img, &transform);
+    pixman_image_set_repeat (src_img, repeat);
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+       pixman_image_set_filter (src_img, filter, kernel, 27);
+    else
+       pixman_image_set_filter (src_img, filter, NULL, 0);
+
+    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, 0, 0, dst_width, dst_height);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    for (i = 0; i < dst_width * dst_height; i++)
+    {
+       if (dstbuf[i] != 0xCCCCCCCC && dstbuf[i] != 0x33333333)
+       {
+           result = 1;
+           break;
+       }
+    }
+
+    free (srcbuf);
+    free (dstbuf);
+    return result;
+}
+
+typedef struct filter_info_t filter_info_t;
+struct filter_info_t
+{
+    pixman_filter_t value;
+    char name[28];
+};
+
+static const filter_info_t filters[] =
+{
+    { PIXMAN_FILTER_NEAREST, "NEAREST" },
+    { PIXMAN_FILTER_BILINEAR, "BILINEAR" },
+    { PIXMAN_FILTER_CONVOLUTION, "CONVOLUTION" },
+};
+
+typedef struct repeat_info_t repeat_info_t;
+struct repeat_info_t
+{
+    pixman_repeat_t value;
+    char name[28];
+};
+
+
+static const repeat_info_t repeats[] =
+{
+    { PIXMAN_REPEAT_PAD, "PAD" },
+    { PIXMAN_REPEAT_REFLECT, "REFLECT" },
+    { PIXMAN_REPEAT_NORMAL, "NORMAL" }
+};
+
+static int
+do_test (int32_t               dst_size,
+        int32_t                src_size,
+        int32_t                src_offs,
+        int32_t                scale_factor)
+{
+#define N_ELEMENTS(a)  (sizeof (a) / sizeof ((a)[0]))
+    int i, j;
+
+    for (i = 0; i < N_ELEMENTS(filters); ++i)
+    {
+       for (j = 0; j < N_ELEMENTS (repeats); ++j)
+       {
+           /* horizontal test */
+           if (run_test (dst_size, 1,
+                         src_size, 1,
+                         src_offs, 0,
+                         scale_factor, 65536,
+                         filters[i].value,
+                         repeats[j].value) != 0)
+           {
+               printf ("Vertical test failed with %s filter and repeat mode %s\n",
+                       filters[i].name, repeats[j].name);
+
+               return 1;
+           }
+
+           /* vertical test */
+           if (run_test (1, dst_size,
+                         1, src_size,
+                         0, src_offs,
+                         65536, scale_factor,
+                         filters[i].value,
+                         repeats[j].value) != 0)
+           {
+               printf ("Vertical test failed with %s filter and repeat mode %s\n",
+                       filters[i].name, repeats[j].name);
+
+               return 1;
+           }
+       }
+    }
+
+    return 0;
+}
+
+int
+main (int argc, char *argv[])
+{
+    int i;
+
+    pixman_disable_out_of_bounds_workaround ();
+
+    /* can potentially crash */
+    assert (do_test (
+               48000, 32767, 1, 65536 * 128) == 0);
+
+    /* can potentially get into an infinite loop */
+    assert (do_test (
+               16384, 65536, 32, 32768) == 0);
+
+    /* can potentially access memory outside source image buffer */
+    assert (do_test (
+               10, 10, 0, 1) == 0);
+    assert (do_test (
+               10, 10, 0, 0) == 0);
+
+    for (i = 0; i < 100; ++i)
+    {
+       pixman_fixed_t one_seventh =
+           (((pixman_fixed_48_16_t)pixman_fixed_1) << 16) / (7 << 16);
+
+       assert (do_test (
+                   1, 7, 3, one_seventh + i - 50) == 0);
+    }
+
+    for (i = 0; i < 100; ++i)
+    {
+       pixman_fixed_t scale =
+           (((pixman_fixed_48_16_t)pixman_fixed_1) << 16) / (32767 << 16);
+
+       assert (do_test (
+                   1, 32767, 16383, scale + i - 50) == 0);
+    }
+
+    /* can potentially produce invalid results (out-of-range matrix values) */
+    assert (do_test (
+       48000, 32767, 16384, 65536 * 128) == 0);
+
+    return 0;
+}
diff --git a/test/scaling-helpers-test.c b/test/scaling-helpers-test.c
new file mode 100755 (executable)
index 0000000..33ec47c
--- /dev/null
@@ -0,0 +1,91 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utils.h"
+#include "pixman-inlines.h"
+
+/* A trivial reference implementation for
+ * 'bilinear_pad_repeat_get_scanline_bounds'
+ */
+static void
+bilinear_pad_repeat_get_scanline_bounds_ref (int32_t        source_image_width,
+                                            pixman_fixed_t vx_,
+                                            pixman_fixed_t unit_x,
+                                            int32_t *      left_pad,
+                                            int32_t *      left_tz,
+                                            int32_t *      width,
+                                            int32_t *      right_tz,
+                                            int32_t *      right_pad)
+{
+    int w = *width;
+    int64_t vx = vx_;
+    *left_pad = 0;
+    *left_tz = 0;
+    *width = 0;
+    *right_tz = 0;
+    *right_pad = 0;
+    while (--w >= 0)
+    {
+       if (vx < 0)
+       {
+           if (vx + pixman_fixed_1 < 0)
+               *left_pad += 1;
+           else
+               *left_tz += 1;
+       }
+       else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width))
+       {
+           if (vx >= pixman_int_to_fixed (source_image_width))
+               *right_pad += 1;
+           else
+               *right_tz += 1;
+       }
+       else
+       {
+           *width += 1;
+       }
+       vx += unit_x;
+    }
+}
+
+int
+main (void)
+{
+    int i;
+    for (i = 0; i < 10000; i++)
+    {
+       int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1;
+       int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2;
+       pixman_fixed_t vx = lcg_rand_N(10000 << 16) - (3000 << 16);
+       int32_t width = lcg_rand_N(10000);
+       int32_t source_image_width = lcg_rand_N(10000) + 1;
+       pixman_fixed_t unit_x = lcg_rand_N(10 << 16) + 1;
+       width1 = width2 = width;
+
+       bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width,
+                                                    vx,
+                                                    unit_x,
+                                                    &left_pad1,
+                                                    &left_tz1,
+                                                    &width1,
+                                                    &right_tz1,
+                                                    &right_pad1);
+
+       bilinear_pad_repeat_get_scanline_bounds (source_image_width,
+                                                vx,
+                                                unit_x,
+                                                &left_pad2,
+                                                &left_tz2,
+                                                &width2,
+                                                &right_tz2,
+                                                &right_pad2);
+
+       assert (left_pad1 == left_pad2);
+       assert (left_tz1 == left_tz2);
+       assert (width1 == width2);
+       assert (right_tz1 == right_tz2);
+       assert (right_pad1 == right_pad2);
+    }
+
+    return 0;
+}
diff --git a/test/scaling-test.c b/test/scaling-test.c
new file mode 100755 (executable)
index 0000000..82370f7
--- /dev/null
@@ -0,0 +1,371 @@
+/*
+ * Test program, which can detect some problems with nearest neighbour
+ * and bilinear scaling in pixman. Testing is done by running lots
+ * of random SRC, OVER and ADD compositing operations on a8r8g8b8,
+ * x8r8g8b8 and r5g6b5 color formats.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 8
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 8
+#define MAX_STRIDE     4
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+               int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   mask_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    pixman_region16_t  clip;
+    int                src_width, src_height;
+    int                mask_width, mask_height;
+    int                dst_width, dst_height;
+    int                src_stride, mask_stride, dst_stride;
+    int                src_x, src_y;
+    int                mask_x, mask_y;
+    int                dst_x, dst_y;
+    int                src_bpp;
+    int                mask_bpp = 1;
+    int                dst_bpp;
+    int                w, h;
+    pixman_fixed_t     scale_x = 65536, scale_y = 65536;
+    pixman_fixed_t     translate_x = 0, translate_y = 0;
+    pixman_fixed_t     mask_scale_x = 65536, mask_scale_y = 65536;
+    pixman_fixed_t     mask_translate_x = 0, mask_translate_y = 0;
+    pixman_op_t        op;
+    pixman_repeat_t    repeat = PIXMAN_REPEAT_NONE;
+    pixman_repeat_t    mask_repeat = PIXMAN_REPEAT_NONE;
+    pixman_format_code_t src_fmt, dst_fmt;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    uint32_t *         maskbuf;
+    uint32_t           crc32;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    lcg_srand (testnum);
+
+    src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4;
+    switch (lcg_rand_n (3))
+    {
+    case 0:
+       op = PIXMAN_OP_SRC;
+       break;
+    case 1:
+       op = PIXMAN_OP_OVER;
+       break;
+    default:
+       op = PIXMAN_OP_ADD;
+       break;
+    }
+
+    src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+    src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+
+    if (lcg_rand_n (2))
+    {
+       mask_width = lcg_rand_n (MAX_SRC_WIDTH) + 1;
+       mask_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1;
+    }
+    else
+    {
+       mask_width = mask_height = 1;
+    }
+
+    dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1;
+    dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1;
+    src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp;
+    mask_stride = mask_width * mask_bpp + lcg_rand_n (MAX_STRIDE) * mask_bpp;
+    dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp;
+
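+    /* pixman_image_create_bits () requires strides to be a multiple of 4
+     * bytes: src and dst strides are already even (2 or 4 bytes per pixel),
+     * the a8 mask stride can be odd, so round them all up as needed. */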
+    if (src_stride & 3)
+       src_stride += 2;
+
+    if (mask_stride & 1)
+       mask_stride += 1;
+    if (mask_stride & 2)
+       mask_stride += 2;
+
+    if (dst_stride & 3)
+       dst_stride += 2;
+
+    src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2);
+    src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2);
+    mask_x = -(mask_width / 4) + lcg_rand_n (mask_width * 3 / 2);
+    mask_y = -(mask_height / 4) + lcg_rand_n (mask_height * 3 / 2);
+    dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2);
+    dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2);
+    w = lcg_rand_n (dst_width * 3 / 2 - dst_x);
+    h = lcg_rand_n (dst_height * 3 / 2 - dst_y);
+
+    srcbuf = (uint32_t *)malloc (src_stride * src_height);
+    maskbuf = (uint32_t *)malloc (mask_stride * mask_height);
+    dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+    for (i = 0; i < src_stride * src_height; i++)
+       *((uint8_t *)srcbuf + i) = lcg_rand_n (256);
+
+    for (i = 0; i < mask_stride * mask_height; i++)
+       *((uint8_t *)maskbuf + i) = lcg_rand_n (256);
+
+    for (i = 0; i < dst_stride * dst_height; i++)
+       *((uint8_t *)dstbuf + i) = lcg_rand_n (256);
+
+    src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    src_img = pixman_image_create_bits (
+        src_fmt, src_width, src_height, srcbuf, src_stride);
+
+    mask_img = pixman_image_create_bits (
+        PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride);
+
+    dst_img = pixman_image_create_bits (
+        dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+    image_endian_swap (src_img);
+    image_endian_swap (dst_img);
+
+    if (lcg_rand_n (4) > 0)
+    {
+       scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
+       scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
+       translate_x = lcg_rand_N (65536);
+       translate_y = lcg_rand_N (65536);
+       pixman_transform_init_scale (&transform, scale_x, scale_y);
+       pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+       pixman_image_set_transform (src_img, &transform);
+    }
+
+    if (lcg_rand_n (2) > 0)
+    {
+       mask_scale_x = -32768 * 3 + lcg_rand_N (65536 * 5);
+       mask_scale_y = -32768 * 3 + lcg_rand_N (65536 * 5);
+       mask_translate_x = lcg_rand_N (65536);
+       mask_translate_y = lcg_rand_N (65536);
+       pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y);
+       pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y);
+       pixman_image_set_transform (mask_img, &transform);
+    }
+
+    switch (lcg_rand_n (4))
+    {
+    case 0:
+       mask_repeat = PIXMAN_REPEAT_NONE;
+       break;
+
+    case 1:
+       mask_repeat = PIXMAN_REPEAT_NORMAL;
+       break;
+
+    case 2:
+       mask_repeat = PIXMAN_REPEAT_PAD;
+       break;
+
+    case 3:
+       mask_repeat = PIXMAN_REPEAT_REFLECT;
+       break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (mask_img, mask_repeat);
+
+    switch (lcg_rand_n (4))
+    {
+    case 0:
+       repeat = PIXMAN_REPEAT_NONE;
+       break;
+
+    case 1:
+       repeat = PIXMAN_REPEAT_NORMAL;
+       break;
+
+    case 2:
+       repeat = PIXMAN_REPEAT_PAD;
+       break;
+
+    case 3:
+       repeat = PIXMAN_REPEAT_REFLECT;
+       break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (src_img, repeat);
+
+    if (lcg_rand_n (2))
+       pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+       pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (lcg_rand_n (2))
+       pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+       pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (verbose)
+    {
+       printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
+       printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n",
+               op, scale_x, scale_y, repeat);
+       printf ("translate_x=%d, translate_y=%d\n",
+               translate_x, translate_y);
+       printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+               src_width, src_height, dst_width, dst_height);
+       printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+               src_x, src_y, dst_x, dst_y);
+       printf ("w=%d, h=%d\n", w, h);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+       pixman_box16_t clip_boxes[2];
+       int            n = lcg_rand_n (2) + 1;
+
+       for (i = 0; i < n; i++)
+       {
+           clip_boxes[i].x1 = lcg_rand_n (src_width);
+           clip_boxes[i].y1 = lcg_rand_n (src_height);
+           clip_boxes[i].x2 =
+               clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1);
+           clip_boxes[i].y2 =
+               clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1);
+
+           if (verbose)
+           {
+               printf ("source clip box: [%d,%d-%d,%d]\n",
+                       clip_boxes[i].x1, clip_boxes[i].y1,
+                       clip_boxes[i].x2, clip_boxes[i].y2);
+           }
+       }
+
+       pixman_region_init_rects (&clip, clip_boxes, n);
+       pixman_image_set_clip_region (src_img, &clip);
+       pixman_image_set_source_clipping (src_img, 1);
+       pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+       pixman_box16_t clip_boxes[2];
+       int            n = lcg_rand_n (2) + 1;
+
+       for (i = 0; i < n; i++)
+       {
+           clip_boxes[i].x1 = lcg_rand_n (mask_width);
+           clip_boxes[i].y1 = lcg_rand_n (mask_height);
+           clip_boxes[i].x2 =
+               clip_boxes[i].x1 + lcg_rand_n (mask_width - clip_boxes[i].x1);
+           clip_boxes[i].y2 =
+               clip_boxes[i].y1 + lcg_rand_n (mask_height - clip_boxes[i].y1);
+
+           if (verbose)
+           {
+               printf ("mask clip box: [%d,%d-%d,%d]\n",
+                       clip_boxes[i].x1, clip_boxes[i].y1,
+                       clip_boxes[i].x2, clip_boxes[i].y2);
+           }
+       }
+
+       pixman_region_init_rects (&clip, clip_boxes, n);
+       pixman_image_set_clip_region (mask_img, &clip);
+       pixman_image_set_source_clipping (mask_img, 1);
+       pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (8) == 0)
+    {
+       pixman_box16_t clip_boxes[2];
+       int            n = lcg_rand_n (2) + 1;
+       for (i = 0; i < n; i++)
+       {
+           clip_boxes[i].x1 = lcg_rand_n (dst_width);
+           clip_boxes[i].y1 = lcg_rand_n (dst_height);
+           clip_boxes[i].x2 =
+               clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1);
+           clip_boxes[i].y2 =
+               clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1);
+
+           if (verbose)
+           {
+               printf ("destination clip box: [%d,%d-%d,%d]\n",
+                       clip_boxes[i].x1, clip_boxes[i].y1,
+                       clip_boxes[i].x2, clip_boxes[i].y2);
+           }
+       }
+       pixman_region_init_rects (&clip, clip_boxes, n);
+       pixman_image_set_clip_region (dst_img, &clip);
+       pixman_region_fini (&clip);
+    }
+
+    if (lcg_rand_n (2) == 0)
+       pixman_image_composite (op, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+    else
+       pixman_image_composite (op, src_img, mask_img, dst_img,
+                            src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    if (dst_fmt == PIXMAN_x8r8g8b8)
+    {
+       /* ignore unused part */
+       for (i = 0; i < dst_stride * dst_height / 4; i++)
+           dstbuf[i] &= 0xFFFFFF;
+    }
+
+    image_endian_swap (dst_img);
+
+    if (verbose)
+    {
+       int j;
+       
+       for (i = 0; i < dst_height; i++)
+       {
+           for (j = 0; j < dst_stride; j++)
+               printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
+
+           printf ("\n");
+       }
+    }
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (mask_img);
+    pixman_image_unref (dst_img);
+
+    crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+    free (srcbuf);
+    free (maskbuf);
+    free (dstbuf);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    pixman_disable_out_of_bounds_workaround ();
+
+    return fuzzer_test_main("scaling", 8000000, 0x80DF1CB2,
+                           test_composite, argc, argv);
+}
diff --git a/test/stress-test.c b/test/stress-test.c
new file mode 100755 (executable)
index 0000000..571420a
--- /dev/null
@@ -0,0 +1,883 @@
+#include <stdio.h>
+#include "utils.h"
+#include <sys/types.h>
+
+#if 0
+#define fence_malloc malloc
+#define fence_free free
+#define make_random_bytes malloc
+#endif
+
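+/* Formats to exercise; a few common ones appear more than once, which
+ * presumably weights them more heavily in the random selection. */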
+static const pixman_format_code_t image_formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_r3g3b2,
+    PIXMAN_a8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_a1
+};
+
+static pixman_filter_t filters[] =
+{
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_CONVOLUTION
+};
+
+static int
+get_size (void)
+{
+    switch (lcg_rand_n (28))
+    {
+    case 0:
+       return 1;
+
+    case 1:
+       return 2;
+
+    default:
+    case 2:
+       return lcg_rand_n (200);
+
+    case 4:
+       return lcg_rand_n (2000) + 1000;
+
+    case 5:
+       return 65535;
+
+    case 6:
+       return 65536;
+
+    case 7:
+       return lcg_rand_N (64000) + 63000;
+    }
+}
+
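+/* Destroy function for bits images: free pixel buffers that pixman does
+ * not own itself, undoing the offset applied for negative strides and
+ * skipping the fake 0x01 pointer, then free the palette passed as data. */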
+static void
+destroy (pixman_image_t *image, void *data)
+{
+    if (image->type == BITS && image->bits.free_me != image->bits.bits)
+    {
+       uint32_t *bits;
+
+       if (image->bits.bits != (void *)0x01)
+       {
+           bits = image->bits.bits;
+
+           if (image->bits.rowstride < 0)
+               bits -= (- image->bits.rowstride * (image->bits.height - 1));
+
+           fence_free (bits);
+       }
+    }
+
+    free (data);
+}
+
+static uint32_t
+real_reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+       return *(uint8_t *)src;
+    case 2:
+       return *(uint16_t *)src;
+    case 4:
+       return *(uint32_t *)src;
+    default:
+       assert (0);
+       return 0; /* silence MSVC */
+    }
+}
+
+static void
+real_writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+       *(uint8_t *)src = value;
+       break;
+
+    case 2:
+       *(uint16_t *)src = value;
+       break;
+
+    case 4:
+       *(uint32_t *)src = value;
+       break;
+
+    default:
+       assert (0);
+       break;
+    }
+}
+
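+/* Accessors for images whose bits pointer is bogus: reads return random
+ * data and writes go nowhere, so such images can be composited without
+ * touching real memory. */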
+static uint32_t
+fake_reader (const void *src, int size)
+{
+    uint32_t r = lcg_rand_u32 ();
+
+    assert (size == 1 || size == 2 || size == 4);
+    return r & ((1 << (size * 8)) - 1);
+}
+
+static void
+fake_writer (void *src, uint32_t value, int size)
+{
+    assert (size == 1 || size == 2 || size == 4);
+}
+
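+/* Random number with a random magnitude: pick a random bit width first,
+ * then a value of that many bits, roughly centered around zero. */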
+static int32_t
+log_rand (void)
+{
+    uint32_t mask;
+
+    mask = (1 << lcg_rand_n (31)) - 1;
+
+    return (lcg_rand () & mask) - (mask >> 1);
+}
+
+static pixman_image_t *
+create_random_bits_image (void)
+{
+    pixman_format_code_t format;
+    pixman_indexed_t *indexed;
+    pixman_image_t *image;
+    int width, height, stride;
+    uint32_t *bits;
+    pixman_read_memory_func_t read_func = NULL;
+    pixman_write_memory_func_t write_func = NULL;
+    pixman_filter_t filter;
+    pixman_fixed_t *coefficients = NULL;
+    int n_coefficients = 0;
+
+    /* format */
+    format = image_formats[lcg_rand_n (ARRAY_LENGTH (image_formats))];
+
+    indexed = NULL;
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+       indexed = malloc (sizeof (pixman_indexed_t));
+
+       initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), TRUE);
+    }
+    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+       indexed = malloc (sizeof (pixman_indexed_t));
+
+       initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), FALSE);
+    }
+    else
+    {
+       indexed = NULL;
+    }
+
+    /* size */
+    width = get_size ();
+    height = get_size ();
+
+    if ((uint64_t)width * height > 200000)
+    {
+       if (lcg_rand_n(2) == 0)
+           height = 200000 / width;
+       else
+           width = 200000 / height;
+    }
+
+    if (height == 0)
+       height = 1;
+    if (width == 0)
+       width = 1;
+
+    /* bits */
+    switch (lcg_rand_n (7))
+    {
+    default:
+    case 0:
+       stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+       stride = (stride + 3) & (~3);
+       bits = (uint32_t *)make_random_bytes (height * stride);
+       break;
+
+    case 1:
+       stride = 0;
+       bits = NULL;
+       break;
+
+    case 2: /* Zero-filled */
+       stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+       stride = (stride + 3) & (~3);
+       bits = fence_malloc (height * stride);
+       if (!bits)
+           return NULL;
+       memset (bits, 0, height * stride);
+       break;
+
+    case 3: /* Filled with 0xFF */
+       stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+       stride = (stride + 3) & (~3);
+       bits = fence_malloc (height * stride);
+       if (!bits)
+           return NULL;
+       memset (bits, 0xff, height * stride);
+       break;
+
+    case 4: /* bits is a bad pointer, has read/write functions */
+       stride = 232;
+       bits = (void *)0x01;
+       read_func = fake_reader;
+       write_func = fake_writer;
+       break;
+
+    case 5: /* bits is a real pointer, has read/write functions */
+       stride = width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17);
+       stride = (stride + 3) & (~3);
+       bits = fence_malloc (height * stride);
+       if (!bits)
+           return NULL;
+       memset (bits, 0xff, height * stride);
+       read_func = real_reader;
+       write_func = real_writer;
+       break;
+
+    case 6: /* bits is a real pointer, stride is negative */
+       stride = (width * PIXMAN_FORMAT_BPP (format) + lcg_rand_n (17));
+       stride = (stride + 3) & (~3);
+       bits = (uint32_t *)make_random_bytes (height * stride);
+       if (!bits)
+           return NULL;
+       bits += ((height - 1) * stride) / 4;
+       stride = - stride;
+       break;
+    }
+
+    /* Filter */
+    filter = filters[lcg_rand_n (ARRAY_LENGTH (filters))];
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+    {
+       int width = lcg_rand_n (17);
+       int height = lcg_rand_n (19);
+
+       n_coefficients = width * height + 2;
+       coefficients = malloc (n_coefficients * sizeof (pixman_fixed_t));
+
+       if (coefficients)
+       {
+           int i;
+
+           for (i = 0; i < width * height; ++i)
+               coefficients[i + 2] = lcg_rand_u32();
+
+           coefficients[0] = width << 16;
+           coefficients[1] = height << 16;
+       }
+       else
+       {
+           filter = PIXMAN_FILTER_BEST;
+       }
+    }
+
+    /* Finally create the image */
+    image = pixman_image_create_bits (format, width, height, bits, stride);
+    if (!image)
+       return NULL;
+
+    pixman_image_set_indexed (image, indexed);
+    pixman_image_set_destroy_function (image, destroy, indexed);
+    pixman_image_set_accessors (image, read_func, write_func);
+    pixman_image_set_filter (image, filter, coefficients, n_coefficients);
+
+    return image;
+}
+
+static pixman_repeat_t repeats[] =
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+
+static uint32_t
+absolute (int32_t i)
+{
+    return i < 0? -i : i;
+}
+
+static void
+set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
+{
+    pixman_repeat_t repeat;
+
+    /* Set properties that are generic to all images */
+
+    /* Repeat */
+    repeat = repeats[lcg_rand_n (ARRAY_LENGTH (repeats))];
+    pixman_image_set_repeat (image, repeat);
+
+    /* Alpha map */
+    if (allow_alpha_map && lcg_rand_n (3) == 0)
+    {
+       pixman_image_t *alpha_map;
+       int16_t x, y;
+
+       alpha_map = create_random_bits_image ();
+
+       if (alpha_map)
+       {
+           set_general_properties (alpha_map, FALSE);
+
+           x = lcg_rand_N (100000) - 65536;
+           y = lcg_rand_N (100000) - 65536;
+
+           pixman_image_set_alpha_map (image, alpha_map, x, y);
+
+           pixman_image_unref (alpha_map);
+       }
+    }
+
+    /* Component alpha */
+    pixman_image_set_component_alpha (image, lcg_rand_n (3) == 0);
+
+    /* Clip region */
+    if (lcg_rand_n (8) != 0)
+    {
+       pixman_region32_t region;
+       int i, n_rects;
+
+       pixman_region32_init (&region);
+
+       switch (lcg_rand_n (10))
+       {
+       case 0:
+           n_rects = 0;
+           break;
+
+       case 1: case 2: case 3:
+           n_rects = 1;
+           break;
+
+       case 4: case 5:
+           n_rects = 2;
+           break;
+
+       case 6: case 7:
+           n_rects = 3;
+           break;
+
+       default:
+           n_rects = lcg_rand_n (100);
+           break;
+       }
+
+       for (i = 0; i < n_rects; ++i)
+       {
+           uint32_t width, height;
+           int x, y;
+
+           x = log_rand();
+           y = log_rand();
+           width = absolute (log_rand ()) + 1;
+           height = absolute (log_rand ()) + 1;
+
+           pixman_region32_union_rect (
+               &region, &region, x, y, width, height);
+       }
+
+       pixman_image_set_clip_region32 (image, &region);
+
+       pixman_region32_fini (&region);
+    }
+
+    /* Whether source clipping is enabled */
+    pixman_image_set_source_clipping (image, !!lcg_rand_n (2));
+
+    /* Client clip */
+    pixman_image_set_has_client_clip (image, !!lcg_rand_n (2));
+
+    /* Transform */
+    if (lcg_rand_n (5) < 2)
+    {
+       pixman_transform_t xform;
+       int i, j, k;
+       uint32_t tx, ty, sx, sy;
+       uint32_t c, s;
+
+       memset (&xform, 0, sizeof xform);
+       xform.matrix[0][0] = pixman_fixed_1;
+       xform.matrix[1][1] = pixman_fixed_1;
+       xform.matrix[2][2] = pixman_fixed_1;
+
+       for (k = 0; k < 3; ++k)
+       {
+           switch (lcg_rand_n (4))
+           {
+           case 0:
+               /* rotation */
+               c = lcg_rand_N (2 * 65536) - 65536;
+               s = lcg_rand_N (2 * 65536) - 65536;
+               pixman_transform_rotate (&xform, NULL, c, s);
+               break;
+
+           case 1:
+               /* translation */
+               tx = lcg_rand_u32();
+               ty = lcg_rand_u32();
+               pixman_transform_translate (&xform, NULL, tx, ty);
+               break;
+
+           case 2:
+               /* scale */
+               sx = lcg_rand_u32();
+               sy = lcg_rand_u32();
+               pixman_transform_scale (&xform, NULL, sx, sy);
+               break;
+
+           case 3:
+               if (lcg_rand_n (16) == 0)
+               {
+                   /* random */
+                   for (i = 0; i < 3; ++i)
+                       for (j = 0; j < 3; ++j)
+                           xform.matrix[i][j] = lcg_rand_u32();
+                   break;
+               }
+               else if (lcg_rand_n (16) == 0)
+               {
+                   /* zero */
+                   memset (&xform, 0, sizeof xform);
+               }
+               break;
+           }
+       }
+
+       pixman_image_set_transform (image, &xform);
+    }
+}
+
+static pixman_color_t
+random_color (void)
+{
+    pixman_color_t color =
+    {
+       lcg_rand() & 0xffff,
+       lcg_rand() & 0xffff,
+       lcg_rand() & 0xffff,
+       lcg_rand() & 0xffff,
+    };
+
+    return color;
+}
+
+
+static pixman_image_t *
+create_random_solid_image (void)
+{
+    pixman_color_t color = random_color();
+    pixman_image_t *image = pixman_image_create_solid_fill (&color);
+
+    return image;
+}
+
+static pixman_gradient_stop_t *
+create_random_stops (int *n_stops)
+{
+    pixman_fixed_t step;
+    pixman_fixed_t s;
+    int i;
+    pixman_gradient_stop_t *stops;
+
+    *n_stops = lcg_rand_n (50) + 1;
+
+    step = pixman_fixed_1 / *n_stops;
+
+    stops = malloc (*n_stops * sizeof (pixman_gradient_stop_t));
+
+    s = 0;
+    for (i = 0; i < (*n_stops) - 1; ++i)
+    {
+       stops[i].x = s;
+       stops[i].color = random_color();
+
+       s += step;
+    }
+
+    stops[*n_stops - 1].x = pixman_fixed_1;
+    stops[*n_stops - 1].color = random_color();
+
+    return stops;
+}
+
+static pixman_point_fixed_t
+create_random_point (void)
+{
+    pixman_point_fixed_t p;
+
+    p.x = log_rand ();
+    p.y = log_rand ();
+
+    return p;
+}
+
+static pixman_image_t *
+create_random_linear_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t p1, p2;
+    pixman_image_t *result;
+
+    stops = create_random_stops (&n_stops);
+    if (!stops)
+       return NULL;
+
+    p1 = create_random_point ();
+    p2 = create_random_point ();
+
+    result = pixman_image_create_linear_gradient (&p1, &p2, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_radial_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t inner_c, outer_c;
+    pixman_fixed_t inner_r, outer_r;
+    pixman_image_t *result;
+
+    inner_c = create_random_point();
+    outer_c = create_random_point();
+    inner_r = lcg_rand();
+    outer_r = lcg_rand();
+
+    stops = create_random_stops (&n_stops);
+
+    if (!stops)
+       return NULL;
+
+    result = pixman_image_create_radial_gradient (
+       &inner_c, &outer_c, inner_r, outer_r, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_conical_image (void)
+{
+    pixman_gradient_stop_t *stops;
+    int n_stops;
+    pixman_point_fixed_t c;
+    pixman_fixed_t angle;
+    pixman_image_t *result;
+
+    c = create_random_point();
+    angle = lcg_rand();
+
+    stops = create_random_stops (&n_stops);
+
+    if (!stops)
+       return NULL;
+
+    result = pixman_image_create_conical_gradient (&c, angle, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_image (void)
+{
+    pixman_image_t *result;
+
+    switch (lcg_rand_n (5))
+    {
+    default:
+    case 0:
+       result = create_random_bits_image ();
+       break;
+
+    case 1:
+       result = create_random_solid_image ();
+       break;
+
+    case 2:
+       result = create_random_linear_image ();
+       break;
+
+    case 3:
+       result = create_random_radial_image ();
+       break;
+
+    case 4:
+       result = create_random_conical_image ();
+       break;
+    }
+
+    if (result)
+       set_general_properties (result, TRUE);
+
+    return result;
+}
+
+static const pixman_op_t op_list[] =
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+};
+
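+/* Run one fuzzing iteration: compose a random source and mask into a
+ * random bits destination with a randomly chosen operator, using
+ * 'seed' so that each iteration is reproducible.
+ */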
+static void
+run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
+{
+    pixman_image_t *source, *mask, *dest;
+    pixman_op_t op;
+
+    if (verbose)
+    {
+       if (mod == 0 || (seed % mod) == 0)
+           printf ("Seed 0x%08x\n", seed);
+    }
+
+    lcg_srand (seed);
+
+    source = create_random_image ();
+    mask   = create_random_image ();
+    dest   = create_random_bits_image ();
+
+    if (source && mask && dest)
+    {
+       set_general_properties (dest, TRUE);
+
+       op = op_list [lcg_rand_n (ARRAY_LENGTH (op_list))];
+
+       pixman_image_composite32 (op,
+                                 source, mask, dest,
+                                 log_rand(), log_rand(),
+                                 log_rand(), log_rand(),
+                                 log_rand(), log_rand(),
+                                 absolute (log_rand()),
+                                 absolute (log_rand()));
+    }
+    if (source)
+       pixman_image_unref (source);
+    if (mask)
+       pixman_image_unref (mask);
+    if (dest)
+       pixman_image_unref (dest);
+}
+
+static pixman_bool_t
+get_int (char *s, uint32_t *i)
+{
+    char *end;
+    int p;
+
+    p = strtol (s, &end, 0);
+
+    if (end != s && *end == 0)
+    {
+       *i = p;
+       return TRUE;
+    }
+
+    return FALSE;
+}
+
+int
+main (int argc, char **argv)
+{
+    int verbose = FALSE;
+    uint32_t seed = 1;
+    uint32_t n_tests = 0xffffffff;
+    uint32_t mod = 0;
+    pixman_bool_t use_threads = TRUE;
+    uint32_t i;
+
+    pixman_disable_out_of_bounds_workaround ();
+
+    enable_fp_exceptions();
+
+    if (getenv ("VERBOSE") != NULL)
+       verbose = TRUE;
+
+    for (i = 1; i < argc; ++i)
+    {
+       if (strcmp (argv[i], "-v") == 0)
+       {
+           verbose = TRUE;
+
+           if (i + 1 < argc)
+           {
+               get_int (argv[i + 1], &mod);
+               i++;
+           }
+       }
+       else if (strcmp (argv[i], "-s") == 0 && i + 1 < argc)
+       {
+           get_int (argv[i + 1], &seed);
+           use_threads = FALSE;
+           i++;
+       }
+       else if (strcmp (argv[i], "-n") == 0 && i + 1 < argc)
+       {
+           get_int (argv[i + 1], &n_tests);
+           i++;
+       }
+       else
+       {
+           if (strcmp (argv[i], "-h") != 0)
+               printf ("Unknown option '%s'\n\n", argv[i]);
+
+           printf ("Options:\n\n"
+                   "-n <number>        Number of tests to run\n"
+                   "-s <seed>          Seed of first test (ignored if PIXMAN_RANDOMIZE_TESTS is set)\n"
+                   "-v                 Print out seeds\n"
+                   "-v <n>             Print out every n'th seed\n\n");
+
+           exit (-1);
+       }
+    }
+
+    if (n_tests == 0xffffffff)
+       n_tests = 8000;
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+    {
+       seed = get_random_seed();
+       printf ("First seed: 0x%08x\n", seed);
+    }
+
+    if (use_threads)
+    {
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(verbose, n_tests, mod, seed)
+#endif
+       for (i = seed; i < seed + n_tests; ++i)
+           run_test (i, verbose, mod);
+    }
+    else
+    {
+       for (i = seed; i < seed + n_tests; ++i)
+           run_test (i, verbose, mod);
+    }
+
+    return 0;
+}
diff --git a/test/trap-crasher.c b/test/trap-crasher.c
new file mode 100755 (executable)
index 0000000..7485e62
--- /dev/null
@@ -0,0 +1,27 @@
+#include <stdlib.h>
+#include <pixman.h>
+
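+/* Regression test: the trapezoid below uses coordinates close to the
+ * maximum pixman_fixed_t value (2147483647 is roughly 32768.0 in 16.16
+ * fixed point); as the file name suggests, such extreme traps have
+ * been known to crash pixman_add_trapezoids ().
+ */
+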
+int
+main()
+{
+    pixman_image_t *dst;
+    pixman_trapezoid_t traps[1] = {
+       {
+           2147483646,
+           2147483647,
+           {
+               { 0, 0 },
+               { 0, 2147483647 }
+           },
+           {
+               { 65536, 0 },
+               { 0, 2147483647 }
+           }
+       },
+    };
+
+    dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
+
+    pixman_add_trapezoids (dst, 0, 0, sizeof (traps)/sizeof (traps[0]), traps);
+    return (0);
+}
diff --git a/test/utils.c b/test/utils.c
new file mode 100755 (executable)
index 0000000..adabd75
--- /dev/null
@@ -0,0 +1,704 @@
+#define _GNU_SOURCE
+
+#include "utils.h"
+#include <signal.h>
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_FENV_H
+#include <fenv.h>
+#endif
+
+#ifdef HAVE_LIBPNG
+#include <png.h>
+#endif
+
+/* Random number seed
+ */
+
+uint32_t lcg_seed;
+
+/*----------------------------------------------------------------------------*\
+ *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
+ *
+ *  This program generates the CRC-32 values for the files named in the
+ *  command-line arguments.  These are the same CRC-32 values used by GZIP,
+ *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
+ *  used independently.
+ *
+ *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
+ *
+ *  Based on the byte-oriented implementation "File Verification Using CRC"
+ *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
+ *
+ *  v1.0.0: original release.
+ *  v1.0.1: fixed printf formats.
+ *  v1.0.2: fixed something else.
+ *  v1.0.3: replaced CRC constant table by generator function.
+ *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
+ *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
+\*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------*\
+ *  NAME:
+ *     compute_crc32 () - computes the CRC-32 value of a memory buffer
+ *  DESCRIPTION:
+ *     Computes or accumulates the CRC-32 value for a memory buffer.
+ *     The 'in_crc32' argument gives a previously accumulated CRC-32 value
+ *     to allow a CRC to be generated for multiple sequential buffer-fuls
+ *     of data.  The 'in_crc32' for the first buffer must be zero.
+ *  ARGUMENTS:
+ *     in_crc32 - accumulated CRC-32 value, must be 0 on first call
+ *     buf      - buffer to compute CRC-32 value for
+ *     buf_len  - number of bytes in buffer
+ *  RETURNS:
+ *     crc32 - computed CRC-32 value
+ *  ERRORS:
+ *     (no errors are possible)
+\*----------------------------------------------------------------------------*/
+
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+              const void *buf,
+              size_t      buf_len)
+{
+    static const uint32_t crc_table[256] = {
+       0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+       0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+       0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+       0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+       0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
+       0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+       0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+       0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+       0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+       0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+       0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+       0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+       0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+       0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+       0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+       0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+       0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+       0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+       0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+       0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+       0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+       0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+       0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+       0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+       0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+       0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+       0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+       0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+       0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+       0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+       0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+       0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+       0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+       0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+       0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+       0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+       0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+       0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+       0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+       0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+       0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+       0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+       0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+    };
+
+    uint32_t              crc32;
+    unsigned char *       byte_buf;
+    size_t                i;
+
+    /* accumulate crc32 for buffer */
+    crc32 = in_crc32 ^ 0xFFFFFFFF;
+    byte_buf = (unsigned char*) buf;
+
+    for (i = 0; i < buf_len; i++)
+       crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
+
+    return (crc32 ^ 0xFFFFFFFF);
+}
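+
+/* Example (a sketch): as described above, a single CRC-32 can be
+ * accumulated over several sequential buffers; 'hdr' and 'body' are
+ * hypothetical names:
+ *
+ *     uint32_t crc = compute_crc32 (0, hdr, hdr_len);
+ *     crc = compute_crc32 (crc, body, body_len);
+ */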
+
+pixman_bool_t
+is_little_endian (void)
+{
+    volatile uint16_t endian_check_var = 0x1234;
+
+    return (*(volatile uint8_t *)&endian_check_var == 0x34);
+}
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img)
+{
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+    int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img));
+    int i, j;
+
+    /* swap bytes only on big endian systems */
+    if (is_little_endian())
+       return;
+
+    if (bpp == 8)
+       return;
+
+    for (i = 0; i < height; i++)
+    {
+       uint8_t *line_data = (uint8_t *)data + stride * i;
+
+       switch (bpp)
+       {
+       case 1:
+           for (j = 0; j < stride; j++)
+           {
+               line_data[j] =
+                   ((line_data[j] & 0x80) >> 7) |
+                   ((line_data[j] & 0x40) >> 5) |
+                   ((line_data[j] & 0x20) >> 3) |
+                   ((line_data[j] & 0x10) >> 1) |
+                   ((line_data[j] & 0x08) << 1) |
+                   ((line_data[j] & 0x04) << 3) |
+                   ((line_data[j] & 0x02) << 5) |
+                   ((line_data[j] & 0x01) << 7);
+           }
+           break;
+       case 4:
+           for (j = 0; j < stride; j++)
+           {
+               line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
+           }
+           break;
+       case 16:
+           for (j = 0; j + 2 <= stride; j += 2)
+           {
+               char t1 = line_data[j + 0];
+               char t2 = line_data[j + 1];
+
+               line_data[j + 1] = t1;
+               line_data[j + 0] = t2;
+           }
+           break;
+       case 24:
+           for (j = 0; j + 3 <= stride; j += 3)
+           {
+               char t1 = line_data[j + 0];
+               char t2 = line_data[j + 1];
+               char t3 = line_data[j + 2];
+
+               line_data[j + 2] = t1;
+               line_data[j + 1] = t2;
+               line_data[j + 0] = t3;
+           }
+           break;
+       case 32:
+           for (j = 0; j + 4 <= stride; j += 4)
+           {
+               char t1 = line_data[j + 0];
+               char t2 = line_data[j + 1];
+               char t3 = line_data[j + 2];
+               char t4 = line_data[j + 3];
+
+               line_data[j + 3] = t1;
+               line_data[j + 2] = t2;
+               line_data[j + 1] = t3;
+               line_data[j + 0] = t4;
+           }
+           break;
+       default:
+           assert (FALSE);
+           break;
+       }
+    }
+}
+
+#define N_LEADING_PROTECTED    10
+#define N_TRAILING_PROTECTED   10
+
+typedef struct
+{
+    void *addr;
+    uint32_t len;
+    uint8_t *trailing;
+    int n_bytes;
+} info_t;
+
+#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
+
+/* This is apparently necessary on at least OS X */
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
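+/* fence_malloc () lays its mapping out roughly as follows (a sketch of
+ * the code below):
+ *
+ *   [ info page | leading guard pages | payload | trailing guard pages ]
+ *
+ * The guard pages are mprotect ()ed to PROT_NONE, so reads or writes
+ * just before or just after the payload fault immediately.
+ */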
+void *
+fence_malloc (int64_t len)
+{
+    unsigned long page_size = getpagesize();
+    unsigned long page_mask = page_size - 1;
+    uint32_t n_payload_bytes = (len + page_mask) & ~page_mask;
+    uint32_t n_bytes =
+       (page_size * (N_LEADING_PROTECTED + N_TRAILING_PROTECTED + 2) +
+        n_payload_bytes) & ~page_mask;
+    uint8_t *initial_page;
+    uint8_t *leading_protected;
+    uint8_t *trailing_protected;
+    uint8_t *payload;
+    uint8_t *addr;
+
+    if (len < 0)
+       abort();
+
+    addr = mmap (NULL, n_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+                -1, 0);
+
+    if (addr == MAP_FAILED)
+    {
+       printf ("mmap failed on %lld %u\n", (long long int)len, n_bytes);
+       return NULL;
+    }
+
+    initial_page = (uint8_t *)(((unsigned long)addr + page_mask) & ~page_mask);
+    leading_protected = initial_page + page_size;
+    payload = leading_protected + N_LEADING_PROTECTED * page_size;
+    trailing_protected = payload + n_payload_bytes;
+
+    ((info_t *)initial_page)->addr = addr;
+    ((info_t *)initial_page)->len = len;
+    ((info_t *)initial_page)->trailing = trailing_protected;
+    ((info_t *)initial_page)->n_bytes = n_bytes;
+
+    if ((mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
+                 PROT_NONE) == -1) ||
+       (mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
+                 PROT_NONE) == -1))
+    {
+       munmap (addr, n_bytes);
+       return NULL;
+    }
+
+    return payload;
+}
+
+void
+fence_free (void *data)
+{
+    uint32_t page_size = getpagesize();
+    uint8_t *payload = data;
+    uint8_t *leading_protected = payload - N_LEADING_PROTECTED * page_size;
+    uint8_t *initial_page = leading_protected - page_size;
+    info_t *info = (info_t *)initial_page;
+
+    munmap (info->addr, info->n_bytes);
+}
+
+#else
+
+void *
+fence_malloc (int64_t len)
+{
+    return malloc (len);
+}
+
+void
+fence_free (void *data)
+{
+    free (data);
+}
+
+#endif
+
+uint8_t *
+make_random_bytes (int n_bytes)
+{
+    uint8_t *bytes = fence_malloc (n_bytes);
+    int i;
+
+    if (!bytes)
+       return NULL;
+
+    for (i = 0; i < n_bytes; ++i)
+       bytes[i] = lcg_rand () & 0xff;
+
+    return bytes;
+}
+
+#ifdef HAVE_LIBPNG
+
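+/* Convert premultiplied a8r8g8b8 pixels in place to the
+ * non-premultiplied R, G, B, A byte order that libpng expects. */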
+static void
+pngify_pixels (uint32_t *pixels, int n_pixels)
+{
+    int i;
+
+    for (i = 0; i < n_pixels; ++i)
+    {
+       uint32_t p = pixels[i];
+       uint8_t *out = (uint8_t *)&(pixels[i]);
+       uint8_t a, r, g, b;
+
+       a = (p & 0xff000000) >> 24;
+       r = (p & 0x00ff0000) >> 16;
+       g = (p & 0x0000ff00) >> 8;
+       b = (p & 0x000000ff) >> 0;
+
+       if (a != 0)
+       {
+           r = (r * 255) / a;
+           g = (g * 255) / a;
+           b = (b * 255) / a;
+       }
+
+       *out++ = r;
+       *out++ = g;
+       *out++ = b;
+       *out++ = a;
+    }
+}
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    int width = pixman_image_get_width (image);
+    int height = pixman_image_get_height (image);
+    int stride = width * 4;
+    uint32_t *data = malloc (height * stride);
+    pixman_image_t *copy;
+    png_struct *write_struct;
+    png_info *info_struct;
+    pixman_bool_t result = FALSE;
+    FILE *f = fopen (filename, "wb");
+    png_bytep *row_pointers;
+    int i;
+
+    if (!f)
+       return FALSE;
+
+    row_pointers = malloc (height * sizeof (png_bytep));
+
+    copy = pixman_image_create_bits (
+       PIXMAN_a8r8g8b8, width, height, data, stride);
+
+    pixman_image_composite32 (
+       PIXMAN_OP_SRC, image, NULL, copy, 0, 0, 0, 0, 0, 0, width, height);
+
+    pngify_pixels (data, height * width);
+
+    for (i = 0; i < height; ++i)
+       row_pointers[i] = (png_bytep)(data + i * width);
+
+    if (!(write_struct = png_create_write_struct (
+             PNG_LIBPNG_VER_STRING, NULL, NULL, NULL)))
+       goto out1;
+
+    if (!(info_struct = png_create_info_struct (write_struct)))
+       goto out2;
+
+    png_init_io (write_struct, f);
+
+    png_set_IHDR (write_struct, info_struct, width, height,
+                 8, PNG_COLOR_TYPE_RGB_ALPHA,
+                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+                 PNG_FILTER_TYPE_BASE);
+
+    png_write_info (write_struct, info_struct);
+
+    png_write_image (write_struct, row_pointers);
+
+    png_write_end (write_struct, NULL);
+
+    result = TRUE;
+
+out2:
+    png_destroy_write_struct (&write_struct, &info_struct);
+
+out1:
+    if (fclose (f) != 0)
+       result = FALSE;
+
+    pixman_image_unref (copy);
+    free (row_pointers);
+    free (data);
+    return result;
+}
+
+#else /* no libpng */
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    return FALSE;
+}
+
+#endif
+
+/*
+ * A function that can be used as the core part of a test program. It
+ * is intended to detect various problems by fuzzing the input to the
+ * pixman API (according to some templates, a.k.a. "smart" fuzzing).
+ * Some general information about this kind of testing can be found
+ * here: http://en.wikipedia.org/wiki/Fuzz_testing
+ *
+ * It can help to detect:
+ *  - crashes caused by bad handling of valid or reasonably invalid
+ *    input to the pixman API.
+ *  - deviations from the behavior of older pixman releases.
+ *  - deviations from the behavior of the same pixman release, but
+ *    configured in a different way (for example with SIMD optimizations
+ *    disabled), or running on a different OS or hardware.
+ *
+ * The test is performed by calling a callback function a huge number
+ * of times. The callback function is expected to run some snippet of
+ * pixman code with pseudorandom variations in the data fed to the
+ * pixman API. The result of each callback invocation should be a
+ * deterministic value that depends only on the test number (the test
+ * number can be used as a seed for the PRNG). When the 'verbose'
+ * argument is nonzero, the callback function is expected to print
+ * some information about what it does to stdout.
+ *
+ * The return values from the many small tests are accumulated together
+ * and used as the final checksum, which can be compared to an expected
+ * value. Running the tests in a batch rather than individually reduces
+ * process startup overhead and also makes it possible to parallelize
+ * testing across multiple CPU cores.
+ *
+ * The resulting executable can be run without any arguments. In
+ * this case it runs a batch of tests starting from 1 and up to
+ * 'default_number_of_iterations'. The resulting checksum is
+ * compared with 'expected_checksum', and the PASS or FAIL verdict
+ * depends on the result of this comparison.
+ *
+ * If the executable is run with two numbers provided as command line
+ * arguments, they specify the starting and ending numbers for the
+ * test batch.
+ *
+ * If the executable is run with only one number provided as a command
+ * line argument, then the callback function is called once with that
+ * test number and with the verbose flag set.
+ */
+int
+fuzzer_test_main (const char *test_name,
+                 int         default_number_of_iterations,
+                 uint32_t    expected_checksum,
+                 uint32_t    (*test_function)(int testnum, int verbose),
+                 int         argc,
+                 const char *argv[])
+{
+    int i, n1 = 1, n2 = 0;
+    uint32_t checksum = 0;
+    int verbose = getenv ("VERBOSE") != NULL;
+
+    if (argc >= 3)
+    {
+       n1 = atoi (argv[1]);
+       n2 = atoi (argv[2]);
+       if (n2 < n1)
+       {
+           printf ("invalid test range\n");
+           return 1;
+       }
+    }
+    else if (argc >= 2)
+    {
+       n2 = atoi (argv[1]);
+       checksum = test_function (n2, 1);
+       printf ("%d: checksum=%08X\n", n2, checksum);
+       return 0;
+    }
+    else
+    {
+       n1 = 1;
+       n2 = default_number_of_iterations;
+    }
+
+#ifdef USE_OPENMP
+    #pragma omp parallel for reduction(+:checksum) default(none) \
+                                       shared(n1, n2, test_function, verbose)
+#endif
+    for (i = n1; i <= n2; i++)
+    {
+       uint32_t crc = test_function (i, 0);
+       if (verbose)
+           printf ("%d: %08X\n", i, crc);
+       checksum += crc;
+    }
+
+    if (n1 == 1 && n2 == default_number_of_iterations)
+    {
+       if (checksum == expected_checksum)
+       {
+           printf ("%s test passed (checksum=%08X)\n",
+                   test_name, checksum);
+       }
+       else
+       {
+           printf ("%s test failed! (checksum=%08X, expected %08X)\n",
+                   test_name, checksum, expected_checksum);
+           return 1;
+       }
+    }
+    else
+    {
+       printf ("%d-%d: checksum=%08X\n", n1, n2, checksum);
+    }
+
+    return 0;
+}
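+
+/* Example (a sketch): a test program built on this driver typically
+ * looks like the following; 'test_composite', the iteration count and
+ * the checksum value are hypothetical.
+ *
+ *     static uint32_t
+ *     test_composite (int testnum, int verbose)
+ *     {
+ *         lcg_srand (testnum);
+ *         ... run one randomized operation, return its CRC-32 ...
+ *     }
+ *
+ *     int
+ *     main (int argc, const char *argv[])
+ *     {
+ *         return fuzzer_test_main ("composite", 40000, 0x12345678,
+ *                                  test_composite, argc, argv);
+ *     }
+ */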
+
+/* Try to obtain current time in seconds */
+double
+gettime (void)
+{
+#ifdef HAVE_GETTIMEOFDAY
+    struct timeval tv;
+
+    gettimeofday (&tv, NULL);
+    return (double)((int64_t)tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.;
+#else
+    return (double)clock() / (double)CLOCKS_PER_SEC;
+#endif
+}
+
+uint32_t
+get_random_seed (void)
+{
+    double d = gettime();
+
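+    /* Type-pun the current time into a 32-bit seed; it only needs to
+     * vary from run to run, not be cryptographically strong. */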
+    lcg_srand (*(uint32_t *)&d);
+
+    return lcg_rand_u32 ();
+}
+
+static const char *global_msg;
+
+static void
+on_alarm (int signo)
+{
+    printf ("%s\n", global_msg);
+    exit (1);
+}
+
+void
+fail_after (int seconds, const char *msg)
+{
+#ifdef HAVE_SIGACTION
+#ifdef HAVE_ALARM
+    struct sigaction action;
+
+    global_msg = msg;
+
+    memset (&action, 0, sizeof (action));
+    action.sa_handler = on_alarm;
+
+    alarm (seconds);
+
+    sigaction (SIGALRM, &action, NULL);
+#endif
+#endif
+}
+
+void
+enable_fp_exceptions (void)
+{
+#ifdef HAVE_FENV_H
+#ifdef HAVE_FEENABLEEXCEPT
+    /* Note: we don't enable the FE_INEXACT trap because
+     * that happens quite commonly. It is possible that
+     * over- and underflow should similarly be considered
+     * okay, but for now the test suite passes with them
+     * enabled, and it's useful to know if they start
+     * occurring.
+     */
+    feenableexcept (FE_DIVBYZERO       |
+                   FE_INVALID          |
+                   FE_OVERFLOW         |
+                   FE_UNDERFLOW);
+#endif
+#endif
+}
+
+void *
+aligned_malloc (size_t align, size_t size)
+{
+    void *result;
+
+#ifdef HAVE_POSIX_MEMALIGN
+    if (posix_memalign (&result, align, size) != 0)
+      result = NULL;
+#else
+    /* Fallback: plain malloc () does not honor 'align'; callers on
+     * platforms without posix_memalign () get the default alignment. */
+    result = malloc (size);
+#endif
+
+    return result;
+}
+
+#define CONVERT_15(c, is_rgb)                                          \
+    (is_rgb?                                                           \
+     ((((c) >> 3) & 0x001f) |                                          \
+      (((c) >> 6) & 0x03e0) |                                          \
+      (((c) >> 9) & 0x7c00)) :                                         \
+     (((((c) >> 16) & 0xff) * 153 +                                    \
+       (((c) >>  8) & 0xff) * 301 +                                    \
+       (((c)      ) & 0xff) * 58) >> 2))
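+
+/* CONVERT_15 () reduces an x8r8g8b8 pixel to 15 bits: with is_rgb it
+ * packs the top five bits of each channel as x1r5g5b5 (for example,
+ * pure red 0x00ff0000 becomes 0x7c00); otherwise it computes a
+ * weighted gray value (weights 153/301/58, summing to 512) scaled
+ * into the same 15-bit range.
+ */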
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
+{
+    int i;
+    uint32_t mask = (1 << depth) - 1;
+
+    for (i = 0; i < 32768; ++i)
+       palette->ent[i] = lcg_rand() & mask;
+
+    memset (palette->rgba, 0, sizeof (palette->rgba));
+
+    for (i = 0; i < mask + 1; ++i)
+    {
+       uint32_t rgba24;
+       pixman_bool_t retry;
+       uint32_t i15;
+
+       /* We filled the rgb->index map with random numbers, but we
+        * do need the ability to round trip: if some indexed color
+        * expands to an argb24 value, then the 15-bit version of that
+        * color must map back to the same index. Anything else we
+        * don't care about too much.
+        */
+       do
+       {
+           uint32_t old_idx;
+
+           rgba24 = lcg_rand();
+           i15 = CONVERT_15 (rgba24, is_rgb);
+
+           old_idx = palette->ent[i15];
+           if (CONVERT_15 (palette->rgba[old_idx], is_rgb) == i15)
+               retry = 1;
+           else
+               retry = 0;
+       } while (retry);
+
+       palette->rgba[i] = rgba24;
+       palette->ent[i15] = i;
+    }
+
+    for (i = 0; i < mask + 1; ++i)
+    {
+       assert (palette->ent[CONVERT_15 (palette->rgba[i], is_rgb)] == i);
+    }
+}
diff --git a/test/utils.h b/test/utils.h
new file mode 100755 (executable)
index 0000000..b23925c
--- /dev/null
@@ -0,0 +1,154 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+#include "pixman-private.h" /* For 'inline' definition */
+
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+
+/* A primitive pseudorandom number generator,
+ * taken from the POSIX.1-2001 example
+ */
+
+extern uint32_t lcg_seed;
+#ifdef USE_OPENMP
+#pragma omp threadprivate(lcg_seed)
+#endif
+
+static inline uint32_t
+lcg_rand (void)
+{
+    lcg_seed = lcg_seed * 1103515245 + 12345;
+    return ((uint32_t)(lcg_seed / 65536) % 32768);
+}
+
+static inline void
+lcg_srand (uint32_t seed)
+{
+    lcg_seed = seed;
+}
+
+static inline uint32_t
+lcg_rand_n (int max)
+{
+    return lcg_rand () % max;
+}
+
+static inline uint32_t
+lcg_rand_N (int max)
+{
+    uint32_t lo = lcg_rand ();
+    uint32_t hi = lcg_rand () << 15;
+    return (lo | hi) % max;
+}
+
+static inline uint32_t
+lcg_rand_u32 (void)
+{
+    /* Build a 32-bit value from three 15-bit lcg_rand () results,
+     * shifted so that together they cover all 32 bits, XORing in the
+     * few bit positions where adjacent results overlap.
+     */
+    uint32_t lo = lcg_rand() >> -(32 - 15 - 11 * 2);
+    uint32_t mid = lcg_rand() << (32 - 15 - 11 * 1);
+    uint32_t hi = lcg_rand() << (32 - 15 - 11 * 0);
+
+    return (hi ^ mid ^ lo);
+}
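+
+/* Example (a sketch): seeding with the test number makes each test
+ * reproducible; note that lcg_rand_n () has a slight modulo bias,
+ * which is acceptable for testing purposes:
+ *
+ *     lcg_srand (testnum);
+ *     x = lcg_rand_n (width);    (uniform-ish in [0, width))
+ */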
+
+/* CRC 32 computation
+ */
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+              const void *buf,
+              size_t      buf_len);
+
+/* Returns TRUE if running on a little endian system */
+pixman_bool_t
+is_little_endian (void);
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img);
+
+/* Allocate memory that is bounded by protected pages,
+ * so that out-of-bounds access will cause segfaults
+ */
+void *
+fence_malloc (int64_t len);
+
+void
+fence_free (void *data);
+
+/* Generate n_bytes random bytes in fence_malloced memory */
+uint8_t *
+make_random_bytes (int n_bytes);
+
+/* Return current time in seconds */
+double
+gettime (void);
+
+uint32_t
+get_random_seed (void);
+
+/* main body of the fuzzer test */
+int
+fuzzer_test_main (const char *test_name,
+                 int         default_number_of_iterations,
+                 uint32_t    expected_checksum,
+                 uint32_t    (*test_function)(int testnum, int verbose),
+                 int         argc,
+                 const char *argv[]);
+
+void
+fail_after (int seconds, const char *msg);
+
+/* If possible, enable traps for floating point exceptions */
+void enable_fp_exceptions(void);
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename);
+
+/* A pair of macros which can help to detect corruption of
+ * floating point registers after a function call. This may
+ * happen if an _mm_empty() call is forgotten in MMX/SSE2 fast
+ * path code, or if an ARM NEON assembly optimized function
+ * forgets to save/restore the d8-d15 registers before use.
+ */
+
+#define FLOAT_REGS_CORRUPTION_DETECTOR_START()                 \
+    static volatile double frcd_volatile_constant1 = 123451;   \
+    static volatile double frcd_volatile_constant2 = 123452;   \
+    static volatile double frcd_volatile_constant3 = 123453;   \
+    static volatile double frcd_volatile_constant4 = 123454;   \
+    static volatile double frcd_volatile_constant5 = 123455;   \
+    static volatile double frcd_volatile_constant6 = 123456;   \
+    static volatile double frcd_volatile_constant7 = 123457;   \
+    static volatile double frcd_volatile_constant8 = 123458;   \
+    double frcd_canary_variable1 = frcd_volatile_constant1;    \
+    double frcd_canary_variable2 = frcd_volatile_constant2;    \
+    double frcd_canary_variable3 = frcd_volatile_constant3;    \
+    double frcd_canary_variable4 = frcd_volatile_constant4;    \
+    double frcd_canary_variable5 = frcd_volatile_constant5;    \
+    double frcd_canary_variable6 = frcd_volatile_constant6;    \
+    double frcd_canary_variable7 = frcd_volatile_constant7;    \
+    double frcd_canary_variable8 = frcd_volatile_constant8;
+
+#define FLOAT_REGS_CORRUPTION_DETECTOR_FINISH()                \
+    assert (frcd_canary_variable1 == frcd_volatile_constant1); \
+    assert (frcd_canary_variable2 == frcd_volatile_constant2); \
+    assert (frcd_canary_variable3 == frcd_volatile_constant3); \
+    assert (frcd_canary_variable4 == frcd_volatile_constant4); \
+    assert (frcd_canary_variable5 == frcd_volatile_constant5); \
+    assert (frcd_canary_variable6 == frcd_volatile_constant6); \
+    assert (frcd_canary_variable7 == frcd_volatile_constant7); \
+    assert (frcd_canary_variable8 == frcd_volatile_constant8);
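+
+/* Typical use (a sketch): place the START macro at the top of a test
+ * function's body and the FINISH macro just before it returns, so the
+ * canary variables bracket the calls under test. */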
+
+/* Try to get an aligned memory chunk */
+void *
+aligned_malloc (size_t align, size_t size);
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);